/*
  OMPi OpenMP Compiler
  == Copyright since 2001 the OMPi Team
  == Dept. of Computer Science & Engineering, University of Ioannina

  This file is part of OMPi.

  OMPi is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or
  (at your option) any later version.

  OMPi is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with OMPi; if not, write to the Free Software
  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/

/* loopsched.c -- OMPi RunTime library; loop scheduling */

#include "ort_prive.h"
#include <stdlib.h>
#include <stdarg.h>
#include <limits.h>
#ifdef OMPI_XTRA_LOOPSCHEDS
  #include <math.h>
#endif
#include "stddefs.h"
#include "assorted.h"

/* Uncomment the next line to trace every chunk assignment on stderr. */
//#define DEBUG_LOOPS
#ifdef DEBUG_LOOPS
	/* Prints the schedule name, the chunk size and the receiving thread id.
	 * Note that the expansion already ends with a semicolon.
	 */
	#define dbg(method, size, thread_id) fprintf(stderr, \
	" [loopsched.c] %s: Chunk of size %d assigned to thread number %d\n",\
	method, size, thread_id);
#else
	/* Tracing disabled: dbg() expands to nothing. */
	#define dbg(method, size, thread_id)  
#endif

/* Address of the for-loop bookkeeping kept in the workshare region of `me'. */
#define MYLOOP(me) ( &(my_wsregion(me)->forloop) )

/* Selects the schedule data to use: the xsched ICV of the current task of
 * `me' when `stack' is NULL or its xsched.schedule field is 0; otherwise
 * the xsched stored in `stack' itself.
 */
#define SCHED_DATA(me, stack) \
	( ((stack) == NULL || (stack)->xsched.schedule == 0) ? \
	  &(__CURRTASK(me)->icvs.xsched) : &((stack)->xsched) )

/* Address of the xsched ICV of the current task of me->parent.
 * me->parent is dereferenced without a NULL check.
 */
#define PARENT_SCHED_DATA(me) ( &(__CURRTASK(me->parent)->icvs.xsched) )

/* Checks a) for active cancellation in order to terminate iteration 
 * assignment in the current team and b) for a team of 1 thread.
 * Used in all but the static schedules.
 *
 * Expands to statements that may return from the enclosing function, so
 * it must be used inside a function returning int which has `me',
 * `niters', `fiter' and `liter' in scope.
 * In a team of 1 thread the whole range [0, niters) is handed out once:
 * when me->nowaitregion is non-zero it is cleared and 1 is returned;
 * when it is already 0 the macro returns 0.
 */
#define CANCELATION_OR_SINGLE_THREAD \
	if (me->parent != NULL && TEAMINFO(me)->cancel_for_active) return (0); \
	if (me->num_siblings == 1) { \
		if (me->nowaitregion == 0) return 0; \
		*fiter = 0; *liter = niters; \
		me->nowaitregion = 0; \
		return (1); \
	}

/* Returns 0 from the enclosing function, with *fiter and *liter both set
 * to niters + 1, when the shared iteration counter (*iter) has reached
 * niters. It also defines the label SKIPLOOP so that the enclosing
 * function can jump to the same exit path later on; consequently it can
 * appear only once per function. Requires `iter', `niters', `fiter' and
 * `liter' in scope.
 */
#define SKIP_LOOP_CHECK \
	if (*iter >= niters) { \
		SKIPLOOP: \
			*fiter = *liter = niters + 1; \
			return 0; \
	}


/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
 *                                                                   *
 * FOR SCHEDULES (dynamic, guided, static and runtime)               *
 *                                                                   *
 * OMPi normalizes all loops and uses unsigned long ints to count    *
 * the number of iterations.                                         *
 *                                                                   *
 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */


/*
 * The inner workings of what follows are documented separately in
 * the OMPi docs/ directory.
 */


/* Hands out the next chunk of a dynamic schedule.
 *
 *   niters      total number of (normalized) iterations
 *   chunksize   iterations per chunk; 0 is reported through ort_error()
 *   monotonic   not used here
 *   fiter,liter receive the chunk as the range [*fiter, *liter)
 *   ignored     not used here
 *
 * Returns 1 if a chunk was assigned and 0 otherwise (cancellation,
 * nothing left, or a single-thread team that already got its chunk).
 */
int _ort_get_dynamic_chunk(u_long niters, u_long chunksize, int monotonic,
                     u_long *fiter, u_long *liter, void *ignored)
{
	ort_eecb_t      *me = __MYCB;
	volatile u_long *iter;

	if (chunksize == 0) ort_error(1, "fatal: dynamic chunksize 0 requested!\n");

	CANCELATION_OR_SINGLE_THREAD
	
	/* iter shall hold the next iter to give away */
	iter = &( MYLOOP(me)->iter );
	
	SKIP_LOOP_CHECK

	/* Grab the current value of the shared counter and advance it by
	 * chunksize, either with an atomic fetch-and-add or under the lock
	 * of the workshare region.
	 */
#if defined(HAVE_ATOMIC_FAA) && !defined(EE_TYPE_PROCESS)
	*fiter = _faaul(iter, chunksize);
#else
	{
		ee_lock_t *lock = &(my_wsregion(me)->reglock);
						
		ee_set_lock(lock);
		*fiter = *iter;
		(*iter) += chunksize;
		ee_unset_lock(lock);
	}
#endif

	if (*fiter >= niters)   /* double check; races may lead us here... */
		goto SKIPLOOP;

	/* The last chunk may be shorter than chunksize */
	*liter = *fiter + chunksize;
	if (*liter > niters)
		*liter = niters;
	dbg("Dynamic", *liter-*fiter, me->thread_num);
	return (1);
}


/* Hands out the next chunk of a guided schedule.
 *
 * SUN suggests dividing the number of remaining iters by 2; the code
 * below divides the remaining iterations by the team size instead
 * (rounding up) and never goes below chunksize, except for the final
 * chunk which takes whatever is left.
 *
 *   niters      total number of (normalized) iterations
 *   chunksize   minimum chunk size; 0 is reported through ort_error()
 *   monotonic   not used here
 *   fiter,liter receive the chunk as the range [*fiter, *liter)
 *   ignored     not used here
 *
 * Returns non-zero if a non-empty chunk was assigned, 0 otherwise.
 */
int _ort_get_guided_chunk(u_long niters, u_long chunksize, int monotonic,
                     u_long *fiter, u_long *liter, void *ignored)
{
	ort_eecb_t      *me = __MYCB;
	volatile u_long *iter;
	long            ch;

	if (chunksize == 0) ort_error(1, "fatal: guided chunksize 0 requested!\n");

	CANCELATION_OR_SINGLE_THREAD

	iter = &( MYLOOP(me)->iter );
	
	SKIP_LOOP_CHECK

#if defined(HAVE_ATOMIC_CAS) && !defined(EE_TYPE_PROCESS)
	/* Compute the chunk from a snapshot of the counter and retry until
	 * the counter is advanced from exactly that snapshot.
	 */
	do
	{
		*fiter = *iter;
		ch = niters - *fiter;       /* remaining iterations */
		/* NOTE(review): ch is long, chunksize is u_long, so this and the
		 * comparison below are done in unsigned arithmetic -- confirm.
		 */
		if (ch > chunksize)
		{
			ch = (ch + me->num_siblings - 1) / me->num_siblings;
			if (ch < chunksize)
				ch = chunksize;
		}
	}
	while (!_casul(iter, (*fiter), (u_long) ((*fiter) + ch)));
#else
	{
		/* Same computation, serialized by the workshare region lock */
		ee_lock_t *lock = &(my_wsregion(me)->reglock);
						
		ee_set_lock(lock);
		*fiter = *iter;
		ch = niters - *fiter;
		if (ch > chunksize)
		{
			ch = (ch + me->num_siblings - 1) / me->num_siblings;
			if (ch < chunksize)
				ch = chunksize;
		}
		(*iter) += ch;
		ee_unset_lock(lock);
	}
#endif

	if (*fiter >= niters)   /* double check; races may lead us here... */
	  goto SKIPLOOP;

	/* No clamping here: ch was derived from the remaining iterations */
	*liter = *fiter + ch;
	dbg("Guided", *liter-*fiter, me->thread_num)
	return (ch != 0);
}


/* Computes the single chunk [*fiter, *liter) that the calling thread owns
 * under the default static schedule (no chunksize given).
 *
 * The iterations are split into one contiguous block per team member;
 * when niters is not a multiple of the team size, the first
 * (niters % team size) threads get one extra iteration each.
 *
 * Returns non-zero if the chunk is non-empty, 0 otherwise.
 */
int _ort_get_static_default_chunk(u_long niters, u_long *fiter, u_long *liter)
{
	ort_eecb_t *me = __MYCB;
	int        nthr = me->num_siblings;
	int        id = me->thread_num;
	u_long     start, len;

	if (nthr == 1)
	{
		/* Alone in the team: take everything */
		start = 0;
		len = niters;
	}
	else if (niters <= nthr)
	{
		/* Not more iterations than threads: at most 1 iteration each */
		start = id;
		len = (id < niters) ? 1 : 0;
	}
	else
	{
		u_long small = niters / nthr;   /* size of the smaller chunks */
		u_long nbig = niters % nthr;    /* # threads with 1 extra iteration */

		if (nbig == 0)                  /* perfectly even split */
		{
			start = id * small;
			len = small;
		}
		else if (id < nbig)             /* I am one of the bigger ones */
		{
			len = small + 1;
			start = id * len;
		}
		else                            /* I come after all bigger ones */
		{
			len = small;
			start = nbig * (small + 1) + (id - nbig) * small;
		}
	}

	*fiter = start;
	*liter = start + len;
	return (len != 0);
}


/* Runtime version of the static schedule (suboptimal but unavoidable).
 *
 * `extra' must point to an int that holds the caller's chunk index across
 * successive calls for the same loop. It MUST initially be negative: a
 * negative value means "no chunk handed out yet", while any value >= 0 is
 * treated as "a chunk has already been served".
 *
 *   niters      total number of (normalized) iterations
 *   chunksize   chunk size, or 0 if none was given (in which case the
 *               default static distribution is used and each thread gets
 *               exactly one chunk)
 *   monotonic   not used here
 *   fiter,liter receive the chunk as the range [*fiter, *liter)
 *   extra       pointer to the (int) chunk index, see above
 *
 * Returns non-zero if a chunk was assigned, 0 when there is nothing
 * (more) for the calling thread. Note that on the 0 paths *liter is left
 * untouched.
 */
int _ort_get_runtimestatic_chunk(u_long niters, u_long chunksize, int monotonic,
                                u_long *fiter, u_long *liter, void *extra)
{
	ort_eecb_t *me = __MYCB;
	int* chunkid = (int*) extra;

	if (me->num_siblings == 1)
	{
		if (*chunkid >= 0) { *fiter = niters + 1; return (0); } /* Only 1 chunk */
		*chunkid = 1;
		*fiter = 0;                    /* Get just 1 chunk: all iterations */
		*liter = niters;
		return (1);
	}

	if (chunksize == 0)  /* No chunksize given */
	{
		if (*chunkid == 1) { *fiter = niters + 1; return (0); } /* Only 1 chunk */
		*chunkid = 1;
		return ( _ort_get_static_default_chunk(niters, fiter, liter) );
	}

	/* chunksize given (and known to be non-zero here): chunks are dealt
	 * round-robin, i.e. thread t gets chunks t, t+N, t+2N, ... where N is
	 * the team size.
	 */
	if (*chunkid < 0)    /* my very first chunk */
		*chunkid = me->thread_num;
	else
		(*chunkid) += me->num_siblings;
	*fiter = chunksize * (*chunkid);
	if (*fiter >= niters)
		return (0);
	*liter = *fiter + chunksize;
	if (*liter > niters)   /* the last chunk may be shorter */
		*liter = niters;

	return (1);
}


#ifdef OMPI_XTRA_LOOPSCHEDS

/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
 *                                                                   *
 *   EXTRA LOOP SCHEDULES:                                           *
 *   trapezoid, taper, fsc, factoring, factoring2 and profiling      *
 *                                                                   *
 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */


/* Hands out the next chunk of the trapezoid schedule.
 *
 * The size of the next chunk is kept in the shared loop field xtra_int;
 * every call takes the current value and adds sched_data->mean to it.
 * A size smaller than chunksize is raised to chunksize.
 *
 *   niters      total number of (normalized) iterations
 *   chunksize   minimum chunk size
 *   monotonic   not used here
 *   fiter,liter receive the chunk as the range [*fiter, *liter)
 *   extra       pointer to the xsched_data_t of the loop; only its `mean'
 *               field is read here
 *
 * Returns non-zero if a non-empty chunk was assigned, 0 otherwise.
 */
int _ort_get_trapezoid_chunk(u_long niters, u_long chunksize, int monotonic,
                            u_long *fiter, u_long *liter, void *extra)
{
	ort_eecb_t      *me = __MYCB;
	volatile u_long *iter;
	long            ch;

	xsched_data_t   *sched_data = (xsched_data_t*) extra;

	CANCELATION_OR_SINGLE_THREAD

	iter = &( MYLOOP(me)->iter );
	
	SKIP_LOOP_CHECK

#if defined(HAVE_ATOMIC_FAA) && !defined(EE_TYPE_PROCESS)
	/* Two independent atomic updates: first the next chunk size, then
	 * the iteration counter.
	 */
	ch = _faa(&(MYLOOP(me)->xtra_int), sched_data->mean);
	if (ch < (long) chunksize)
		ch = chunksize;
	*fiter = _faaul(iter, ch);
#else
	{
		/* Same steps, serialized by the workshare region lock */
		ee_lock_t *lock = &(my_wsregion(me)->reglock);
						
		ee_set_lock(lock);
		*fiter = *iter;
		ch = MYLOOP(me)->xtra_int;
		MYLOOP(me)->xtra_int += sched_data->mean	;
		if (ch < (long) chunksize)
			ch = chunksize;
		(*iter) += ch;
		ee_unset_lock(lock);
	}
#endif

	if (*fiter >= niters)   /* double check; races may lead us here... */
	  goto SKIPLOOP;

	/* The last chunk may be shorter than ch */
	*liter = *fiter + ch;
	if (*liter>niters)
		*liter = niters;
	dbg("Trapezoid", *liter-*fiter, me->thread_num);
	return (ch != 0);
}

/* The fixed-size-chunk (fsc) schedule hands out chunks exactly like the
 * dynamic one, so it is just an alias.
 */
#define _ort_get_fixed_size_chunk  _ort_get_dynamic_chunk

/* Hands out the next chunk of the taper schedule.
 *
 * With R the remaining iterations, T = R / team size and
 * u = alpha * sigma / mean (clamped to be >= 0), the chunk size is
 *     ceil( T + u^2/2 - u * sqrt(2T + u^2/4) )
 * but not less than chunksize. When R does not exceed chunksize the
 * chunk is simply the R remaining iterations.
 *
 *   niters      total number of (normalized) iterations
 *   chunksize   minimum chunk size
 *   monotonic   not used here
 *   fiter,liter receive the chunk as the range [*fiter, *liter)
 *   extra       pointer to the xsched_data_t of the loop; its alpha,
 *               sigma and mean fields are read here
 *
 * Returns non-zero if a non-empty chunk was assigned, 0 otherwise.
 */
int _ort_get_taper_chunk(u_long niters, u_long chunksize, int monotonic,
                     u_long *fiter, u_long *liter, void *extra)
{
	ort_eecb_t      *me = __MYCB;
	volatile u_long *iter;
	long            ch;

	xsched_data_t   *sched_data = (xsched_data_t*) extra;
	
	CANCELATION_OR_SINGLE_THREAD

	iter = &( MYLOOP(me)->iter );
	
	SKIP_LOOP_CHECK

	/* These depend only on the schedule data, not on the loop state */
	double u = sched_data->alpha * sched_data->sigma / sched_data->mean;
	if (u<0) u = 0;
	double usq_halved = (u * u)/2;

#if defined(HAVE_ATOMIC_CAS) && !defined(EE_TYPE_PROCESS)
	/* Compute the chunk from a snapshot of the counter and retry until
	 * the counter is advanced from exactly that snapshot.
	 */
	do
	{
		*fiter = *iter;
		ch = niters - *fiter;       /* remaining iterations */
		if (ch > chunksize)
		{
			/* NOTE(review): the division below is an integer division
			 * (truncating) before the conversion to double -- confirm
			 * that this is intended.
			 */
			double T = (niters - *fiter) / me->num_siblings;
			ch =  ceil(T + usq_halved - u*sqrt(2*T + usq_halved/2));
			if (ch < (long) chunksize)
				ch = chunksize;
		}
	}
	while (!_casul(iter, (*fiter), (u_long) ((*fiter) + ch)));
#else
	{
		/* Same computation, serialized by the workshare region lock */
		ee_lock_t *lock = &(my_wsregion(me)->reglock);
						
		ee_set_lock(lock);
		*fiter = *iter;
		ch = niters - *fiter;
		if (ch > chunksize)
		{
			double T = (niters - *fiter) / me->num_siblings;
			ch =  ceil(T + usq_halved - u*sqrt(2*T + usq_halved/2));
			if (ch < (long) chunksize)
				ch = chunksize;
		}
		(*iter) += ch;
		ee_unset_lock(lock);
	}
#endif
	if (*fiter >= niters)   /* double check; races may lead us here... */
	  goto SKIPLOOP;

	*liter = *fiter + ch;

	dbg("Taper", *liter-*fiter, me->thread_num);
	return (ch != 0);
}


/* Hands out the next chunk of the factoring schedule.
 *
 * Chunks are handed out in batches of equal-sized chunks. The shared loop
 * field xtra_int counts down the chunks left in the current batch; when
 * it is no longer > 1 it is reset to the team size and a new chunk size
 * is computed from the iterations that remain and stored in
 * sched_data->first_chunk (and in the parent's schedule data as well).
 * Everything happens under the workshare region lock.
 *
 *   niters      total number of (normalized) iterations
 *   chunksize   not used here
 *   monotonic   not used here
 *   fiter,liter receive the chunk as the range [*fiter, *liter)
 *   extra       pointer to the xsched_data_t of the loop; sigma and mean
 *               are read, first_chunk is read and written
 *
 * Returns 1 if a chunk was assigned, 0 otherwise.
 */
int _ort_get_factoring_chunk(u_long niters, u_long chunksize, int monotonic,
                     u_long *fiter, u_long *liter, void *extra)
{
	ort_eecb_t      *me = __MYCB;
	volatile u_long *iter;
	long int        ch;

	xsched_data_t   *sched_data = (xsched_data_t*) extra;
	
	CANCELATION_OR_SINGLE_THREAD

	iter = &( MYLOOP(me)->iter );
	
	SKIP_LOOP_CHECK

	ee_lock_t *lock = &(my_wsregion(me)->reglock);

	ee_set_lock(lock);
	*fiter = *iter;
	if (MYLOOP(me)->xtra_int > 1) 
		MYLOOP(me)->xtra_int -= 1;    /* one chunk less in this batch */
	else 
	{
		/* Start a new batch: recompute the chunk size from what remains.
		 * NOTE(review): if *fiter has already reached niters here, the
		 * sqrt() below is taken of 0 (or of a wrapped-around unsigned
		 * difference); the result is discarded by the double check
		 * further down, but the stored first_chunk is affected -- confirm.
		 */
		MYLOOP(me)->xtra_int = me->num_siblings;
		double b = (me->num_siblings * sched_data->sigma) / 
		               (2 * sqrt(niters - *fiter) * sched_data->mean);
		double x = 2 + b*b + b*sqrt(b*b+4);
		double denom = x*me->num_siblings;
		sched_data->first_chunk = (niters - *fiter + denom-1) / denom;
		if (sched_data->first_chunk < 1) sched_data->first_chunk=1;
		PARENT_SCHED_DATA(me)->first_chunk = sched_data->first_chunk;
	}
	ch = sched_data->first_chunk;
	(*iter) += ch;
	ee_unset_lock(lock);


	if (*fiter >= niters)   /* double check; races may lead us here... */
		goto SKIPLOOP;

	*liter = *fiter + ch;

	/* The last chunk may be shorter than ch */
	if (*liter > niters)
		*liter = niters;
	dbg("Factoring", *liter-*fiter, me->thread_num);
	return (1);
}

/* Hands out the next chunk of the "factoring2" schedule.
 *
 * The shared loop field xtra_int counts the chunks handed out so far.
 * With k the index of this chunk and P the team size, the chunk belongs
 * to batch b = k / P and its size is
 *     (niters * 0.5^(b+1) + P - 1) / P
 * truncated to an integer, i.e. every batch of P chunks covers roughly
 * half of what the previous batch covered.
 *
 * Because of the truncation the formula eventually yields 0 even though
 * iterations may still be pending (e.g. niters = 11 with 2 threads gives
 * 3,3,1,1,1,1 and then 0, leaving 1 iteration unassigned). The size is
 * therefore never allowed to drop below 1, so that the loop always makes
 * progress until all iterations have been handed out.
 *
 *   niters      total number of (normalized) iterations
 *   chunksize   not used here
 *   monotonic   not used here
 *   fiter,liter receive the chunk as the range [*fiter, *liter)
 *   extra       not used here
 *
 * Returns 1 if a chunk was assigned, 0 otherwise.
 */
int _ort_get_factoring2_chunk(u_long niters, u_long chunksize, int monotonic,
                            u_long *fiter, u_long *liter, void *extra)
{
	ort_eecb_t      *me = __MYCB;
	volatile u_long *iter;
	long            ch;

	CANCELATION_OR_SINGLE_THREAD

	iter = &( MYLOOP(me)->iter );
	
	SKIP_LOOP_CHECK

#if defined(HAVE_ATOMIC_FAA) && !defined(EE_TYPE_PROCESS)
	/* Two independent atomic updates: first the chunk index, then the
	 * iteration counter.
	 */
	int chunk_ind = _faa(&(MYLOOP(me)->xtra_int), 1);
	ch = (pow(0.5,chunk_ind / me->num_siblings + 1) * niters + me->num_siblings-1) / me->num_siblings;
	if (ch < 1)     /* never hand out an empty chunk */
		ch = 1;
	*fiter = _faaul(iter, ch);
#else
	{
		/* Same steps, serialized by the workshare region lock */
		ee_lock_t *lock = &(my_wsregion(me)->reglock);
						
		ee_set_lock(lock);
		*fiter = *iter;
		ch = (pow(0.5,MYLOOP(me)->xtra_int / me->num_siblings + 1) * niters + me->num_siblings-1) / me->num_siblings;
		if (ch < 1)     /* never hand out an empty chunk */
			ch = 1;
		MYLOOP(me)->xtra_int += 1;
		(*iter) += ch;
		ee_unset_lock(lock);
	}
#endif

	if (*fiter >= niters)   /* double check; races may lead us here... */
	  goto SKIPLOOP;

	/* The last chunk may be shorter than ch */
	*liter = *fiter + ch;
	if (*liter>niters)
		*liter = niters;
	dbg("Factoring2", *liter-*fiter, me->thread_num);
	return (1);     /* ch >= 1 and *fiter < niters: the chunk is non-empty */
}


/* "Profiling" schedule: hands out the iterations one at a time (like a
 * dynamic schedule with chunksize 1) and measures the time between
 * handing out an iteration to a thread and that thread's next call.
 *
 * - The timings are stored in sched_data->time_table (schedule data of
 *   the parent), an array of niters longs allocated on first use.
 * - The chunksize field of the xsched ICV of the current task is reused
 *   as the "start timestamp" of the iteration in flight: it is set just
 *   before returning an iteration and read on the next call; 0 means
 *   that no iteration is in flight.
 * - The shared loop field xtra_int counts the timings recorded so far.
 * - When the iterations are exhausted, the mean and the square root of
 *   the mean squared deviation of the timings are printed to stdout and
 *   the table is freed.
 *
 * Timestamps are omp_get_wtime() * 1e6, truncated to long.
 *
 *   niters      total number of (normalized) iterations
 *   chunksize   not used here
 *   monotonic   not used here
 *   fiter,liter receive the chunk as the range [*fiter, *liter)
 *   ignored     not used here
 *
 * Returns 1 if an iteration was assigned, 0 otherwise.
 */
int _ort_get_profiling_chunk(u_long niters, u_long chunksize, int monotonic,
                            u_long *fiter, u_long *liter, void *ignored)
{
	ort_eecb_t      *me = __MYCB;
	volatile u_long *iter;
	/* NOTE(review): PARENT_SCHED_DATA dereferences me->parent, which the
	 * CANCELATION_OR_SINGLE_THREAD macro treats as possibly NULL -- confirm
	 * that this function is never reached with a NULL parent.
	 */
	xsched_data_t	  *sched_data = PARENT_SCHED_DATA(me);
	ee_lock_t *lock = &(my_wsregion(me)->reglock);

	/* Take the timestamp first, so that what follows is not measured */
	long time = omp_get_wtime()*1e6;
	if (sched_data->time_table == NULL){
		ee_set_lock(lock);
		if (sched_data->time_table == NULL){ //double check for threads waiting 
			sched_data->time_table = smalloc(niters * sizeof(long));
		}
		ee_unset_lock(lock);
	}
	u_long ind;
	/* If an iteration was in flight, record how long it took */
	if (__CURRTASK(me)->icvs.xsched.chunksize != 0){							
			ee_set_lock(lock);
			ind = MYLOOP(me)->xtra_int++;
			sched_data->time_table[ind] = time - __CURRTASK(me)->icvs.xsched.chunksize;
			ee_unset_lock(lock);
	}

	/* NOTE(review): in a team of 1 thread this returns before any of the
	 * code below runs, so the table allocated above is not freed by this
	 * function on that path -- confirm.
	 */
	CANCELATION_OR_SINGLE_THREAD

	/* iter shall hold the next iter to give away */
	iter = &( MYLOOP(me)->iter );

	/* Hand-written variant of SKIP_LOOP_CHECK that also reports the stats */
	if (*iter >= niters) {
		__CURRTASK(me)->icvs.xsched.chunksize = 0;   /* nothing in flight */
		SKIPLOOP:
			ee_set_lock(lock);
			/* xtra_int is reset to 0 by whoever prints the statistics, so
			 * the threads arriving later skip this part.
			 */
			if (MYLOOP(me)->xtra_int != 0)
			{
				long i;
				long long sum = 0;
				long n = (MYLOOP(me)->xtra_int);
				/* Spin (releasing and re-acquiring the lock) until every
				 * iteration has had its timing recorded.
				 */
				if (n != niters) { // Check if all threads have finished their iters
					ee_unset_lock(lock);
					goto SKIPLOOP;
				}
				for (i=0; i<n; ++i){
					sum += sched_data->time_table[i];
				}
				int mean = sum/n;
				sum = 0;
				for (i=0; i<n; ++i){
					sum += (sched_data->time_table[i] - mean)*1LL*
					       (sched_data->time_table[i] - mean);
				}
				/* sum/n is an integer division of long longs */
				printf("Mean = %d, standard deviation = %lf, n = %ld\n", 
				       mean, sqrt(sum/n), n);
				MYLOOP(me)->xtra_int=0;
				free(sched_data->time_table);
				sched_data->time_table = NULL;
			}
			ee_unset_lock(lock);
			*fiter = *liter = niters + 1;
			return 0; 
	}

	/* Take exactly one iteration.
	 * NOTE(review): the other schedules in this file use _faaul() on the
	 * u_long iteration counter; here _faa() is used -- confirm it is
	 * suitable for a u_long operand.
	 */
#if defined(HAVE_ATOMIC_FAA) && !defined(EE_TYPE_PROCESS)
	*fiter = _faa(iter, 1);
#else
	{
		/* This declaration shadows the outer `lock' (same region lock) */
		ee_lock_t *lock = &(my_wsregion(me)->reglock);
						
		ee_set_lock(lock);
		*fiter = *iter;
		(*iter) += 1;
		ee_unset_lock(lock);
	}
#endif

	if (*fiter >= niters)   /* double check; races may lead us here... */
		goto SKIPLOOP;

	*liter = *fiter + 1;
	if (*liter > niters)
		*liter = niters;
	
	/* Start the clock for the iteration we are about to return */
	__CURRTASK(me)->icvs.xsched.chunksize = omp_get_wtime()*1e6;
	return (1);
}

#endif


/* Returns (through func and chunksize) the chunk-fetching function and
 * the chunksize that implement the AUTO schedule.
 *
 * The schedule data come from the xsched ICV of the current task or,
 * when OMPI_OMP_EXT is defined, from whatever SCHED_DATA() selects given
 * the task's tag stack. Any schedule kind not listed below falls back to
 * the runtime version of the static schedule.
 */
void _ort_get_auto_schedule_stuff(chunky_t *func, u_long *chunksize)
{
	ort_eecb_t    *me = __MYCB;
	xsched_data_t *sched_data;
	chunky_t      getter = _ort_get_runtimestatic_chunk;   /* the fallback */

#ifdef OMPI_OMP_EXT
	tag_t* tag_stack = __CURRTASK(me)->tag_stack;
	sched_data = SCHED_DATA(me, tag_stack);
#else
	sched_data = &(__CURRTASK(me)->icvs.xsched);
#endif

	*chunksize = sched_data->chunksize;

	switch (sched_data->schedule)
	{
		case omp_sched_dynamic:
			getter = _ort_get_dynamic_chunk;
			break;
		case omp_sched_guided:
			getter = _ort_get_guided_chunk;
			break;
#ifdef OMPI_XTRA_LOOPSCHEDS
		case omp_sched_trapezoid:
			getter = _ort_get_trapezoid_chunk;
			break;
		case omp_sched_taper:
			getter = _ort_get_taper_chunk;
			break;
		case omp_sched_fsc:
			getter = _ort_get_fixed_size_chunk;
			break;
		case omp_sched_factoring:
			getter = _ort_get_factoring_chunk;
			break;
		case omp_sched_factoring2:
			getter = _ort_get_factoring2_chunk;
			break;
		case omp_sched_profiling:
			getter = _ort_get_profiling_chunk;
			break;
#endif
		default:
			break;
	}

	*func = getter;
}


/* Provides (through func and chunksize) the chunk-fetching function and
 * the chunksize needed by the code that implements the RUNTIME schedule.
 *
 * Both are derived from the rtschedule and rtchunk ICVs of the current
 * task. For the dynamic and guided schedules a chunksize of 0 is replaced
 * by 1. For the auto schedule the decision is delegated entirely to
 * _ort_get_auto_schedule_stuff(). Any schedule kind not listed below
 * falls back to the runtime version of the static schedule.
 */
void _ort_get_runtime_schedule_stuff(chunky_t *func, u_long *chunksize)
{
	ort_eecb_t *me = __MYCB;
	chunky_t   getter = _ort_get_runtimestatic_chunk;   /* the fallback */

	*chunksize = __CURRTASK(me)->icvs.rtchunk;

	switch (__CURRTASK(me)->icvs.rtschedule)
	{
		case omp_sched_auto:
			/* Sets both *func and *chunksize on its own */
			_ort_get_auto_schedule_stuff(func, chunksize);
			return;

		case omp_sched_dynamic:
			getter = _ort_get_dynamic_chunk;
			if (*chunksize == 0)
				*chunksize = 1;
			break;
		case omp_sched_guided:
			getter = _ort_get_guided_chunk;
			if (*chunksize == 0)
				*chunksize = 1;
			break;
#ifdef OMPI_XTRA_LOOPSCHEDS
		case omp_sched_trapezoid:
			getter = _ort_get_trapezoid_chunk;
			break;
		case omp_sched_taper:
			getter = _ort_get_taper_chunk;
			break;
		case omp_sched_fsc:
			getter = _ort_get_fixed_size_chunk;
			break;
		case omp_sched_factoring:
			getter = _ort_get_factoring_chunk;
			break;
		case omp_sched_factoring2:
			getter = _ort_get_factoring2_chunk;
			break;
#endif
		default:
			break;
	}

	*func = getter;
}

#ifdef OMPI_OMP_EXT
/* Builds the environment variable name "OMPI_TAG_SCHED_<tag>", where
 * <tag> is the tag string at the top of the current task's tag stack,
 * and passes it to env_read_auto_schedule() along with the xsched field
 * of that tag. The name buffer is not released in this function.
 */
void _ort_read_tag_sched() {
	static const char prefix[] = "OMPI_TAG_SCHED_";
	ort_eecb_t *me = __MYCB;
	tag_t      *top = __CURRTASK(me)->tag_stack;
	char       *varname;

	/* sizeof(prefix) accounts for the terminating NUL as well */
	varname = ort_alloc(strlen(top->tag_string) + sizeof(prefix));

	strcpy(varname, prefix);
	strcat(varname, top->tag_string);
	env_read_auto_schedule(varname, &top->xsched);
}

/* Builds the environment variable name "OMPI_TAG_NUM_THREADS_<tag>",
 * where <tag> is the tag string at the top of the current task's tag
 * stack, and passes it to env_read_tag_threads() along with the nthreads
 * field of that tag and the address of its set_nthrlevs_tag field. The
 * name buffer is not released in this function.
 */
void _ort_read_tag_num_threads() {
	static const char prefix[] = "OMPI_TAG_NUM_THREADS_";
	ort_eecb_t *me = __MYCB;
	tag_t      *top = __CURRTASK(me)->tag_stack;
	char       *varname;

	/* sizeof(prefix) accounts for the terminating NUL as well */
	varname = ort_alloc(strlen(top->tag_string) + sizeof(prefix));

	strcpy(varname, prefix);
	strcat(varname, top->tag_string);
	env_read_tag_threads(varname, top->nthreads, &top->set_nthrlevs_tag);
}


/* Pushes a new tag on the tag stack of the current task.
 *
 *   tag         the tag name
 *   has_number  if non-zero, the decimal representation of `number' is
 *               appended to the tag name (in a newly allocated string);
 *               otherwise the `tag' pointer itself is stored in the new
 *               stack entry (no copy is made)
 *   number      the number to append when has_number is non-zero
 *
 * The schedule of the new tag is initialized to 0, which makes
 * SCHED_DATA() fall back to the task's xsched ICV.
 */
void _ort_push_tag(char* tag, int has_number, int number) {
	ort_eecb_t *me = __MYCB;
	char       *formed_tag;
	tag_t      *new_tag;

	if (has_number) {
		/* Sign, 10 digits and the NUL: enough for any 32-bit int */
		char   numb[12];
		size_t len = strlen(tag);

		snprintf(numb, sizeof(numb), "%d", number);
		/* +1 for the terminating NUL written by strcat() */
		formed_tag = smalloc(len + strlen(numb) + 1);

		strcpy(formed_tag, tag);
		strcat(formed_tag, numb);
	}
	else
		formed_tag = tag;

	new_tag = smalloc(sizeof(tag_t));

	new_tag->xsched.schedule = 0;
	new_tag->tag_string = formed_tag;
	new_tag->next_tag = __CURRTASK(me)->tag_stack;
	__CURRTASK(me)->tag_stack = new_tag;
}


/* Pops (and frees) the entry at the top of the tag stack of the current
 * task; does nothing if the stack is empty.
 *
 * The tag string of the entry is not freed here: the entry does not
 * record whether the string was allocated by _ort_push_tag() or is the
 * pointer that was supplied by the caller of _ort_push_tag().
 */
void _ort_pop_tag() {
	ort_eecb_t *me = __MYCB;
	tag_t      *deleted = __CURRTASK(me)->tag_stack;

	if (deleted == NULL)    /* nothing to pop */
		return;

	__CURRTASK(me)->tag_stack = deleted->next_tag;
	free(deleted);
}
#endif


/* Per-loop initializations needed by the extra loop schedules; called
 * before chunks are requested. Without OMPI_XTRA_LOOPSCHEDS this function
 * has no effect.
 *
 *   niters          total number of (normalized) iterations
 *   chunksize       in/out: the chunksize of the loop; it is overwritten
 *                   for the fsc schedule
 *   sched_data_loc  out: receives the address of the schedule data in
 *                   use; it is left untouched for the static schedule
 *   isruntime       non-zero if the loop has a RUNTIME schedule, in which
 *                   case the schedule kind is taken from the rtschedule
 *                   ICV and the data from the xsched ICV of the current
 *                   task; otherwise both are taken from what SCHED_DATA()
 *                   selects given the task's tag stack
 */
void _ort_loop_initializations(unsigned long niters, u_long* chunksize, void** sched_data_loc, int isruntime)
{
#if defined(OMPI_OMP_EXT) || defined(OMPI_XTRA_LOOPSCHEDS)
	ort_eecb_t    *me = __MYCB;
	xsched_data_t *sched_data;
	if (!isruntime) {
		tag_t* tag_stack = __CURRTASK(me)->tag_stack;
		sched_data = SCHED_DATA(me, tag_stack);
	} else
	sched_data = &(__CURRTASK(me)->icvs.xsched);
#endif

#ifdef OMPI_XTRA_LOOPSCHEDS
	switch ( isruntime ? __CURRTASK(me)->icvs.rtschedule : sched_data -> schedule)
	{
		case omp_sched_auto: // can only happen when isruntime is 1, Init according to auto
			_ort_loop_initializations(niters, chunksize, sched_data_loc, 0);
			return;
		case omp_sched_static:
			return; // avoid changing extra from chunk index to schedule data
		case omp_sched_trapezoid:
		{
			/* first_chunk is the size of the first chunk, *chunksize the
			 * size of the last one; alpha is the resulting number of
			 * chunks. The (negated) per-chunk decrement is stored in
			 * `mean' and the shared xtra_int is seeded with the size of
			 * the first chunk, if it is still 0.
			 */
			if (sched_data->first_chunk == 0) // Default value
				sched_data->first_chunk = niters/(2 * me->num_siblings);
			if (sched_data->first_chunk < *chunksize) // First < Last
				sched_data->first_chunk = *chunksize;
			int alpha = (2*niters + (sched_data->first_chunk + *chunksize - 1 )) /
						(sched_data->first_chunk + *chunksize);
			if (alpha<2) alpha = 2;
			sched_data->mean = (sched_data->first_chunk - *chunksize) / (alpha - 1);
			sched_data->mean *=-1;
			#if defined(HAVE_ATOMIC_CAS) && !defined(EE_TYPE_PROCESS)
				_cas(&(MYLOOP(me)->xtra_int), 0, sched_data->first_chunk);
			#else
				ee_lock_t *lock = &(my_wsregion(me)->reglock);
				ee_set_lock(lock);
				if (MYLOOP(me)->xtra_int == 0)
					MYLOOP(me)->xtra_int =  sched_data->first_chunk;
				ee_unset_lock(lock);
			#endif
			break;
		}
		case omp_sched_fsc:
			/* The chunksize is computed here once; the chunks are then
			 * handed out by the dynamic schedule code.
			 * NOTE(review): with a team of 1 thread log(1) is 0, so the
			 * divisor below is 0 -- confirm this case is acceptable.
			 */
			*chunksize = pow((sqrt(2) * niters * sched_data->first_chunk) / 
	                     (sched_data->sigma * me->num_siblings * 
	                      sqrt(log(me->num_siblings)/log(10))), 2.0f/3);
			if (*chunksize < 1) *chunksize = 1;
			break;
		case omp_sched_factoring:
		{
			/* Only the thread that changes xtra_int from 0 to
			 * (team size + 1) computes the chunk size of the first batch.
			 * Note that the opening brace of the `if' is inside the
			 * preprocessor conditionals while its closing brace is not.
			 */
			#if !defined(HAVE_ATOMIC_CAS) || defined(EE_TYPE_PROCESS)
				ee_lock_t *lock = &(my_wsregion(me)->reglock);		
				ee_set_lock(lock);
				if (MYLOOP(me)->xtra_int == 0) {
					MYLOOP(me)->xtra_int = me->num_siblings+1;
			#else
				if (_cas(&(MYLOOP(me)->xtra_int), 0, me->num_siblings+1)){
			#endif
					double b = (me->num_siblings * sched_data->sigma) / 
						(2 * sqrt(niters) * sched_data->mean);
					double x = 1 + b*b + b*sqrt(b*b+2);
					double denom = x*me->num_siblings;
					sched_data->first_chunk = (niters + denom-1) / denom;
					if (sched_data->first_chunk < 1) sched_data->first_chunk=1;
					PARENT_SCHED_DATA(me)->first_chunk = sched_data->first_chunk;
				}
			#if !defined(HAVE_ATOMIC_CAS) || defined(EE_TYPE_PROCESS)
				ee_unset_lock(lock);
			#endif
			break;
		}
		case omp_sched_profiling:
		{
			/* The profiling schedule reuses this field as a timestamp;
			 * 0 means that no iteration is in flight.
			 */
			__CURRTASK(me)->icvs.xsched.chunksize = 0;
		}
			break;
		default:
			break;
	}
	*sched_data_loc = sched_data;
#endif
}

/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
 *                                                                   *
 * DISTRIBUTE SCHEDULES                                              *
 *                                                                   *
 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */


/* Computes the single chunk [*fiter, *liter) of a distribute loop that
 * belongs to the caller, whose position is given by its cgid field.
 *
 * The iterations are split into one contiguous block per team of the
 * league (ort->league.numteams); when niters is not a multiple of that
 * number, the first (niters % numteams) blocks get one extra iteration.
 *
 * Returns non-zero if the chunk is non-empty, 0 otherwise.
 */
int _ort_get_distribute_chunk(u_long niters, u_long *fiter, u_long *liter) 
{
	ort_eecb_t *me = __MYCB;
	int        nteams = ort->league.numteams;
	int        id = me->cgid;
	u_long     start, len;

	if (nteams == 1)
	{
		/* Just one team: take everything */
		start = 0;
		len = niters;
	}
	else if (niters <= nteams)
	{
		/* Not more iterations than teams: at most 1 iteration each */
		start = id;
		len = (id < niters) ? 1 : 0;
	}
	else
	{
		u_long small = niters / nteams;   /* size of the smaller chunks */
		u_long nbig = niters % nteams;    /* # chunks with 1 extra iteration */

		if (nbig == 0)                    /* perfectly even split */
		{
			start = id * small;
			len = small;
		}
		else if (id < nbig)               /* I get one of the bigger ones */
		{
			len = small + 1;
			start = id * len;
		}
		else                              /* I come after all bigger ones */
		{
			len = small;
			start = nbig * (small + 1) + (id - nbig) * small;
		}
	}

	*fiter = start;
	*liter = start + len;
	return (len != 0);
}
