/*
  OMPi OpenMP Compiler
  == Copyright since 2001 the OMPi Team
  == Dept. of Computer Science & Engineering, University of Ioannina

  This file is part of OMPi.

  OMPi is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or
  (at your option) any later version.

  OMPi is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with OMPi; if not, write to the Free Software
  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/

/* tasks.c -- OMPi RunTime library: tasking */

/*
 * 2023/5:
 *   taskscopes
 * 2021/12/17:
 *   major refactoring
 * [...]
 * 2010/11/20:
 *   added calls for getting & setting task icvs..
 * Version 1.0.1j:
 *   first time around, out of ort.c code.
 */

//#define DBGPRN_FORCE 
//#define DBGPRN_BLOCK
#define DBGPRN_FILTER DBG_TASKS

static void task_stats(void);

#include "ort_prive.h"
#include <stdlib.h>
#include <stdio.h>


/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
 *                                                                   *
 * GLOBAL VARIABLES / DEFINITIONS / MACROS                           *
 *                                                                   *
 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */

/* Per-thread task throttling.
 * The throttle flag is kept per thread: either in a thread-local variable
 * (USE_TLS) or behind an ee key. Three macros hide the difference:
 *   __start_throttling()  raise the flag
 *   __stop_throttling()   clear the flag
 *   __check_throttling()  read the flag (non-zero means throttling)
 */
#ifdef USE_TLS
	TLS_KEYWORD int throttle;
	#define __start_throttling() (throttle = 1)
	#define __stop_throttling()  (throttle = 0)
	/* "+ 0" yields a plain value, so the macro cannot be assigned to */
	#define __check_throttling() (throttle + 0)
#else
	ee_key_t throttle_key;    /* For thread-specific task throttling */
	/* The flag itself (0 or 1) is stored as the key's pointer value */
	#define __start_throttling() ee_setspecific(throttle_key,(void *) 1)
	#define __stop_throttling()  ee_setspecific(throttle_key,(void *) 0)
	#if !defined(AVOID_OMPI_DEFAULT_TASKS)
		#define __check_throttling() (0 != (uintptr_t) ee_getspecific(throttle_key))
	#else
		/* With non-default tasking, the check is delegated to the ee library */
		#define __check_throttling() (ee_check_throttling())
	#endif
#endif

/* ATOMIC_ADD uses faa if possible, otherwise the given lock.
 *   ptr:  address of the integer to update
 *   incr: amount to add (may be negative)
 *   lock: the lock that protects *ptr; not evaluated at all when faa is used
 * NOTE: the lock-based variant expands to a brace-enclosed block, not to an
 * expression; it yields no value and, when used as the body of an `if', it
 * must not be followed by an `else'.
 */
#if defined(HAVE_ATOMIC_FAA)
	#define ATOMIC_ADD(ptr,incr,ignore) _faa(ptr, incr)
#else
	#define ATOMIC_ADD(ptr,incr,lock) \
	        { ee_set_lock(&(lock)); (*(ptr))+=(incr); ee_unset_lock(&(lock)); }
#endif

/* USE_ATOMICS selects the lock-free code paths in this file; it requires
 * both fetch-and-add and compare-and-swap.
 */
#if defined(HAVE_ATOMIC_FAA) && defined(HAVE_ATOMIC_CAS)
	#define USE_ATOMICS 
#endif

static void start_throttling_due_to_full_queue(void);


/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
 *                                                                   *
 * QUEUES AND STEALING                                               *
 *                                                                   *
 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */


/**
 * Allocate (or recycle) the task queues of the team I am about to parent.
 *
 * nthr + 1 queues are prepared. The first time a thread becomes a parent
 * the whole table is allocated; on later calls the table is only grown if
 * it is too small for the new team. In all cases every queue ends up empty
 * (top = bottom = 0, no implicit task children pointer) and the team's
 * non-idle counter is reset to nthr + 1.
 *
 * Nothing is done when OMPi's default tasking is disabled
 * (AVOID_OMPI_DEFAULT_TASKS).
 *
 * @param me   the eecb of the (future) parent thread; me->mf is updated
 * @param nthr the number of threads in the team
 */
inline void taskqueues_init(ort_eecb_t *me, int nthr)
{
#if !defined(AVOID_OMPI_DEFAULT_TASKS)
	int i;

	/* Thread becomes parent for first time */
	if (me->mf->queues == NULL)
	{
		me->mf->queues = (ort_task_queue_t *)
		                          ort_calloc((nthr + 1) * sizeof(ort_task_queue_t));
		for (i = 0; i < nthr + 1; i++) /* Initialize task queues */
		{
			me->mf->queues[i].top = 0;
			me->mf->queues[i].bottom = 0;
			me->mf->queues[i].implicit_task_children = NULL;
			me->mf->queues[i].tasks = (ort_task_node_t **)
			                    ort_calloc(TASKQUEUESIZE * sizeof(ort_task_node_t *));
#ifndef USE_ATOMICS
			ee_init_lock((ee_lock_t *) & (me->mf->queues[i].lock), ORT_LOCK_NORMAL);
#endif
		}
		me->mf->numqueues = nthr + 1;
		me->mf->nonidle = nthr+1;
	}
	else
	{
		if (me->mf->numqueues < nthr + 1)  /* realloc needed */
		{
			ort_task_queue_t *grown;

			for (i = 0; i < me->mf->numqueues; i++) /* free queues */
				free(me->mf->queues[i].tasks);

			/* Reallocate queues table; go through a temporary so that a failed
			 * realloc() is detected instead of silently installing a NULL table
			 * (and losing the only pointer to the old one).
			 */
			grown = (ort_task_queue_t *)
			   realloc(me->mf->queues, (nthr+1)*sizeof(ort_task_queue_t));
			if (grown == NULL)
				ort_error(1, "[taskqueues_init]: memory allocation failed\n");
			me->mf->queues = grown;  /* NOTE(review): assumes ort_error(1,..) exits */

			for (i = 0; i < nthr + 1; i++)
				me->mf->queues[i].tasks = (ort_task_node_t **)
				                  ort_calloc(TASKQUEUESIZE * sizeof(ort_task_node_t *));
#ifndef USE_ATOMICS
			/* Only the newly added queues need their lock initialized */
			for (i = me->mf->numqueues; i < nthr + 1; i++)
				ee_init_lock((ee_lock_t *) & (me->mf->queues[i].lock), ORT_LOCK_NORMAL);
#endif
			me->mf->numqueues = nthr + 1;
		}
		me->mf->nonidle = nthr+1;
		/* Reinitialize queue table elements */
		for (i = 0; i < nthr + 1; i++)
		{
			me->mf->queues[i].top = 0;
			me->mf->queues[i].bottom = 0;
			me->mf->queues[i].implicit_task_children = NULL;
		}
	}
#endif
}


#if !defined(AVOID_OMPI_DEFAULT_TASKS)

/**
 * @brief Add a new tasknode in my queue; task nodes are added at the bottom
 *        of the queue.
 *
 * This function is called by a thread (worker) in order to enqueue a new
 * task_node to its own task queue. It has been guaranteed that there is
 * indeed space in the queue.
 *
 * @param me    my eecb; the queue used is the one indexed by me->thread_num
 * @param tnode the task node to enqueue; if it has a parent, the parent's
 *              children counter is increased by one
 */
inline void ort_enqueue_task(ort_eecb_t *me, ort_task_node_t *tnode)
{
	ort_task_queue_t *Q = &(TEAMINFO(me)->queues[me->thread_num]);
	int old_bottom = atomic_read(&(Q->bottom));

	if (tnode->parent != NULL)  /* Add a new child to parent task */
		ATOMIC_ADD(&((tnode->parent)->num_children), 1, ((tnode->parent)->lock));
	/* Slots are indexed modulo TASKQUEUESIZE; the node is stored first and
	 * only then is bottom advanced, followed by a fence.
	 */
	Q->tasks[old_bottom % TASKQUEUESIZE] = tnode;
	Q->bottom++;
	NON_TSO_FENCE;

#if ORT_DEBUG & DBG_TASKS
	me->tasking.tasks_enqueued++;
#endif
}


/**
 * Extract a task from my own queue; task nodes are removed from the bottom
 * of the queue (LIFO).
 *
 * As a side effect, if the queue is found to hold more than 70% of
 * TASKQUEUESIZE tasks, the calling thread enters throttle mode.
 *
 * @param me my eecb
 * @return the dequeued task node to execute or NULL if queue was empty
 *         (or if the last remaining task was taken by somebody else)
 */
static inline ort_task_node_t *dequeue_task_lifo(ort_eecb_t *me)
{
	int old_top, new_top, size, thrid = me->thread_num;
	ort_task_queue_t *Q = &(TEAMINFO(me)->queues[thrid]);
	ort_task_node_t *extracting_node;

	/* Make a first fast check */
	size = atomic_read(&(Q->bottom)) - atomic_read(&(Q->top)) - 1;

	/* If my queue is almost full, it is safe to enter throttle mode */
	if (size > (int)(TASKQUEUESIZE * 0.7))
		start_throttling_due_to_full_queue();

	if (size < 0) /* Queue is empty */
		return NULL;

	/* Tentatively take the bottom slot by moving bottom one position back */
	ATOMIC_ADD(&(Q->bottom), -1, Q->lock);

	/* Re-read top: it may have advanced since the fast check */
	old_top = atomic_read(&(Q->top));
	new_top = old_top + 1;
	size = atomic_read(&(Q->bottom)) - old_top;
	if (size < 0) /* Queue is empty */
	{
		/* Undo the tentative decrement: make the queue exactly empty */
		Q->bottom = old_top;
		NON_TSO_FENCE;
		return NULL;
	}

	extracting_node = Q->tasks[atomic_read(&(Q->bottom)) % TASKQUEUESIZE];
	if (size > 0)   /* other tasks remain between top and the one I took */
		return extracting_node;

	/* If there is only one task left in queue... */
	/* ...I must win the update of top (old_top -> new_top) to own it */
#ifdef USE_ATOMICS
	if (!_cas(&(Q->top), old_top, new_top)) /* If a thief stole the last task.. */
		extracting_node = NULL;/* then return NULL, else return the last task */
#else
	ee_set_lock(&(Q->lock));
	if (Q->top == old_top)
		Q->top = new_top;
	else
		extracting_node = NULL;
	ee_unset_lock(&(Q->lock));
#endif

	/* old_top + 1 = new_top */
	/* Whoever got the task, the queue is now empty: bottom is set to new_top */
	Q->bottom = old_top + 1;
	return extracting_node;
}


/* FIFO dequeue from my own queue: implemented as a steal_from() call whose
 * victim is the calling thread itself, i.e. the task is taken from the top.
 */
#define dequeue_task_fifo(me) steal_from(me, (me)->thread_num)


/**
 * Try to steal a task from a given other thread; stealing occurs from the
 * top of a queue (FIFO).
 *
 * The function is also used with victim_id equal to the caller's own id
 * (see dequeue_task_fifo); in that case the team's non-idle counter is left
 * untouched and, if the queue holds more than 70% of TASKQUEUESIZE tasks,
 * the caller enters throttle mode.
 *
 * @param me my eecb
 * @param victim_id the id of the target thread (to steal from)
 * @return the stolen task node to execute or NULL if nothing was stolen
 *         (empty queue, or somebody else moved the victim's top first).
 */
static inline ort_task_node_t *steal_from(ort_eecb_t *me, int victim_id)
{
	ort_task_queue_t *Q = &(TEAMINFO(me)->queues[victim_id]);
	int old_top = atomic_read(&(Q->top));
	int new_top = old_top + 1;
	int old_bottom = atomic_read(&(Q->bottom));
	int size = old_bottom - old_top;
	ort_task_node_t *extracting_node;

	/* If my queue is almost full, it is safe to enter throttle mode */
	if (me->thread_num == victim_id && size > (int)(TASKQUEUESIZE * 0.7))
		start_throttling_due_to_full_queue();

	if (size <= 0) /* Victim's queue is empty */
		return NULL;

	/* Steal a task from victim's top; increase non-idle count optimistically */
	/* (only an idle thief does that; the count is restored below on failure) */
	if (me->thread_num != victim_id && me->tasking.status == TIDLE)
		ATOMIC_ADD(&(TEAMINFO(me)->nonidle), +1, __CURRTASK(me)->parent->lock);
	extracting_node = Q->tasks[old_top % TASKQUEUESIZE];

	/* if I managed to steal a task... */
	/* (success means that top was still old_top and I advanced it) */
#ifdef USE_ATOMICS
	if (_cas(&(Q->top), old_top, new_top))
		return extracting_node;
#else
	ee_set_lock(&(Q->lock));
	if (Q->top == old_top)
	{
		Q->top = new_top;
		ee_unset_lock(&(Q->lock));
		return extracting_node;
	}
	ee_unset_lock(&(Q->lock));
#endif

	/* Did not manage to steal after all; restore non-idle count */
	if (me->thread_num != victim_id && me->tasking.status == TIDLE)
		ATOMIC_ADD(&(TEAMINFO(me)->nonidle), -1, __CURRTASK(me)->parent->lock);
	return NULL;
}


/**
 * When a thread calls this function, it tries to steal any task from
 * any queue, starting its efforts from a given victim. The queues are
 * visited in a round-robin fashion (victim ids wrap around the team size)
 * and the caller's own queue is always skipped.
 *
 * @param me my eecb
 * @param startfrom the victim thread/queue to start stealing efforts from;
 *                  if negative, start from the thread right after me
 * @param stolefrom if non-NULL, it will get the victim id (or -1 if no
 *                  victim was found)
 * @return the stolen task node to execute or NULL if nothing was stolen.
 */
static 
ort_task_node_t *steal_any_task(ort_eecb_t *me, int startfrom, int *stolefrom)
{
	int attempt, victim, thrid = me->thread_num;
	int teamsize = me->sdn->mf->num_children;
	ort_task_node_t *task_to_execute = NULL;

	if (startfrom < 0)               /* No preference; start after me */
		startfrom = thrid + 1;

	for (attempt = 0; attempt < teamsize; attempt++)
	{
		/* Compare the *wrapped* id with mine: the unwrapped index can reach
		 * thrid + teamsize when startfrom > thrid, which used to slip through
		 * the test and made me "steal" from my own queue.
		 */
		victim = (startfrom + attempt) % teamsize;
		if (victim == thrid)
			continue;

		task_to_execute = steal_from(me, victim);
		if (task_to_execute == NULL) /* victim's queue empty; goto next victim */
		{
#if ORT_DEBUG & DBG_TASKS
			me->tasking.fail_theft_attemts++;
#endif
			continue;
		}

#if ORT_DEBUG & DBG_TASKS
		me->tasking.tasks_executed_by_thief++; /* will execute it */
#endif
		if (stolefrom)
			*stolefrom = victim;
		return task_to_execute;
	}

	/* Nothing to steal */
	if (stolefrom)
		*stolefrom = -1;
	return NULL; /* There was no task left to execute */
}
#endif /* AVOID_OMPI_DEFAULT_TASKS */


/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
 *                                                                   *
 * TASKS                                                             *
 *                                                                   *
 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */


/* Switch the calling thread to throttle mode; this is what the dequeuing
 * functions call when they find the thread's queue almost full. In debug
 * builds the relevant statistics are updated first.
 */
static void start_throttling_due_to_full_queue(void)
{
#if ORT_DEBUG & DBG_TASKS
	{
		ort_eecb_t *self = __MYCB;

		self->tasking.throttled_queue++;
		self->tasking.in_throttle++;
	}
#endif
	__start_throttling();
}


/* Tasking initialization: without TLS, the key that holds the per-thread
 * throttle flag is created; then the caller's throttle flag is cleared.
 */
void tasking_init()
{
#ifndef USE_TLS
	ee_key_create(&throttle_key, 0);
#endif
	__stop_throttling();
}


#if !defined(AVOID_OMPI_DEFAULT_TASKS)

/* task_check_throttling
 * Looks at how many tasks the calling thread's queue currently holds; if
 * they are fewer than 70% of the queue capacity, throttling is turned off.
 * Otherwise nothing changes.
 */
inline static void task_check_throttling(ort_eecb_t *me)
{
	ort_task_queue_t *myq = &(TEAMINFO(me)->queues[me->thread_num]);
	int bot = atomic_read(&(myq->bottom));
	int top = atomic_read(&(myq->top));

	/* Still (at least) 70% full: leave the throttle flag alone */
	if ((bot - top) >= (int)(TASKQUEUESIZE * 0.7))
		return;

	__stop_throttling();
#if ORT_DEBUG & DBG_TASKS
	me->tasking.out_throttle++;
#endif
}

#endif /* AVOID_OMPI_DEFAULT_TASKS */


/* Gives a thread that is executing a task immediately (on its parent's
 * task node) a task node of its own. The new node becomes a child of the
 * current task, copies its icvs, taskgroup and taskscope, and is installed
 * as the thread's current task. Does nothing when OMPi's default tasking
 * is disabled.
 */
void ort_create_task_immediate_node(ort_eecb_t *thr)
{
#if !defined(AVOID_OMPI_DEFAULT_TASKS)
	ort_task_node_t *node, *cur;

	/* The node that will hold the task data */
	node = ort_task_alloc(NULL, NULL);
#ifndef USE_ATOMICS
	ee_init_lock((ee_lock_t *) & (node->lock), ORT_LOCK_NORMAL);
#endif

	cur = __CURRTASK(thr);
	node->func              = NULL;
	node->num_children      = 1;   /* virtual child: no freeing before the end */
	node->next              = NULL;
	node->parent            = cur;
	node->icvs              = cur->icvs;
	node->inherit_task_node = 0;
	node->isfinal           = __FINALTASK(thr);
	node->taskgroup         = cur->taskgroup;
	node->taskscope         = cur->taskscope;
	node->rtid              = 0;
	node->dependencies      = NULL;

	/* One less inherited task on the parent task's counter... */
	__INHERITASK(thr)--;

	/* ...and likewise for its final counter, if it was counting */
	if (__FINALTASK(thr) > 0)
		__FINALTASK(thr)--;

	/* From now on the thread runs on its own node */
	__CURRTASK(thr) = node;

	/* In a real team, see if throttling can stop */
	if (thr->num_siblings != 1)
		task_check_throttling(thr);
#endif
}


#if !defined(AVOID_OMPI_DEFAULT_TASKS)

/* Pointer stepping by SIZEOF_CHAR_P bytes:
 *   NP(X) is the address SIZEOF_CHAR_P bytes after X (as a void *)
 *   PP(X) is the address SIZEOF_CHAR_P bytes before X (as a char *)
 * _ort_new_task() uses PP(arg) to read the pointer to the task node that is
 * stored right in front of a task's argument area.
 */
#define NP(X) (void*)(((char*)(X)) + SIZEOF_CHAR_P)  /* get next pointer */
#define PP(X) (((char*)(X)) - SIZEOF_CHAR_P)     /* get previous pointer */


/**
 * @brief Execute a given task and cleanup afterwards.
 *
 * A normal task (rtid == -1) has its function called, unless the parallel
 * region or the task's taskgroup has been cancelled; any other rtid is
 * handed over to spwtasks_execute_node(). Afterwards the children counter
 * of the task's parent and the task's own counter are both decreased by 1;
 * whichever of the two nodes had a count of exactly 1 before the decrement
 * is freed (along with its dependence info).
 *
 * @param me my eecb
 * @param task_to_execute the task to execute
 */
void ort_task_execute_this(ort_eecb_t *me, ort_task_node_t *task_to_execute)
{
	ort_task_node_t *prev_task_to_execute;
	int             task_siblings, exec_task_children;

	if (task_to_execute->rtid == -1)  /* If i am to execute a normal task */
	{
		/* First update thread executing status... */
		prev_task_to_execute = __CURRTASK(me);
		__CURRTASK(me) = task_to_execute;

		/* The task function receives NP(funcarg), i.e. the address that lies
		 * SIZEOF_CHAR_P bytes past funcarg; it is not called at all if the
		 * parallel region or the task's taskgroup has been cancelled.
		 */
		if (!me->sdn->mf->cancel_par_active)
		{
			if (__CURRTASK(me)->taskgroup == NULL)
				(task_to_execute->func)(NP(task_to_execute->funcarg));
			else
				if (__CURRTASK(me)->taskgroup->is_canceled == false)
					(task_to_execute->func)(NP(task_to_execute->funcarg));
		}

		/* OpenMP 4.0:
		 * If in a taskgroup, i have to execute my child tasks before returning
		 */
		if (task_to_execute->taskgroup != NULL)
			_ort_taskwait(0);

		/* Finished my job; update thread executing status */
		/* NOTE(review): parent is dereferenced here without a NULL check,
		 * while the code further below does test it -- confirm that a normal
		 * task always has a parent.
		 */
		if (task_to_execute->parent->dependencies)
			tdeps_after_execution(task_to_execute, me);
		__CURRTASK(me) = prev_task_to_execute;            /* restore */
	}
	else /* A special parallel for task */
		spwtasks_execute_node(me, task_to_execute);

	if (task_to_execute->parent != NULL)   /* If task has a parent */
	{
		/* One child less for the parent; task_siblings gets the count as it
		 * was *before* the decrement.
		 */
#ifdef USE_ATOMICS
		task_siblings = _faa(&((task_to_execute->parent)->num_children), -1);
#else
		ee_set_lock(&((task_to_execute->parent)->lock));
		task_siblings = task_to_execute->parent->num_children;
		task_to_execute->parent->num_children--;
		ee_unset_lock(&((task_to_execute->parent)->lock));
#endif
		/* The parent's counter just dropped to 0: free the parent node */
		if (task_siblings == 1)
		{
			if (task_to_execute->parent->dependencies)
				tdeps_free_tdepinfo(task_to_execute->parent->dependencies);
			ort_task_free(me, task_to_execute->parent);
		}
	}

	/* This task is over, subtract the virtual child of this task */
	/* (again, exec_task_children holds the value before the decrement) */
#ifdef USE_ATOMICS
	exec_task_children = _faa(&(task_to_execute->num_children), -1);
#else
	ee_set_lock(&((task_to_execute)->lock));
	exec_task_children = task_to_execute->num_children;
	task_to_execute->num_children--;
	ee_unset_lock(&((task_to_execute)->lock));
#endif

	/* Free memory, if task has no children */
	if (exec_task_children == 1)
	{
		if (task_to_execute->dependencies)
			tdeps_free_tdepinfo(task_to_execute->dependencies);
		ort_task_free(me, task_to_execute);
	}
}

#endif /* AVOID_OMPI_DEFAULT_TASKS */


/* Called right before a task is executed immediately by the encountering
 * thread. No task node is created here: the thread keeps running on its
 * current node and only the node's "inherited" counter is increased (plus
 * the "final" counter, if the current task or the new one is final).
 * Returns the caller's eecb, which must be handed to
 * _ort_task_immediate_end(). With non-default tasking, everything is
 * delegated to the ee library.
 */
inline void *_ort_task_immediate_start(int final)
{
#if defined(AVOID_OMPI_DEFAULT_TASKS)
	return (void *)ee_task_immediate_start(final);
#else
	ort_eecb_t *self = __MYCB;

#if ORT_DEBUG & DBG_TASKS
	self->tasking.throttled++;
#endif

	/* Fast path for immediate execution: reuse (inherit) the current node */
	__INHERITASK(self)++;

	/* Keep track of nested "final" immediate tasks */
	if (final > 0 || __FINALTASK(self) > 0)
		__FINALTASK(self)++;

	/* In a real team, see if throttling can stop */
	if (self->num_siblings != 1)
		task_check_throttling(self);

	return self;
#endif
}


/* Called right after an immediately-executed task has finished; my_eecb is
 * what _ort_task_immediate_start() returned. If the thread is still running
 * on an inherited node, only the counters are decreased. Otherwise the
 * thread had acquired a node of its own in the meantime: the parent node is
 * reinstated and the node is recycled if its children counter equals 1.
 */
inline void _ort_task_immediate_end(void *my_eecb)
{
#if defined(AVOID_OMPI_DEFAULT_TASKS)
	ee_task_immediate_end(my_eecb);
#else
	ort_eecb_t      *self = (ort_eecb_t *) my_eecb;
	ort_task_node_t *done;

	if (!(__INHERITASK(self) > 0))
	{
		/* I was given my own node: go back to the parent's one */
		done = __CURRTASK(self);
		__CURRTASK(self) = done->parent;

		if (done->num_children == 1)
			ort_task_free(self, done);        /* nobody else needs it */
		return;
	}

	/* Inherited node: undo the bookkeeping of the start call */
	if (__FINALTASK(self) > 0)
		__FINALTASK(self)--;
	__INHERITASK(self)--;
#endif
}


/**
 * Create a new task.
 * Here it is assumed that the compiler has already issued a check for no
 * throttling. Since only me can put tasks in my queue, no throttling
 * guarantees that I do have space for a new task.
 * However, because of OpenMP v45 #target-related constructs, there is a
 * chance that this function gets called without a prior check for throttling.
 * Consequently, we go through a seemingly redundant throttling check.
 * This check also enables the implementation of a single (but slow) tasking
 * code generation, if the user so desires (yeah, sure...).
 *
 * With non-default tasking (AVOID_OMPI_DEFAULT_TASKS) only final, untied,
 * func and arg are forwarded to the ee library; the rest are ignored.
 *
 * @param func:       the task function
 * @param arg:        the argument to the task function
 * @param now:        if true, execute the task immediately
 * @param final:      came with a final clause
 * @param untied:     came with an untied clause (not used by the default
 *                    tasking code below)
 * @param priority:   the task priority; it is clamped to the range
 *                    [0, max_task_prio icv] but is not used further in
 *                    this function
 * @param deparray:   array with all dependences
 * @param noutdeps:   # out dependences
 * @param nindeps:    # in dependences
 * @param ninoutdeps: # inout dependences
 */
void _ort_new_task(void *(*func)(void *arg), void *arg,
                  int now, int final, int untied, int priority,
                  void **deparray, int noutdeps, int nindeps, int ninoutdeps)
{
#if defined(AVOID_OMPI_DEFAULT_TASKS)
	ee_new_task(final, untied, func, arg);
#else
	ort_eecb_t      *me = __MYCB;
	ort_task_node_t *tnode;
	int             mefinal = ( __FINALTASK(me) > 0 );

	/* enqueueing forces me to the busy state */
	/* (finish_all_team_tasks() relies on this to notice newly created tasks) */
	me->tasking.status = TBUSY;

	if (priority < 0) priority = 0;
	if (priority > ort->icvs.max_task_prio) priority = ort->icvs.max_task_prio;

	/* If dependencies are present, follow another route (unless I am final) */
	if  (deparray && !mefinal)
	{
		tdeps_issue_task(ort_task_alloc_init(func, arg, final, -1, me),
		                 deparray, noutdeps, nindeps, ninoutdeps);
		return;
	}

	/* Check whether i am a final task or if i should throttle (VVD, OpenMP v45);
	 * if the latter test seems to slow things down, we should add a new
	 * parameter to _ort_new_task() to inform it whether the throttling check
	 * has already been performed.
	 */
	if (now || mefinal || _ort_task_throttling())
	{
#if ORT_DEBUG & DBG_TASKS
		if (mefinal)
			me->tasking.throttled_final++;
#endif

		/* If so then execute immediately; my children will also be final */
		/* NOTE: tnode is only used as an opaque handle here; what the start
		 * call returns is handed back unchanged to the end call.
		 */
		tnode = _ort_task_immediate_start(mefinal);
		(*func)(arg);

		/* The pointer to the node that carries the argument area is stored
		 * right in front of arg (see PP); release that node.
		 */
		if (arg != NULL)
			ort_task_free(me, *((ort_task_node_t **)PP(arg)));
		_ort_task_immediate_end(tnode);

		return;
	}

	/* Deferred task: put it at the bottom of my queue */
	ort_enqueue_task(me, ort_task_alloc_init(func, arg, final, -1, me));
	testnotset(TEAMINFO(me)->at_least_one_task);      /* Inform my mates */
#endif
}


#if !defined(AVOID_OMPI_DEFAULT_TASKS)

/* This is called only from _ort_taskwait().
 * Finds one task and executes it: my own queue is tried first (according
 * to the dequeue policy) and, if that yields nothing, the queues of my
 * mates are tried, starting from `startfrom'.
 * Returns the id of the thread whose queue provided the task, or -1 if no
 * task was found anywhere.
 */
static int execute_any_task(ort_eecb_t *me, int startfrom)
{
	int             source = -1;
	ort_task_node_t *job;

	/* my own queue comes first */
	if (ort->task_dequeue_policy == LIFO)
		job = dequeue_task_lifo(me);
	else
		job = dequeue_task_fifo(me);

	if (job == NULL)   /* nothing there; resort to stealing */
		job = steal_any_task(me, startfrom, &source);
	else
	{
		source = me->thread_num;
#if ORT_DEBUG & DBG_TASKS
		me->tasking.tasks_executed_by_worker++;
#endif
	}

	if (job != NULL)
		ort_task_execute_this(me, job);
	return source;
}


/* This is called only from _ort_taskwait().
 * Visits my mates (starting from the one after me, wrapping around) and
 * returns the id of the first one whose implicit task's children counter
 * is registered and greater than 1; returns -1 if there is no such mate.
 */
static int check_for_tasks(ort_eecb_t *me)
{
	int              nmates = me->num_siblings, k, id;
	ort_task_queue_t *queues = TEAMINFO(me)->queues;

	for (k = me->thread_num + 1; k < nmates + me->thread_num; k++)
	{
		id = k % nmates;

		NON_TSO_FENCE;
		if (queues[id].implicit_task_children == NULL)
			continue;                  /* no implicit task registered there */
		if (*(queues[id].implicit_task_children) > 1)
			return id;                 /* has unfinished child tasks */
	}

	return -1;
}

#endif


/* How = 0 (wait for my children),
 *       1 (wait for all team tasks),
 *       2 (wait at the end of parallel)
 *
 * NOTICE: taskwait(1) is now NEVER called!
 *
 * With non-default tasking the call is simply forwarded to the ee library.
 * Otherwise the function returns immediately for a team of one thread and,
 * when how < 2, also if no task has been created in the team so far.
 */
void _ort_taskwait(int how)
{
#if defined(AVOID_OMPI_DEFAULT_TASKS)
	ort_eecb_t *me = __MYCB;
	if (me->parent)
		ee_taskwait(how, me->parent->ee_info, me->thread_num);
	else
		ee_taskwait(how, NULL, me->thread_num);
#else
	ort_eecb_t *me = __MYCB;
	int        victim = -1;

	if (me->num_siblings == 1)
		return;
	if (how < 2 && TEAMINFO(me)->at_least_one_task == 0)
		return;

	if (how == 0) /* execute till all my child tasks finish */
	{
		/* A count of 1 means that only the task's own virtual child is left;
		 * while waiting, I execute whatever task I can find. The result of
		 * execute_any_task() is ignored, so victim stays -1 here.
		 */
		while (__CURRTASK(me)->num_children > 1)
		{
			NON_TSO_FENCE;
			execute_any_task(me, victim);
		}
	}
	else
		if (how == 1)   /* help with all the tasks in current team */
		{
			/* This label is also the target of the goto in the how = 2 case */
			EXECUTE_TEAM_TASKS:
			do
			{
				/* keep executing tasks, preferring the last successful victim */
				while ((victim = execute_any_task(me, victim)) != -1)
					;
			}
			/* nothing found; repeat if some mate still has pending children */
			while ((victim = check_for_tasks(me)) != -1);
		}
		else /* how = 2 */
		{
			if (__CURRTASK(me)->rtid == -1) /* Normal task lightweight barrier */
				default_barrier_wait_in_parallel(TEAMINFO(me)->barrier, me->thread_num);
			else /* Parallel for task lightweight barrier */
				goto EXECUTE_TEAM_TASKS;
		}
#endif
}


/**
 * Drain my own queue: keep dequeuing (according to the dequeue policy) and
 * executing tasks until a dequeue attempt returns nothing. No stealing is
 * attempted.
 */
static void execute_my_tasks(ort_eecb_t *me)
{
	ort_task_node_t *job;

	while ((job = (ort->task_dequeue_policy == LIFO) ?
	                  dequeue_task_lifo(me) : dequeue_task_fifo(me)) != NULL)
	{
		ort_task_execute_this(me, job);
#if ORT_DEBUG & DBG_TASKS
		me->tasking.tasks_executed_by_worker++;
#endif
	}
}


/* Return from the enclosing (void) function if cancellation is enabled and
 * the given flag is set. The macro expands to an unbraced `if ... return'
 * that is completed by the caller's semicolon, so it must be used as a
 * statement of its own and never be directly followed by an `else'.
 */
#define return_if_cancelled(flag) if (CANCEL_ENABLED() && (flag)) return


/**
 * All threads enter this and cooperate until there is no task
 * remaining to execute. A consensus protocol is in effect to ensure
 * all tasks have finished. While it tries to be a self-contained function,
 * two other functions affect the protocol:
 *   _ort_new_task() forces a state change to TBUSY -- there is no other
 *                  way since we cannot know when new tasks are enqueued
 *   steal_from()   increases the global nonidle counter if it manages to
 *                  steal from the idle state -- this is necessary so as to
 *                  avoid a race condition where a thread enqueues a last
 *                  task and an idle thread manages to steal it
 *
 * Each thread moves between three states:
 *   TBUSY   it executes the tasks of its own queue
 *   TEMPTY  its own queue is empty; it executes stolen tasks
 *   TIDLE   it found nothing to steal and has decreased the team's nonidle
 *           counter; it keeps trying to steal until the counter is 0
 * The function returns when the nonidle counter reaches 0, or earlier if
 * cancellation is enabled and the parallel region has been cancelled.
 *
 * @param me the eecb of the calling thread
 *
 */
void finish_all_team_tasks(ort_eecb_t *me)
{
	int victim = -1;
	volatile int *nonidle = &(TEAMINFO(me)->nonidle),
	             *canpar = &(TEAMINFO(me)->cancel_par_active);
	ort_task_node_t *tnode;

	NON_TSO_FENCE;
	me->tasking.status = TBUSY;   /* start off as busy */

	STATE_TBUSY:
		return_if_cancelled(*canpar);
		execute_my_tasks(me);       /* start with my own tasks */
		me->tasking.status = TEMPTY;      /* my queue is now empty */
		/* fall through to the empty state */

	STATE_TEMPTY:
		/* try to steal tasks; always start from the last victim */
		while ((tnode = steal_any_task(me, victim, &victim)) != NULL)
		{
			/* no state transition */
			ort_task_execute_this(me, tnode);
			return_if_cancelled(*canpar);
			/* _ort_new_task() sets the status to TBUSY behind my back */
			if (me->tasking.status == TBUSY) /* i enqueued new tasks */
				goto STATE_TBUSY;
		};

		/* no more to steal, my queue is empty => i become idle */
		me->tasking.status = TIDLE;
		ATOMIC_ADD(nonidle, -1, __CURRTASK(me)->parent->lock);
#if ORT_DEBUG & DBG_TASKS
		me->tasking.empty2idle++;
#endif
		/* fall through to the idle state */

	STATE_IDLE:
		while (*nonidle)  /* try to steal a task until all threads are idle */
		{
			return_if_cancelled(*canpar);
			if ((tnode = steal_any_task(me, -1, &victim)) != NULL)
			{
				/* state transition; but don't update counter; steal_from() did it */
				me->tasking.status = TEMPTY;
#if ORT_DEBUG & DBG_TASKS
				me->tasking.idle2empty++;
#endif
				ort_task_execute_this(me, tnode);
				if (me->tasking.status == TBUSY) /* i enqueued new tasks */
					goto STATE_TBUSY;
				else
					goto STATE_TEMPTY;
			}
		}
		/* done*/
}


/* Task throttling.
 * Tells the caller whether the next task should be executed immediately
 * instead of being enqueued (returns 1) or not (returns 0).
 * Throttling is a per-thread flag; once raised, this function keeps
 * answering 1 until somebody clears it. The flag is raised here when:
 *   - the team consists of a single thread, or
 *   - my queue holds TASKQUEUESIZE (or more) tasks.
 * With non-default tasking, the flag is just reported.
 */
int _ort_task_throttling(void)
{
#if defined(AVOID_OMPI_DEFAULT_TASKS)
	return __check_throttling();
#else

	ort_eecb_t       *self = __MYCB;
	ort_task_queue_t *myq;
	int              bot, top;

	/* Already in throttle mode? */
	if (__check_throttling())
		return 1;

	/* A team of one always throttles */
	if (self->num_siblings == 1)
	{
#if ORT_DEBUG & DBG_TASKS
		self->tasking.throttled_serial++;
#endif
		__start_throttling();
		return 1;
	}

	/* Otherwise it all depends on the room left in my queue */
	myq = &(TEAMINFO(self)->queues[self->thread_num]);
	bot = atomic_read( &(myq->bottom) );
	top = atomic_read( &(myq->top) );
	if ((bot - top) < TASKQUEUESIZE)
		return 0;    /* there is room; no reason to throttle */

	/* No room left in the queue */
#if ORT_DEBUG & DBG_TASKS
	self->tasking.throttled_queue++;
	self->tasking.in_throttle++;
#endif
	__start_throttling();
	return 1;

#endif
}


/* Only called from othr.c, when in nestable locks */
void *ort_get_current_task()
{
	return (void *) __CURRTASK(__MYCB);
}


/* Creates the implicit task of a thread that joins a team and makes it the
 * thread's current (and current implicit) task. The new node is a child of
 * the task the parent thread is currently executing, from which it copies
 * the icvs and the taskscope. With non-default tasking the job is delegated
 * to the ee library.
 */
void ort_start_implicit_task(ort_eecb_t *thr)
{
	ort_eecb_t      *parent_thread = thr->parent;
	ort_task_node_t *tnode, *parent_task;

#if defined(AVOID_OMPI_DEFAULT_TASKS)

	ee_start_implicit_task(&thr->ee_info, parent_thread->ee_info);
	return;

#else
	task_pools_init(thr);

	/* The parent thread must own its task node; if it runs on an inherited
	 * one, it gets a node of its own now.
	 */
	if (__INHERITASK(parent_thread))
		ort_create_task_immediate_node(parent_thread);
	parent_task = __CURRTASK(parent_thread);

	tnode = ort_task_alloc(NULL, NULL);
	tnode->rtid              = -1;   /* Not a parallel for task */
	tnode->func              = NULL;
	tnode->num_children      = 1;    /* virtual child: no freeing before the end */
	tnode->next              = NULL;
	tnode->parent            = parent_task;
	tnode->inherit_task_node = 0;
	tnode->icvs              = parent_task->icvs;
	tnode->isfinal           = 0;
	tnode->taskgroup         = NULL;
	tnode->taskscope         = parent_task->taskscope;  /* TODO: is this correct? */
	tnode->dependencies      = NULL;

	/* Per-level user settings apply only if this is not a team of 1 */
	if (thr->activelevel != parent_thread->activelevel)
	{
		/* OpenMP 3.1: user-supplied number of threads for this level */
		if (thr->activelevel < ort->set_nthrlevs)
			tnode->icvs.nthreads = ort->nthr_per_level[thr->activelevel];

		/* OpenMP 4.0: user-supplied binding for this level */
		if (thr->activelevel-1 < ort->set_bindlevs)
			tnode->icvs.proc_bind = ort->bind_per_level[thr->activelevel-1];
	}

	/* Expose my children counter through my queue; the barrier task wait
	 * looks at it.
	 */
	parent_thread->mf->queues[thr->thread_num].implicit_task_children
	  = &(tnode->num_children);

	/* The parent task got one more child */
	ATOMIC_ADD(&(parent_task->num_children), +1, parent_task->lock);
	__SETCURRTASK(thr, tnode);
	__SETCURRIMPLTASK(thr, tnode);

#endif
}


/* Ends the implicit task of a thread: the parent task loses one child, the
 * parent task becomes the thread's current (and current implicit) task and
 * the implicit task's node is recycled, together with any dependence info.
 * Does nothing with non-default tasking.
 */
void ort_finish_implicit_task(ort_eecb_t *thr)
{
#if !defined(AVOID_OMPI_DEFAULT_TASKS)
	ort_task_node_t *tnode = __CURRTASK(thr);
	ort_task_node_t *father = tnode->parent;

	ATOMIC_ADD(&(father->num_children), -1, father->lock);

	/* (VVD) no taskwait is needed at this point; the parallel barrier has
	 * already executed all tasks.
	 */

#if ORT_DEBUG & DBG_TASKS
	task_stats();
#endif

	__SETCURRTASK(thr, father);
	__SETCURRIMPLTASK(thr, father);

	if (tnode->dependencies)
		tdeps_free_tdepinfo(tnode->dependencies);
	ort_task_free(thr, tnode);              /* recycle task node */
#endif
}


/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
 *                                                                   *
 * TASK REDUCTIONS AND SCOPES                                        *
 *                                                                   *
 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */


/* Assuming an original var X, each task uses a local LX and each thread 
 * uses a private PX to perform the reduction. For each task executed by a
 * thread, all task-local variables LX are reduced onto the thread's PX. 
 * Thus, at the end, all threads that executed at least 1 task, will be left 
 * with one PX each. These thread-private variables will be reduced by the 
 * taskgroup owner in a separate phase when time comes (e.g. end of 
 * taskgroup region).
 */

/* This is what the compiler provides: one such element per reduction
 * variable, in the array passed to _ort_taskscope_start().
 */
typedef struct {
	void *origvar;   /* The original var */
	int  size;       /* Size in bytes */
	/* How to reduce; the runtime calls it as combiner(into, from, size) */
	void (*combiner)(void*, void*, int size);
} _compiler_trv_t;


/**
 * @brief The compiler calls this to create a new task reduction scope
 *        (i.e. specify the variables to be reduced)
 * 
 * @param num    the number of variables 
 * @param datarr a _compiler_trv_t array specifying the original reduction vars
 */
void _ort_taskscope_start(int num, void *datarr)
{
	_compiler_trv_t *v = (_compiler_trv_t *) datarr;
	ort_eecb_t *me = __MYCB;
	taskscope_t *ts = ort_alloc(sizeof(taskscope_t));
	int i;
	
	ts->parent = __CURRTASK(me)->taskscope;
	__CURRTASK(me)->taskscope = ts;
	ts->reds = (taskred_t *) ort_alloc(num * sizeof(taskred_t));
	ts->numreds = num;
	DBGPRN((stderr, "[task reduction scope %p] starting {\n", ts));
	for (i = 0; i < num; i++)
	{
		ts->reds[i].origvar = v[i].origvar;
		ts->reds[i].size = v[i].size;
		ts->reds[i].combiner = v[i].combiner;
		/* NULL pointers */
		ts->reds[i].thrvar = ort_calloc(me->num_siblings * sizeof(void *));
		DBGPRN((stderr, "  added var %p (%d bytes)\n", v[i].origvar, v[i].size));
	}
	DBGPRN((stderr, "}\n", ts));
}


/**
 * @brief The compiler calls this when a task is done with its local copy of
 *        a reduction variable. The local copy is folded into the calling
 *        thread's private copy, using the combiner that was registered with
 *        the variable (which is why no operator is passed here).
 *
 * Nothing happens if the current task has no reduction scope at all or if
 * the variable was registered with a non-positive size.
 *
 * @param origvar  The original variable
 * @param localvar The task-local variable
 */
void _ort_task_reduce(void *origvar, void *localvar)
{
	ort_eecb_t  *me = __MYCB;
	taskscope_t *ts = __CURRTASK(me)->taskscope;
	taskred_t   *tr = NULL;
	void        **slot;
	int         k;

	DBGPRN((stderr, "[task red. scope %p]: reducing var %p onto %p\n", 
	                ts, localvar, origvar));
	if (ts == NULL)
		return;

	/* look the variable up, from the innermost scope outwards */
	for (; ts != NULL; ts = ts->parent)
	{
		for (k = 0; k < ts->numreds; k++)
			if (ts->reds[k].origvar == origvar)
			{
				tr = &ts->reds[k];
				break;
			}
		if (tr != NULL)
			break;
	}
	if (tr == NULL)
		ort_error(1, "unknown reduction variable (%p)\n", origvar);

	if (tr->size <= 0)   /* Just in case it was a zlas */
		return;

	/* my thread-private copy: combine onto it if it exists... */
	slot = &(tr->thrvar[me->thread_num]);
	if (*slot != NULL)
	{
		(*tr->combiner)(*slot, localvar, tr->size);
		return;
	}

	/* ...or create it now, as a copy of the local variable */
	*slot = ort_alloc(tr->size);
	memcpy(*slot, localvar, tr->size);
}


/**
 * @brief The compiler calls this to end the current task reduction scope.
 *        It is assumed that all threads have finished all their tasks.
 *        All reductions are effected onto the original variables here.
 *
 * For every registered variable, the existing thread-private copies are
 * combined onto the first one found, which is then combined onto the
 * original variable. All memory of the scope is released (private copies,
 * per-thread tables, the table of reductions and the scope itself) and the
 * parent scope becomes the current task's scope again.
 * Nothing is done if the current task has no reduction scope.
 */
void _ort_taskscope_end()
{
	ort_eecb_t *me = __MYCB;
	taskscope_t *ts = __CURRTASK(me)->taskscope;
	taskred_t *tr;
	void *res;
	int i, j;
	
	DBGPRN((stderr, "[task red. scope %p]: ending\n", ts));
	if (ts == NULL)    /* no open scope; same guard as in _ort_task_reduce() */
		return;

	for (i = 0; i < ts->numreds; i++)
	{
		tr = &ts->reds[i];
		for (res = NULL, j = 0; j < me->num_siblings; j++)
			if (tr->thrvar[j])
			{
				DBGPRN((stderr, "  reduce thread-private copies (%p)\n",tr->thrvar[j]));
				/* reduce thread-private copies */
				if (res == NULL)   /* the first one found */
					res = tr->thrvar[j];
				else
				{  /* reduce to the first one */
					(*tr->combiner)(res, tr->thrvar[j], tr->size);
					free(tr->thrvar[j]);
				}
			}
		if (res)   /* now reduce to the original var */
		{
			(*tr->combiner)(tr->origvar, res, tr->size);
			free(res);
		}
		free(tr->thrvar);
	}
	__CURRTASK(me)->taskscope = ts->parent;    /* restore task scope */
	free(ts->reds);    /* allocated in _ort_taskscope_start(); used to leak */
	free(ts);
}


/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
 *                                                                   *
 * TASKGROUPS                                                        *
 *                                                                   *
 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */


/* OpenMP 4.0 */
/*
 * Opens a taskgroup region for the current task. A fresh taskgroup
 * descriptor is obtained, marked as not canceled, chained to the
 * taskgroup the task was in so far (its parent) and installed as the
 * task's current taskgroup.
 * If the thread is still running on an inherited task node
 * (__INHERITASK), an immediate node is created first -- presumably so
 * that the new taskgroup is recorded on a node of its own (TODO confirm).
 * The body is compiled out when AVOID_OMPI_DEFAULT_TASKS is defined.
 */
void _ort_entering_taskgroup(void)
{
#ifndef AVOID_OMPI_DEFAULT_TASKS

	ort_eecb_t *me = __MYCB;
	taskgroup_t *tg = taskgroup_alloc();

	if (__INHERITASK(me))
		ort_create_task_immediate_node(me);

	/* push the new group on top of the task's taskgroup chain */
	tg->is_canceled = 0;
	tg->parent = __CURRTASK(me)->taskgroup;
	__CURRTASK(me)->taskgroup = tg;

#endif
}


/* OpenMP 4.0 */
/*
 * This function sets the taskgroup flag of current task to
 * false in order to denote that thread will exit a taskgroup
 * area. Before exiting the taskgroup area a taskwait is
 * executed.
 */
void _ort_leaving_taskgroup(void)
{
#if !defined(AVOID_OMPI_DEFAULT_TASKS)

	taskgroup_t *tg = __CURRTASK(__MYCB)->taskgroup;
	_ort_taskwait(0);                              /* wait for my children */
	__CURRTASK(__MYCB)->taskgroup = tg->parent;   /* restore previous taskgroup */
	taskgroup_free(tg);                           /* cleanup */

#endif
}


/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
 *                                                                   *
 * DEBUGGING                                                         *
 *                                                                   *
 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */

#if ORT_DEBUG & DBG_TASKS

/**
 * @brief Prints the calling thread's tasking counters on stderr, in a
 *        single fprintf() call, and then resets all of them to zero.
 *        Only compiled when task debugging (DBG_TASKS) is enabled.
 */
static
void task_stats(void)
{
	ort_eecb_t *me = __MYCB;

	fprintf(stderr, "task stats @ thread %d:\n\t"
	        "             enqueued: %ld\n\t"
	        "  dequeued & executed: %ld\n\t"
	        "                stole: %ld\n\t"
	        "throttled (immediate): %ld\n\t"
	        " {\n\t"
	        "     due to full pool:   %ld\n\t"
	        "    due to full queue:   %ld\n\t"
	        "     due to if(FALSE):   %ld\n\t"
	        "         due to final:   %ld\n\t"
	        "  outside of parallel:   %ld\n\t"
	        "     rest (fast code):   %ld\n\t"
	        "    got in throttling:   %ld\n\t"
	        "got out of throttling:   %ld\n\t"
	        "      failed stealing:   %ld\n\t"
	        " }\n\t"
	        "          idle->empty:   %ld\n\t"
	        "          empty->idle:   %ld\n\n",
	        me->thread_num,
	        /* overall activity */
	        me->tasking.tasks_enqueued,
	        me->tasking.tasks_executed_by_worker,
	        me->tasking.tasks_executed_by_thief,
	        me->tasking.throttled,
	        /* throttling breakdown, by cause */
	        me->tasking.throttled_pool,
	        me->tasking.throttled_queue,
	        me->tasking.throttled_if,
	        me->tasking.throttled_final,
	        me->tasking.throttled_serial,
	        /* "rest": throttled tasks not attributed to any cause above */
	        me->tasking.throttled - (me->tasking.throttled_pool +
	                                 me->tasking.throttled_queue +  me->tasking.throttled_if +
	                                 me->tasking.throttled_serial + me->tasking.throttled_final),
	        me->tasking.in_throttle,
	        me->tasking.out_throttle,
	        me->tasking.fail_theft_attemts,
	        /* state transitions */
	        me->tasking.idle2empty,
	        me->tasking.empty2idle
	       );

	/* start counting afresh: enqueue/execute counters */
	me->tasking.tasks_enqueued =
	me->tasking.tasks_executed_by_worker =
	me->tasking.tasks_executed_by_thief = 0;

	/* throttling counters */
	me->tasking.throttled =
	me->tasking.throttled_pool =
	me->tasking.throttled_queue =
	me->tasking.throttled_if =
	me->tasking.throttled_final =
	me->tasking.throttled_serial = 0;
	me->tasking.in_throttle =
	me->tasking.out_throttle = 0;

	/* stealing and state-transition counters */
	me->tasking.fail_theft_attemts = 0;
	me->tasking.idle2empty =
	me->tasking.empty2idle = 0;
}

#endif

/* Remove the ATOMIC_ADD and USE_ATOMICS macro definitions (if any) from here on */
#undef ATOMIC_ADD
#ifdef USE_ATOMICS
	#undef USE_ATOMICS
#endif
