/*
  OMPi OpenMP Compiler
  == Copyright since 2001 the OMPi Team
  == Dept. of Computer Science & Engineering, University of Ioannina

  This file is part of OMPi.

  OMPi is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or
  (at your option) any later version.

  OMPi is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with OMPi; if not, write to the Free Software
  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/

/* tasks.c -- OMPi RunTime library: tasking */

/*
 * 2021/12/17:
 *   major refactoring
 * [...]
 * 2010/11/20:
 *   added calls for getting & setting task icvs..
 * Version 1.0.1j:
 *   first time around, out of ort.c code.
 */

#include "ort_prive.h"
#include <stdlib.h>
#include <stdio.h>

/* Sentinel values used by the scheduling routines of this file */
#define FAILURE 0          /* not referenced in the visible part of this file */
#define NO_TASKS_LEFT -1   /* ort_taskwait() compares execute_any_task()'s result to it */
#define NO_VICTIM_LEFT -1  /* check_for_tasks() result when no mate has pending work */


/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
 *                                                                   *
 * GLOBAL VARIABLES / DEFINITIONS / MACROS                           *
 *                                                                   *
 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */


/* Per-thread task throttling.
 * Three macros hide how the per-thread throttle flag is stored:
 *   __start_throttling()  raises the flag,
 *   __stop_throttling()   clears the flag,
 *   __check_throttling()  yields non-zero while the flag is raised.
 */
#ifdef USE_TLS
	/* The flag lives in a thread-local int */
	TLS_KEYWORD int throttle;
	#define __start_throttling() (throttle = 1)
	#define __stop_throttling()  (throttle = 0)
	/* "+ 0" presumably just prevents the macro from being used as an lvalue */
	#define __check_throttling() (throttle + 0)
#else
	ee_key_t   throttle_key;    /* For thread-specific task throttling */
	/* The flag is stored as the thread-specific value attached to the key */
	#define __start_throttling() ee_setspecific(throttle_key,(void *) 1)
	#define __stop_throttling()  ee_setspecific(throttle_key,(void *) 0)
	#if !defined(AVOID_OMPI_DEFAULT_TASKS)
		#define __check_throttling() (0 != (uintptr_t) ee_getspecific(throttle_key))
	#else
		/* With a foreign tasking layer the decision is delegated to it */
		#define __check_throttling() (ee_check_throttling())
	#endif
#endif


/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
 *                                                                   *
 * QUEUES AND STEALING                                               *
 *                                                                   *
 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */


/**
 * @brief Allocate (or recycle) the task queues of the team I am about to lead.
 *
 * The queue table holds nthr + 1 queues. It is created the first time the
 * thread becomes a parent, grown when a bigger team is requested, and
 * otherwise reused as is. In every case all queues end up empty
 * (top == bottom == 0) and with no implicit-task child counter attached.
 *
 * If the table cannot be grown the program is terminated, since there is
 * no way to continue without task queues.
 *
 * @param me    the thread that owns the queue table
 * @param nthr  the queue table will have nthr + 1 queues
 */
inline void ort_taskqueues_init(ort_eecb_t *me, int nthr)
{
#if !defined(AVOID_OMPI_DEFAULT_TASKS)
	int i;
	int nqueues = nthr + 1;

	/* Thread becomes parent for first time */
	if (me->tasking.queue_table == NULL)
	{
		me->tasking.queue_table = (ort_task_queue_t *)
		                          ort_calloc(nqueues * sizeof(ort_task_queue_t));
		for (i = 0; i < nqueues; i++) /* Initialize task queues */
		{
			me->tasking.queue_table[i].top         = 0;
			me->tasking.queue_table[i].bottom      = 0;
			me->tasking.queue_table[i].implicit_task_children = NULL;
			me->tasking.queue_table[i].tasks = (ort_task_node_t **)
			                    ort_calloc(TASKQUEUESIZE * sizeof(ort_task_node_t *));
#if !defined(HAVE_ATOMIC_FAA) || !defined(HAVE_ATOMIC_CAS)
			ee_init_lock((ee_lock_t *) & (me->tasking.queue_table[i].lock),
			             ORT_LOCK_NORMAL);
#endif
		}
		me->tasking.max_children = nqueues;
	}
	else
	{
		if (me->tasking.max_children < nqueues)  /* realloc needed */
		{
			ort_task_queue_t *grown;

			/* Release the slot arrays of the existing queues */
			for (i = 0; i < me->tasking.max_children; i++)
				free(me->tasking.queue_table[i].tasks);

			/* Reallocate queue_table; never use an unchecked realloc result */
			grown = (ort_task_queue_t *)
			        realloc(me->tasking.queue_table,
			                nqueues * sizeof(ort_task_queue_t));
			if (grown == NULL)
			{
				fprintf(stderr,
				        "[ORT error]: cannot grow the task queue table to %d queues\n",
				        nqueues);
				exit(EXIT_FAILURE);
			}
			me->tasking.queue_table = grown;

			/* Every queue (old and new) gets a fresh slot array */
			for (i = 0; i < nqueues; i++)
				me->tasking.queue_table[i].tasks = (ort_task_node_t **)
				                  ort_calloc(TASKQUEUESIZE * sizeof(ort_task_node_t *));
#if !defined(HAVE_ATOMIC_FAA) || !defined(HAVE_ATOMIC_CAS)
			/* Only the newly added queues need their lock initialized */
			for (i = me->tasking.max_children; i < nqueues; i++)
				ee_init_lock((ee_lock_t *) & (me->tasking.queue_table[i].lock),
				             ORT_LOCK_NORMAL);
#endif
			me->tasking.max_children = nqueues;
		}
		/* Reinitialize queue table elements */
		for (i = 0; i < nqueues; i++)
		{
			me->tasking.queue_table[i].top      = 0;
			me->tasking.queue_table[i].bottom   = 0;
			me->tasking.queue_table[i].implicit_task_children = NULL;
		}
	}
#endif
}


#if !defined(AVOID_OMPI_DEFAULT_TASKS)

/**
 * @brief Add a new tasknode in my queue.
 *
 * This function is called by a thread (worker) in order to enqueue a new
 * task_node to its own task queue. It has been guaranteed that there is
 * indeed space in the queue.
 *
 * If the node has a parent task, the parent's child counter is incremented
 * first (atomically, or under the parent's lock when no atomic
 * fetch-and-add is available). The node is then stored at the bottom of
 * the queue and the bottom index is advanced.
 *
 * @param me     the calling thread; its queue is queue_table[me->thread_num]
 *               of me->sdn
 * @param tnode  the task node to enqueue
 */
inline void ort_enqueue_task(ort_eecb_t *me, ort_task_node_t *tnode)
{
	int worker_id = me->thread_num;
	ort_eecb_t *my_parent = me->sdn;
	int old_bottom = atomic_read
	                 (&(my_parent->tasking.queue_table[worker_id].bottom));

	if (tnode->parent != NULL)  /* Add a new child to parent task */
	{
#if defined(HAVE_ATOMIC_FAA)
		_faa(&((tnode->parent)->num_children), 1);
#else
		ee_set_lock(&((tnode->parent)->lock));
		(tnode->parent)->num_children++;
		ee_unset_lock(&((tnode->parent)->lock));
#endif
	}
	my_parent->tasking.queue_table[worker_id].tasks[old_bottom % TASKQUEUESIZE] =
		      tnode;
	my_parent->tasking.queue_table[worker_id].bottom++;
	NON_TSO_FENCE;

	/* Same guard as every other task statistic in this file */
#if ORT_DEBUG & DBG_TASKS
	me->tasking.tasks_enqueued++;
#endif
}


/**
 * @brief Extract a task from my own queue
 * 
 * This function is used to dequeue a task_node from my own task queue.
 * The node is taken from the bottom end of the queue, i.e. the end where
 * ort_enqueue_task() inserts, whereas steal_task() removes from the top.
 *
 * As a side effect the calling thread enters throttle mode if its queue
 * is found to be more than 70% full.
 *
 * @param me  the calling thread
 * @return    the extracted task node, or NULL if the queue was empty or
 *            the last remaining node was taken by a thief in the meantime
 */
static inline ort_task_node_t *dequeue_task(ort_eecb_t *me)
{
	ort_task_node_t *extracting_node;
	ort_eecb_t *my_parent = me->sdn;
	int worker_id = me->thread_num;
	int old_top;
	int new_top;
	int size;

	/* Make a first fast check */
	/* (size is the number of queued nodes minus one) */
	size = atomic_read(&(my_parent->tasking.queue_table[worker_id].bottom))
	       - atomic_read(&(my_parent->tasking.queue_table[worker_id].top)) - 1;

	/* If my queue is almost full, it is safe to enter throttle mode */
	if (size > (int)(TASKQUEUESIZE * 0.7))
		start_throttling_due_to_full_queue();

	if (size < 0) /* Queue is empty */
		return NULL;

	/* Reserve the bottom element by moving bottom one position back */
#if defined(HAVE_ATOMIC_FAA)
	_faa(&(my_parent->tasking.queue_table[worker_id].bottom), -1);
#else
	ee_set_lock(&(my_parent->tasking.queue_table[worker_id].lock));
	my_parent->tasking.queue_table[worker_id].bottom--;
	ee_unset_lock(&(my_parent->tasking.queue_table[worker_id].lock));
#endif

	/* Re-read top: thieves may have advanced it since the fast check */
	old_top = atomic_read(&(my_parent->tasking.queue_table[worker_id].top));
	new_top = old_top + 1;
	size = atomic_read(&(my_parent->tasking.queue_table[worker_id].bottom))
	       - old_top;

	if (size < 0) /* Queue is empty */
	{
		/* Undo the reservation so that bottom == top (empty queue) */
		my_parent->tasking.queue_table[worker_id].bottom = old_top;
        NON_TSO_FENCE;
		return NULL;
	}

	/* The reserved element sits at the (already decremented) bottom */
	extracting_node = my_parent->tasking.queue_table[worker_id]
	                  .tasks[atomic_read(&((my_parent->tasking
	                  .queue_table[worker_id]).bottom)) % TASKQUEUESIZE];
	/* More nodes remain above the reserved one: it is ours right away */
	if (size > 0)
		return extracting_node;

	/* If there is only one task left in queue... */
	/* ...compete for it with any thief by trying to advance top ourselves */
#if defined(HAVE_ATOMIC_CAS)
	/* If a thief stole the last task... */
	if (!_cas(&((my_parent->tasking.queue_table[worker_id]).top),
	          old_top, new_top))
		extracting_node = NULL;/* then return NULL, else return the last task */
#else
	ee_set_lock(&((my_parent->tasking.queue_table[worker_id]).lock));

	if ((my_parent->tasking.queue_table[worker_id]).top == old_top)
		(my_parent->tasking.queue_table[worker_id]).top = new_top;
	else
		extracting_node = NULL;

	ee_unset_lock(&((my_parent->tasking.queue_table[worker_id]).lock));
#endif

	/* old_top + 1 = new_top */
	/* Whoever got the node, top is now new_top; make bottom agree (empty) */
	my_parent->tasking.queue_table[worker_id].bottom = old_top + 1;

	return extracting_node;
}


/**
 * @brief Try to steal a task from another thread
 * 
 * This function is used in order to dequeue a task_node from the top of a
 * victim's task queue; I become a thief. It is also called with my own id
 * as the victim (see execute_any_task() and ort_execute_my_tasks()), in
 * which case my own queue is served from its top end.
 *
 * @param me         the calling thread
 * @param victim_id  index of the queue to take a node from
 * @return           the stolen task node, or NULL if the queue was empty
 *                   or somebody else advanced top first
 */
static inline ort_task_node_t *steal_task(ort_eecb_t *me, int victim_id)
{
	ort_task_node_t *extracting_node;
	ort_eecb_t *my_parent = me->sdn;
	int old_top = atomic_read(&(my_parent->tasking.queue_table[victim_id].top));
	int new_top = old_top + 1;
	int old_bottom = atomic_read(
	                         &(my_parent->tasking.queue_table[victim_id].bottom));
	int size = old_bottom - old_top;  /* number of nodes seen in the queue */

	/* If my queue is almost full, it is safe to enter throttle mode */
	if (me->thread_num == victim_id && size > (int)(TASKQUEUESIZE * 0.7))
		start_throttling_due_to_full_queue();

	if (size <= 0) /* Victim's queue is empty */
		return NULL;
	/* Steal a task from victim's top! */
	/* (the node is read first; it only becomes mine if top gets advanced) */
	extracting_node = my_parent->tasking.queue_table[victim_id]
	                  .tasks[old_top % TASKQUEUESIZE];

#if defined(HAVE_ATOMIC_CAS)
	/* if thief managed to steal the task... */
	if (_cas(&((my_parent->tasking.queue_table[victim_id]).top),
	         old_top, new_top))
		return extracting_node;
#else
	/* No CAS available: compare and advance top under the queue lock */
	ee_set_lock(&((my_parent->tasking.queue_table[victim_id]).lock));
	if ((my_parent->tasking.queue_table[victim_id]).top == old_top)
	{
		(my_parent->tasking.queue_table[victim_id]).top = new_top;
		ee_unset_lock(&((my_parent->tasking.queue_table[victim_id]).lock));
		return extracting_node;
	}
	ee_unset_lock(&((my_parent->tasking.queue_table[victim_id]).lock));
#endif

	/* Top moved in the meantime; the node went to somebody else */
	return NULL;
}
#endif /* AVOID_OMPI_DEFAULT_TASKS */


/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
 *                                                                   *
 * TASKS                                                             *
 *                                                                   *
 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */


/* Puts the calling thread in throttle mode; called by dequeue_task() and
 * steal_task() when the thread's own queue is found more than 70% full.
 * When task debugging is compiled in, the corresponding statistics of the
 * calling thread are updated too.
 */
void start_throttling_due_to_full_queue(void)
{
#if ORT_DEBUG & DBG_TASKS
	__MYCB->tasking.throttled_queue++;
	__MYCB->tasking.in_throttle++;
#endif
	__start_throttling();
}


/* Prepares the per-thread throttling state and leaves throttling switched
 * off for the caller. When thread-local storage is not used, the
 * thread-specific key that holds the flag is created first.
 */
void ort_init_tasking()
{
#ifndef USE_TLS
	ee_key_create(&throttle_key, 0);
#endif
	__stop_throttling();
}


#if !defined(AVOID_OMPI_DEFAULT_TASKS)

/* ort_task_check_throttling
 * Leaves throttle mode if the calling thread's own task queue currently
 * holds fewer than 70% of TASKQUEUESIZE nodes. If the queue is fuller
 * than that, the throttle flag is left untouched.
 */
inline static void ort_task_check_throttling(ort_eecb_t *me)
{
	int        self = me->thread_num;
	ort_eecb_t *sdn = me->sdn;
	int        bot, top, occupancy;

	/* Snapshot my queue's ends (bottom first, then top) */
	bot = atomic_read(&(sdn->tasking.queue_table[self].bottom));
	top = atomic_read(&(sdn->tasking.queue_table[self].top));
	occupancy = bot - top;

	/* Still at or above the 70% mark: nothing changes */
	if (occupancy >= (int)(TASKQUEUESIZE * 0.7))
		return;

	__stop_throttling();
#if ORT_DEBUG & DBG_TASKS
	me->tasking.out_throttle++;
#endif
}

#endif /* AVOID_OMPI_DEFAULT_TASKS */


/* Gives thread thr a task node of its own for the task it is currently
 * executing on an inherited node: a fresh node is allocated as a child of
 * the current task, copies its icvs and taskgroup, records the current
 * final counter, and becomes the thread's current task. The inherit and
 * final counters are decremented before the switch. Finally, if the team
 * has more than one thread, throttling is re-evaluated.
 * Does nothing when the default OMPi tasking layer is not used.
 */
void ort_create_task_immediate_node(ort_eecb_t *thr)
{
#if !defined(AVOID_OMPI_DEFAULT_TASKS)
	ort_task_node_t *node, *enclosing;

	/* Create a task node in order to save task data */
	node = ort_task_alloc(NULL, NULL);
	enclosing = __CURRTASK(thr);

#if !defined(HAVE_ATOMIC_FAA)
	ee_init_lock((ee_lock_t *) & (node->lock), ORT_LOCK_NORMAL);
#endif

	/* Links and inherited state */
	node->parent            = enclosing;
	node->icvs              = enclosing->icvs;
	node->taskgroup         = enclosing->taskgroup;
	node->isfinal           = __FINALTASK(thr);

	/* Fresh state */
	node->func              = NULL;
	node->next              = NULL;
	node->dependencies      = NULL;
	node->rtid              = 0;
	node->inherit_task_node = 0;
	node->num_children      = 1; /* Ensures task is complete before freeing */

	/* The counters are adjusted before the current task is switched */
	__INHERITASK(thr) --;
	if (__FINALTASK(thr) > 0)
		__FINALTASK(thr)--;

	__CURRTASK(thr) = node;

	/* Check whether i have to stop throttling */
	if (thr->num_siblings != 1)
		ort_task_check_throttling(thr);
#endif
}


#if !defined(AVOID_OMPI_DEFAULT_TASKS)

/* Pointer stepping by the size of one pointer (SIZEOF_CHAR_P bytes):
 * NP(X) yields, as a void *, the address SIZEOF_CHAR_P bytes after X;
 * PP(X) yields, as a char *, the address SIZEOF_CHAR_P bytes before X.
 */
#define NP(X) (void*)(((char*)(X)) + SIZEOF_CHAR_P)  /* get next pointer */
#define PP(X) (((char*)(X)) - SIZEOF_CHAR_P)     /* get previous pointer */


/**
 * @brief Execute a task node that was taken out of a queue, then retire it.
 *
 * For a normal task (rtid == -1) the node becomes the thread's current
 * task while its function runs. The function is skipped if the enclosing
 * parallel region has an active cancellation, or if the task belongs to a
 * canceled taskgroup. Any other node is handed to spwtasks_execute_node().
 *
 * Afterwards the parent's child counter and the node's own counter are
 * each decremented; whichever node drops from 1 to 0 is released, together
 * with its dependence info if any.
 *
 * @param me               the executing thread
 * @param task_to_execute  the node to run
 */
void ort_task_execute_this(ort_eecb_t *me, ort_task_node_t *task_to_execute)
{
	ort_task_node_t *prev_task_to_execute;
	int             task_siblings, exec_task_children;

	if (task_to_execute->rtid == -1)  /* If i am to execute a normal task */
	{
		/* First update thread executing status... */
		prev_task_to_execute = __CURRTASK(me);
		__CURRTASK(me) = task_to_execute;

		/* The function gets the address one pointer past funcarg */
		if (!me->sdn->mf->cancel_par_active)
		{
			/* Run unless the task belongs to a canceled taskgroup */
			if (__CURRTASK(me)->taskgroup == NULL ||
			    __CURRTASK(me)->taskgroup->is_canceled == false)
				(task_to_execute->func)(NP(task_to_execute->funcarg));
		}

		/* OpenMP 4.0:
		 * If in a taskgroup, i have to execute my child tasks before returning
		 */
		if (task_to_execute->taskgroup != NULL)
			ort_taskwait(0);

		/* Finished my job; update thread executing status.
		 * The parent may be NULL (see the check below), so guard the access.
		 */
		if (task_to_execute->parent != NULL &&
		    task_to_execute->parent->dependencies)
			tdeps_after_execution(task_to_execute, me);
		__CURRTASK(me) = prev_task_to_execute;
	}
	else /* A special parallel for task */
		spwtasks_execute_node(me, task_to_execute);

	if (task_to_execute->parent != NULL)   /* If task has a parent */
	{
		/* task_siblings receives the counter value before the decrement */
#if defined(HAVE_ATOMIC_FAA)
		task_siblings = _faa(&((task_to_execute->parent)->num_children), -1);
#else
		ee_set_lock(&((task_to_execute->parent)->lock));
		task_siblings = task_to_execute->parent->num_children;
		task_to_execute->parent->num_children--;
		ee_unset_lock(&((task_to_execute->parent)->lock));
#endif
		/* I was the last thing the parent was waiting for: release it */
		if (task_siblings == 1)
		{
			if (task_to_execute->parent->dependencies)
				tdeps_free_tdepinfo(task_to_execute->parent->dependencies);
			ort_task_free(me, task_to_execute->parent);
		}
	}

	/* This task is over, subtract the virtual child of this task */
#if defined(HAVE_ATOMIC_FAA)
	exec_task_children = _faa(&(task_to_execute->num_children), -1);
#else
	ee_set_lock(&((task_to_execute)->lock));
	exec_task_children = task_to_execute->num_children;
	task_to_execute->num_children--;
	ee_unset_lock(&((task_to_execute)->lock));
#endif

	/* Free memory, if task has no children */
	if (exec_task_children == 1)
	{
		if (task_to_execute->dependencies)
			tdeps_free_tdepinfo(task_to_execute->dependencies);
		ort_task_free(me, task_to_execute);
	}
}


/* execute_any_task:
 * Executes one task, if one can be found. My own queue is tried first
 * (from its bottom under the LIFO policy, from its top otherwise). If it
 * yields nothing, the queues of my mates are visited in round-robin order,
 * starting at queue `startfrom' when that is non-negative, or at the queue
 * right after mine otherwise.
 *
 * Returns the index of the queue the executed task came from, or
 * NO_TASKS_LEFT if no task was found.
 */
static int execute_any_task(ort_eecb_t *me, int startfrom)
{
	ort_eecb_t      *my_parent = me->sdn;
	int             self = me->thread_num;
	int             teamsize = my_parent->mf->num_children;
	ort_task_node_t *task;
	int             probe, first, limit;

	task = (ort->ompi_steal_policy == LIFO) ? dequeue_task(me)
	                                        : steal_task(me, self);

	if (task != NULL)      /* Found work in my own queue */
	{
		ort_task_execute_this(me, task);
#if ORT_DEBUG & DBG_TASKS
		me->tasking.tasks_executed_by_worker++;
#endif
		return me->thread_num;
	}

	/* My task queue is empty; have to steal something... */
	if (startfrom >= 0)    /* Full round, beginning at the given queue */
	{
		first = startfrom;
		limit = teamsize + startfrom;
	}
	else                   /* All queues but mine, beginning right after me */
	{
		first = self + 1;
		limit = teamsize + self;
	}

	for (probe = first; probe < limit; probe++)
	{
		/* NOTE(review): only the unwrapped index is compared here, so a
		 * round that starts past my id reaches my own queue again at
		 * probe == self + teamsize; confirm whether that is intended.
		 */
		if (probe == self)
			continue;

		task = steal_task(me, probe % teamsize);
		if (task != NULL)
		{
			ort_task_execute_this(me, task);
#if ORT_DEBUG & DBG_TASKS
			me->tasking.tasks_executed_by_thief++;
#endif
			return probe % (my_parent->mf->num_children);
		}

		/* This victim's queue is also empty, try next victim */
#if ORT_DEBUG & DBG_TASKS
		me->tasking.fail_theft_attemts++;
#endif
	}

	return NO_TASKS_LEFT; /* There was no task left to execute */
}


/* Drains my own queue: keeps taking nodes out of it (from the bottom under
 * the LIFO policy, from the top otherwise) and executing them until an
 * attempt returns nothing. No other queue is ever touched.
 */
void ort_execute_my_tasks(ort_eecb_t *me)
{
	ort_task_node_t *next;

	while (1)
	{
		next = (ort->ompi_steal_policy == LIFO)
		         ? dequeue_task(me)
		         : steal_task(me, me->thread_num);
		if (next == NULL)     /* drained */
			break;

		ort_task_execute_this(me, next);
#if ORT_DEBUG & DBG_TASKS
		me->tasking.tasks_executed_by_worker++;
#endif
	}
}

#endif /* AVOID_OMPI_DEFAULT_TASKS */


/* Bookkeeping done right before a task is executed immediately by the
 * calling thread. No task node is allocated here: the thread's inherit
 * counter is bumped instead, and the final counter is bumped too if it is
 * already positive or if `final' is positive. In teams of more than one
 * thread, throttling is re-evaluated.
 * Returns the caller's control block, to be handed to
 * ort_task_immediate_end() later on (or whatever the foreign tasking layer
 * returns, when the default OMPi tasks are not used).
 */
inline void *ort_task_immediate_start(int final)
{
#if !defined(AVOID_OMPI_DEFAULT_TASKS)
	ort_eecb_t *me = __MYCB;

#if ORT_DEBUG & DBG_TASKS
	me->tasking.throttled++;
#endif

	/* Speed up immediate task execution; I inherited task from my father */
	__INHERITASK(me) ++;

	/* "final" information bookkeeping */
	if (final > 0 || __FINALTASK(me) > 0)
		__FINALTASK(me)++;

	/* Check whether i have to stop throttling */
	if (me->num_siblings != 1)
		ort_task_check_throttling(me);

	return me;
#else
	return (void *)ee_task_immediate_start(final);
#endif
}


/* Bookkeeping done right after an immediately executed task finishes;
 * my_eecb is the value ort_task_immediate_start() returned.
 * If the task ran on an inherited node, only the final and inherit
 * counters are wound back. Otherwise the thread got a node of its own in
 * the meantime: the parent becomes the current task again and the node is
 * recycled, provided its child counter is exactly 1.
 */
inline void ort_task_immediate_end(void *my_eecb)
{
#if !defined(AVOID_OMPI_DEFAULT_TASKS)
	ort_eecb_t      *me = (ort_eecb_t *)my_eecb;
	ort_task_node_t *finished;

	if (__INHERITASK(me) <= 0)
	{
		/* I have a node of my own; step back to its parent */
		finished = __CURRTASK(me);
		__CURRTASK(me) = finished->parent;    /* Restore task node */

		if (finished->num_children == 1)
			ort_task_free(me, finished);        /* Recycle task node */
		return;
	}

	/* I executed a final task immediately, information bookkeeping */
	if (__FINALTASK(me) > 0)
		__FINALTASK(me)--;

	/* I inherited task from my father, nothing else to do */
	__INHERITASK(me) --;
#else
	ee_task_immediate_end(my_eecb);
#endif
}


/**
 * Create a new task.
 * The compiler is expected to have issued a throttling check before the
 * call; since only I can put tasks in my queue, "no throttling" guarantees
 * that I do have space for a new task.
 * However, because of OpenMP v45 #target-related constructs, this function
 * may also get called without such a prior check, so a seemingly redundant
 * throttling check is performed here as well. That check also makes a
 * single (but slow) tasking code generation scheme possible.
 *
 * Routes, in order:
 *  - dependences present and I am not final: the task is handed to
 *    tdeps_issue_task();
 *  - `now' is set, or I am final, or I am throttling: the task function is
 *    called on the spot;
 *  - otherwise the task is placed in my queue.
 *
 * @param func:       the task function
 * @param arg:        the argument to the task function
 * @param now:        if true, execute the task immediately
 * @param final:      came with a final clause
 * @param untied:     came with an untied clause
 * @param priority:   requested priority; clamped to [0, max_task_prio]
 *                    NOTE(review): not used any further in this function
 * @param deparray:   array with all dependences
 * @param noutdeps:   # out dependences
 * @param nindeps:    # in dependences
 * @param ninoutdeps: # inout dependences
 */
void ort_new_task(void *(*func)(void *arg), void *arg,
                  int now, int final, int untied, int priority,
                  void **deparray, int noutdeps, int nindeps, int ninoutdeps)
{
#if !defined(AVOID_OMPI_DEFAULT_TASKS)
	ort_eecb_t *me = __MYCB;
	void       *cookie;        /* what ort_task_immediate_start() gave back */
	int        mefinal = ( __FINALTASK(me) > 0 );

	if (priority < 0)
		priority = 0;
	if (priority > ort->icvs.max_task_prio)
		priority = ort->icvs.max_task_prio;

	/* If dependencies are present, follow another route (unless I am final) */
	if (deparray != NULL && !mefinal)
	{
		tdeps_issue_task(ort_task_alloc_init(func, arg, final, -1, me),
		                 deparray, noutdeps, nindeps, ninoutdeps);
		return;
	}

	/* Deferred route: not forced, not final and not throttling
	 * (VVD, OpenMP v45). If the throttling test seems to slow things down,
	 * a new parameter could inform ort_new_task() that the check has
	 * already been performed.
	 */
	if (!now && !mefinal && !ort_task_throttling())
	{
		ort_enqueue_task(me, ort_task_alloc_init(func, arg, final, -1, me));
		testnotset(me->sdn->tasking.never_task);      /* Inform my mates */
		return;
	}

#if ORT_DEBUG & DBG_TASKS
	if (mefinal)
		me->tasking.throttled_final++;
#endif

	/* Execute immediately; my children will also be final.
	 * NOTE(review): mefinal (not `final') is what gets passed on here.
	 */
	cookie = ort_task_immediate_start(mefinal);
	(*func)(arg);

	/* The node is recovered through the pointer stored right before arg */
	if (arg != NULL)
		ort_task_free(me, *((ort_task_node_t **)PP(arg)));
	ort_task_immediate_end(cookie);
#else
	ee_new_task(final, untied, func, arg);
#endif
}


#if !defined(AVOID_OMPI_DEFAULT_TASKS)
/* Looks, in round-robin order starting right after me, for a mate whose
 * registered implicit-task child counter is greater than 1. Returns the
 * index of the first such mate, or NO_VICTIM_LEFT if there is none. My own
 * entry is not examined.
 */
static int check_for_tasks(ort_eecb_t *me)
{
	ort_task_queue_t *queues = me->sdn->tasking.queue_table;
	int              nmates = me->num_siblings;
	int              probe;

	/* Search for # of unfinished tasks in my mates queues */
	for (probe = me->thread_num + 1; probe < nmates + me->thread_num; probe++)
	{
		ort_task_queue_t *mate = &queues[probe % nmates];

		NON_TSO_FENCE;
		if (mate->implicit_task_children == NULL)
			continue;
		if (*(mate->implicit_task_children) > 1)
			return (probe % nmates);
	}

	return NO_VICTIM_LEFT;
}
#endif


/* How = 0 (wait for my children),
 *       1 (wait for all team tasks),
 *       2 (wait at the end of parallel)
 *
 * Returns at once in a team of one thread, and also (for how < 2) when the
 * never_task flag of my parent is still 0.
 * With how == 2 a normal task (rtid == -1) just goes through the team
 * barrier; any other current task helps with the team's tasks instead,
 * exactly as how == 1 does.
 */
void ort_taskwait(int how)
{
#if !defined(AVOID_OMPI_DEFAULT_TASKS)
	ort_eecb_t *me = __MYCB;
	int        victim = NO_VICTIM_LEFT;

	if (me->num_siblings == 1)
		return;
	if (how < 2 && me->parent->tasking.never_task == 0)
		return;

	/* Normal task lightweight barrier */
	if (how == 2 && __CURRTASK(me)->rtid == -1)
	{
		parallel_barrier_wait(TEAMINFO(me)->barrier, me->thread_num);
		return;
	}

	if (how <= 0)  /* execute till all my child tasks finish */
	{
		while (__CURRTASK(me)->num_children > 1)
		{
			NON_TSO_FENCE;
			execute_any_task(me, victim);
		}
		return;
	}

	/* help with all the tasks in current team (this is also the
	 * parallel for task lightweight barrier)
	 */
	do
	{
		/* Keep executing; each round starts from the last fruitful queue */
		while ((victim = execute_any_task(me, victim)) != NO_TASKS_LEFT)
			;
	}
	while ((victim = check_for_tasks(me)) != NO_VICTIM_LEFT);
#else
	ort_eecb_t *me = __MYCB;

	if (me->parent)
		ee_taskwait(how, me->parent->ee_info, me->thread_num);
	else
		ee_taskwait(how, NULL, me->thread_num);
#endif
}


/* Task throttling.
 * Tells the caller whether new tasks should be executed immediately
 * (returns 1) or may be queued (returns 0). Throttling is a per-thread
 * flag. Here it gets raised when the team has a single thread, or when my
 * queue already holds TASKQUEUESIZE or more nodes; once raised, this
 * function keeps answering 1 until the flag is cleared elsewhere.
 * With a foreign tasking layer the answer is simply the flag's state.
 */
int ort_task_throttling(void)
{
#if !defined(AVOID_OMPI_DEFAULT_TASKS)
	ort_eecb_t       *me = __MYCB;
	ort_task_queue_t *myq;
	int              bot, top;

	/* Check if i am already throttling */
	if (__check_throttling())
		return 1;

	/* Check if my team consists of one thread */
	if (me->num_siblings == 1)
	{
#if ORT_DEBUG & DBG_TASKS
		me->tasking.throttled_serial++;
#endif
		__start_throttling();
		return 1;
	}

	/* Check queue */
	myq = &(me->sdn->tasking.queue_table[me->thread_num]);
	bot = atomic_read( &(myq->bottom) );
	top = atomic_read( &(myq->top) );

	if ((bot - top) < TASKQUEUESIZE)
		return 0;  /* No reason to throttle */

	/* Q about to be full */
#if ORT_DEBUG & DBG_TASKS
	me->tasking.throttled_queue++;
	me->tasking.in_throttle++;
#endif
	__start_throttling();
	return 1;
#else
	return __check_throttling();
#endif
}


/* Returns the calling thread's current task as an opaque pointer.
 * Only called from othr.c, when in nestable locks.
 */
void *ort_get_current_task()
{
	void *current = (void *) __CURRTASK(__MYCB);

	return current;
}


/* Sets up the implicit task of thread thr, as a child of the task its
 * parent thread is currently executing. The new node copies the parent
 * task's icvs (adjusted with per-level user settings when a new active
 * level is entered), is registered in the parent's queue table so that
 * mates can inspect its child counter, and becomes both the current and
 * the current implicit task of thr.
 * With a foreign tasking layer, the work is delegated to it.
 */
void ort_start_implicit_task(ort_eecb_t *thr)
{
	ort_eecb_t      *parent_thread = thr->parent;
#if !defined(AVOID_OMPI_DEFAULT_TASKS)
	ort_task_node_t *tnode, *parent_task;

	task_pools_init(thr);

	/* The parent needs a task node of its own before it can get children */
	if (__INHERITASK(parent_thread))
		ort_create_task_immediate_node(parent_thread);

	parent_task = __CURRTASK(parent_thread);

	tnode = ort_task_alloc(NULL, NULL);

	/* Links and inherited state */
	tnode->parent       = parent_task;
	tnode->icvs         = parent_task->icvs;

	/* Fresh state */
	tnode->rtid         = -1; /* Not a parallel for task */
	tnode->num_children = 1; /* To ensure a task is complete before freeing it */
	tnode->func         = NULL;
	tnode->next         = NULL;
	tnode->taskgroup    = NULL;
	tnode->dependencies = NULL;  /* OpenMP 4.0 task dependencies */
	tnode->isfinal      = 0;
	tnode->inherit_task_node = 0;

	if (thr->activelevel != parent_thread->activelevel) /* i.e. not a team of 1 */
	{
		/* OpenMP 3.1: use the user-supplied value */
		if (thr->activelevel < ort->set_nthrlevs)
			tnode->icvs.nthreads = ort->nthr_per_level[thr->activelevel];

		/* OpenMP 4.0: use the user-supplied value */
		if (thr->activelevel-1 < ort->set_bindlevs)
			tnode->icvs.proc_bind = ort->bind_per_level[thr->activelevel-1];
	}

	/* Save # of children in order to use it in barrier task wait */
	(parent_thread->tasking.queue_table[thr->thread_num]).implicit_task_children
	  = &(tnode->num_children);

	/* The parent task has one more child now */
#if defined(HAVE_ATOMIC_FAA)
	_faa(&(parent_task->num_children), 1);
#else
	ee_set_lock(&(parent_task->lock));
	(parent_task->num_children)++;
	ee_unset_lock(&(parent_task->lock));
#endif
	__SETCURRTASK(thr, tnode);
	__SETCURRIMPLTASK(thr, tnode);
#else
	ee_start_implicit_task(&thr->ee_info, parent_thread->ee_info);
#endif
}


/* Winds up the implicit task of thread thr: the parent task loses one
 * child, the thread helps with any remaining team tasks (teams of more
 * than one thread only), the parent task becomes the current and current
 * implicit task again, and the implicit task's node is recycled along
 * with its dependence info, if any.
 * Does nothing when the default OMPi tasking layer is not used.
 */
void ort_finish_implicit_task(ort_eecb_t *thr)
{
#if !defined(AVOID_OMPI_DEFAULT_TASKS)
	ort_task_node_t *implicit = __CURRTASK(thr);

	/* One child less for the parent task */
#if defined(HAVE_ATOMIC_FAA)
	_faa(&(implicit->parent->num_children), -1);
#else
	ee_set_lock(&(implicit->parent->lock));
	implicit->parent->num_children--;
	ee_unset_lock(&(implicit->parent->lock));
#endif

	/* lightweight barrier: basically help with any tasks you can find */
	if (thr->num_siblings > 1)
		ort_taskwait(1);

#if ORT_DEBUG & DBG_TASKS
	{
		void ort_task_stats(void);
		ort_task_stats();
	}
#endif

	/* Back to the parent task */
	__SETCURRTASK(thr, implicit->parent);
	__SETCURRIMPLTASK(thr, implicit->parent);

	if (implicit->dependencies)
		tdeps_free_tdepinfo(implicit->dependencies);
	ort_task_free(thr, implicit);           /* recycle task node */
#endif
}


/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
 *                                                                   *
 * TASKGROUPS                                                        *
 *                                                                   *
 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */


/* OpenMP 4.0 */
/*
 * Called when the thread enters a taskgroup region. A new taskgroup
 * descriptor is allocated, linked to the taskgroup the current task was
 * in so far (its parent), marked as not canceled, and installed as the
 * current task's taskgroup. If the thread is running on an inherited task
 * node, it is given a node of its own first, so that the taskgroup is
 * attached to that one.
 */
void ort_entering_taskgroup(void)
{
#if !defined(AVOID_OMPI_DEFAULT_TASKS)

	ort_eecb_t  *me = __MYCB;
	taskgroup_t *tg = taskgroup_alloc();

	if (__INHERITASK(me))
		ort_create_task_immediate_node(me);

	tg->next        = NULL;
	tg->is_canceled = 0;
	tg->parent      = __CURRTASK(me)->taskgroup;   /* enclosing taskgroup */

	__CURRTASK(me)->taskgroup = tg;

#endif
}


/* OpenMP 4.0 */
/*
 * This function sets the taskgroup flag of current task to
 * false in order to denote that thread will exit a taskgroup
 * area. Before exiting the taskgroup area a taskwait is
 * executed.
 */
void ort_leaving_taskgroup(void)
{
#if !defined(AVOID_OMPI_DEFAULT_TASKS)

	/* Wait for my children */
	ort_taskwait(0);
	taskgroup_t *deleted_tg;

	/* Taskgroup are is finished */
	deleted_tg = __CURRTASK(__MYCB)->taskgroup;
	__CURRTASK(__MYCB)->taskgroup = __CURRTASK(__MYCB)->taskgroup->parent;
	taskgroup_free(deleted_tg);

#endif
}


/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
 *                                                                   *
 * DEBUGGING                                                         *
 *                                                                   *
 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */

#if ORT_DEBUG & DBG_TASKS

/* Prints the calling thread's tasking statistics on stderr and then
 * resets all of them to 0. The "rest (fast code)" figure is what remains
 * of the throttled count after subtracting the individually counted
 * causes.
 */
void ort_task_stats(void)
{
	ort_eecb_t *me = __MYCB;

	fprintf(stderr, "task stats @ thread %d:\n\t"
	        "             enqueued: %ld\n\t"
	        "  dequeued & executed: %ld\n\t"
	        "                stole: %ld\n\t"
	        "throttled (immediate): %ld\n\t"
	        " {\n\t"
	        "     due to full pool:   %ld\n\t"
	        "    due to full queue:   %ld\n\t"
	        "     due to if(FALSE):   %ld\n\t"
	        "         due to final:   %ld\n\t"
	        "  outside of parallel:   %ld\n\t"
	        "     rest (fast code):   %ld\n\t"
	        "    got in throttling:   %ld\n\t"
	        "got out of throttling:   %ld\n\t"
	        "      failed stealing:   %ld\n\t"
	        " }\n\n",
	        me->thread_num,
	        /* executed / queued */
	        me->tasking.tasks_enqueued,
	        me->tasking.tasks_executed_by_worker,
	        me->tasking.tasks_executed_by_thief,
	        /* throttling: total, then per cause */
	        me->tasking.throttled,
	        me->tasking.throttled_pool,
	        me->tasking.throttled_queue,
	        me->tasking.throttled_if,
	        me->tasking.throttled_final,
	        me->tasking.throttled_serial,
	        me->tasking.throttled - (me->tasking.throttled_pool +
	                                 me->tasking.throttled_queue +  me->tasking.throttled_if +
	                                 me->tasking.throttled_serial + me->tasking.throttled_final),
	        /* throttle mode transitions and failed steals */
	        me->tasking.in_throttle,
	        me->tasking.out_throttle,
	        me->tasking.fail_theft_attemts
	       );

	/* Start counting afresh */
	me->tasking.tasks_enqueued =
	  me->tasking.tasks_executed_by_thief =
	  me->tasking.tasks_executed_by_worker = 0;
	me->tasking.throttled =
	  me->tasking.throttled_pool =
	  me->tasking.throttled_queue =
	  me->tasking.throttled_if =
	  me->tasking.throttled_final =
	  me->tasking.throttled_serial = 0;
	me->tasking.in_throttle =
	  me->tasking.out_throttle =
	  me->tasking.fail_theft_attemts = 0;
}

#endif
