/*
  OMPi OpenMP Compiler
  == Copyright since 2001 the OMPi Team
  == Dept. of Computer Science & Engineering, University of Ioannina

  This file is part of OMPi.

  OMPi is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or
  (at your option) any later version.

  OMPi is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with OMPi; if not, write to the Free Software
  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/

/* tasks.c
 * This file implements tasking
 */

#include <stdlib.h>
#include <stdio.h>
#include <stdbool.h>
#include "globals.h"
#include "locks.h"
#include "tasks.h"
#include "barrier.h"
#include "queue.h"
#include "parallel.h"
#include "worksharing.h"

/* Tasking state used by every function in this file: the task node pool
 * (tpool), the queue of pending tasks (tasks), the task currently being run
 * by each warp (current_task), the task environment table (taskenvs) and the
 * `initialized' flag set by tasking_init().
 * NOTE(review): __SHAREDQLFR presumably places this in shared memory, making
 * the state per thread block -- confirm against its definition.
 */
__SHAREDQLFR cuda_tasking_t tasking;


// __DEVQLFR
// void  ort_start_implicit_task(ort_eecb_t *thr)
// {
// }

// __DEVQLFR
// void  ort_finish_implicit_task(ort_eecb_t *thr)
// {
// }



// inline int ort_pending_tasks_left(private_eecb_t *me)
// {
// 	int i;

// 	for (i = 0; i < me->num_siblings; i++)
// 		if ((me->parent->tasking.pending_tasks[i] != NULL)
// 		    && (*(me->parent->tasking.pending_tasks[i]) > 0))
// 			return 1;

// 	return 0;
// }


/* Reserves an entry of the task environment table and allocates its data area.
 *
 * The table (tasking.taskenvs) is scanned for the first entry that is not
 * marked occupied; `size' bytes are then obtained with malloc() and recorded
 * in the entry together with `task_func'.
 *
 * @param size       number of bytes to allocate for the task environment
 * @param task_func  the task function this environment belongs to
 * @return the allocated memory, or NULL if the table has no free entry or
 *         the allocation failed (no entry is reserved in either case).
 *
 * The critical section has a single exit: nothing returns between
 * CUDA_SET_LOCK_PARTSYNC and CUDA_UNSET_LOCK_PARTSYNC, so the lock is always
 * released, even when no entry is available.
 *
 * NOTE(review): the lock word is re-zeroed by thread 0 on every call and the
 * function contains a __syncthreads(); this looks like it assumes that all
 * threads of the block call it together -- confirm against the callers.
 */
__DEVQLFR 
void *_ort_taskenv_alloc(int size, void *(*task_func)(void *))
{
	ort_taskenv_t *newtaskenv = NULL;
	void *mem = NULL;
	int i;
	__SHAREDQLFR int taskenvlock;

	if (__THRID == 0)
		taskenvlock = 0;
	__syncthreads();

	CUDA_SET_LOCK_PARTSYNC(taskenvlock, BARRIER_SLOT_1, __NTHR);

	/* Look for the first unoccupied entry */
	for (i = 0; i < MAX_TASKENVS; i++)
	{
		if (!tasking.taskenvs[i].occupied)
		{
			newtaskenv = &(tasking.taskenvs[i]);
			break;
		}
	}

	if (newtaskenv != NULL)
	{
		mem = malloc(size);

		/* Commit the entry only if we actually got memory; otherwise it
		 * would stay occupied forever with a NULL data area.
		 */
		if (mem != NULL)
		{
			newtaskenv->mem = mem;
			newtaskenv->occupied = 1;
			newtaskenv->size = size;
			newtaskenv->task_func = task_func;
			tasking.num_taskenvs++;
		}
	}

	CUDA_UNSET_LOCK_PARTSYNC(taskenvlock);

	return mem;
}

/* Releases the task environment table entry that holds `ptr'.
 *
 * An entry is released only if it is currently occupied, its data area is
 * `ptr' and it was registered for `task_func'. Matching on the function
 * alone (and ignoring the occupied flag) would release the same first
 * matching entry again on every repeated call and decrement
 * tasking.num_taskenvs more than once, while other live environments of the
 * same task function would never be released.
 *
 * @param ptr        the memory previously returned by _ort_taskenv_alloc()
 * @param task_func  the task function the environment was allocated for
 *
 * The data area itself is NOT freed here (see the TODO below), so only the
 * table entry becomes reusable.
 *
 * NOTE(review): the lock word is zeroed by thread 0 without a barrier, and it
 * is a different lock than the one used by _ort_taskenv_alloc(); both look
 * unsafe if threads reach these functions at different times -- confirm the
 * intended calling pattern.
 */
__DEVQLFR 
void _ort_taskenv_free(void *ptr, void *(*task_func)(void *))
{
	int i;
	__SHAREDQLFR int taskenvlockfree;

	if (__THRID == 0)
		taskenvlockfree = 0;

	CUDA_SET_LOCK_PARTSYNC(taskenvlockfree, BARRIER_SLOT_1, __NTHR);
	// free(ptr); TODO Fix double free
	for (i = 0; i < MAX_TASKENVS; i++)
	{
		ort_taskenv_t *env = &(tasking.taskenvs[i]);

		if (env->occupied && env->mem == ptr && env->task_func == task_func)
		{
			env->size = 0;
			env->occupied = 0;
			tasking.num_taskenvs--;
			break;
		}
	}
	CUDA_UNSET_LOCK_PARTSYNC(taskenvlockfree);
}


/* Appends a task node to the queue of pending tasks (tasking.tasks).
 * Returns whatever bq_enqueue() reports, converted to bool.
 */
__DEVQLFR 
static bool enqueue_task(ort_task_node_t *task)
{
	bool queued;

	queued = bq_enqueue(&tasking.tasks, task, sizeof(ort_task_node_t));
	return queued;
}


/* Claims a node from the task pool (tasking.tpool) and fills it in.
 *
 * The first node whose status is below 2 is taken; its status becomes 2 and
 * it records the task function, its argument and the final flag.
 * Returns the claimed node, or NULL when the first tasking.tasks.max_size
 * nodes are all in use.
 */
__DEVQLFR
static ort_task_node_t *create_task(void *(*func)(void *arg), void *arg, 
                                   int final)
{
	ort_task_node_t *slot = NULL;
	int idx = 0;

	/* TODO: Caching */

	/* Linear search for a node that is not in use */
	while (idx < tasking.tasks.max_size)
	{
		if (tasking.tpool[idx].status < 2)
		{
			slot = &(tasking.tpool[idx]);
			break;
		}
		idx++;
	}

	if (slot == NULL)
		return NULL;

	slot->status = 2;
	slot->func = func;
	slot->funcarg = arg;
	slot->isfinal = final;

	return slot;
}


/* Removes one item from the queue of pending tasks and returns its payload
 * as a task node; returns NULL when bq_dequeue() hands back nothing.
 */
__DEVQLFR 
static ort_task_node_t *dequeue_task()
{
	bq_item_t *item = bq_dequeue(&tasking.tasks);

	return (item != NULL) ? (ort_task_node_t *) item->data
	                      : (ort_task_node_t *) NULL;
}


/* Creates a new task, or executes it on the spot when it cannot be deferred.
 *
 * @param func   the task function
 * @param arg    the argument handed to the task function
 * @param now    if non-zero, the task is executed immediately
 * @param final  stored in the task node; tasks created while a final task is
 *               running are executed immediately
 *
 * The remaining parameters (untied, priority, deparray, noutdeps, nindeps,
 * ninoutdeps) are not used by this implementation.
 *
 * Only the thread for which __ISWARPMASTER holds creates the task; every
 * other thread returns without doing anything.
 *
 * NOTE(review): when tasking has not been initialized the function returns
 * without executing `func' at all, i.e. the task is dropped -- confirm this
 * is intended.
 */
__DEVQLFR
void _ort_new_task(void *(*func)(void *arg), void *arg,
                  int now, int final, int untied, int priority,
                  void **deparray, int noutdeps, int nindeps, int ninoutdeps)
{
	ort_task_node_t *curtask = tasking.current_task[__WARPID];
	ort_task_node_t *newtask;
	
	/* (1) Only warp masters can create tasks */
	if (!__ISWARPMASTER) return;
	if (!tasking.initialized) return;

	/* (2) Check if task should be executed now or if the currently executed task 
	 * is final: if yes, just call the task function. */
	if ((now) || (curtask && curtask->isfinal))
	{
		(*func)(arg);
		return;
	}
	
	/* (3) Get a node from the pool and enqueue it.
	 * (Locking through tasking.enqlock is currently disabled.)
	 */
	newtask = create_task(func, arg, final);
	if (newtask != NULL && !enqueue_task(newtask))
	{
		/* The queue rejected the node: hand it back to the pool (status 0 is
		 * what tasking_init() uses for unused nodes), otherwise it would
		 * remain claimed although no task refers to it.
		 */
		newtask->status = 0;
		newtask = NULL;
	}

	/* (4) No node available or enqueueing failed: execute the task here */
	if (newtask == NULL)
		(*func)(arg);
}

/* Tells whether new tasks should be throttled.
 *
 * Returns 1 when there is a single thread, when the number of queued tasks
 * exceeds 70% of the queue capacity, or when the queue is full; returns 0
 * otherwise.
 */
__DEVQLFR
int _ort_task_throttling(void)
{
	/* A single thread: checked first, before touching the queue */
	if (omp_get_num_threads() == 1)
		return 1;

	/* Queue above 70% of its capacity, or completely full */
	return ((tasking.tasks.count > (int)(tasking.tasks.max_size * 0.7)) ||
	        (tasking.tasks.count >= tasking.tasks.max_size)) ? 1 : 0;
}

/* Stub: does nothing and always returns NULL; the `final' argument is
 * ignored.
 */
__DEVQLFR
void *_ort_task_immediate_start(int final)
{
	return NULL;
}

/* Stub: intentionally empty; the `tn' argument is ignored. */
__DEVQLFR
void _ort_task_immediate_end(void *tn)
{
}

/* Initializes the tasking state; does nothing if it is already marked as
 * initialized.
 *
 * Resets all pool nodes, the queue indices and lock, the per-warp current
 * task pointers and the task environment table, and finally initializes the
 * queue of pending tasks with TASK_QUEUE_SIZE entries.
 *
 * NOTE(review): the `initialized' flag is raised before the task environment
 * table and the queue have been set up; a thread observing the flag early
 * could use them prematurely -- confirm callers synchronize after this call.
 */
__DEVQLFR
void tasking_init()
{
	int nwarps = __NWARPS;
	int k;
	ort_taskenv_t *env;

	if (tasking.initialized)
		return;

	/* All pool nodes start out unused */
	k = 0;
	while (k < TASK_QUEUE_SIZE)
	{
		tasking.tpool[k].status = 0;
		k++;
	}

	tasking.bottom = 0;
	tasking.top = 0;
	tasking.lock = 0;

	/* No warp is executing a task yet */
	k = 0;
	while (k < 32)
	{
		tasking.current_task[k] = NULL;
		k++;
	}

	tasking.initialized = true;
	tasking.num_taskenvs = 0;

	/* Empty task environment table */
	for (env = &(tasking.taskenvs[0]), k = 0; k < MAX_TASKENVS; k++, env++)
	{
		env->occupied = 0;
		env->size = 0;
		env->task_func = NULL;
		env->mem = NULL;
	}

	bq_init(&(tasking.tasks), TASK_QUEUE_SIZE, nwarps);
}

/* Returns true if tasking is initialized and the calling thread's warp
 * currently has a task recorded in tasking.current_task; false otherwise.
 */
__DEVQLFR
bool _dev_is_executing_task()
{
	/* The current task entry is consulted only after initialization */
	return tasking.initialized && (tasking.current_task[__WARPID] != NULL);
}


/* Stub: intentionally empty in this implementation. */
__DEVQLFR void start_throttling_due_to_full_queue(void)
{

}

/* Executes queued tasks until the queue is found empty, then waits on
 * BARRIER_SLOT_2.
 *
 * In every round the warp master dequeues one task (under tasking.lock) and
 * stores it in tasking.current_task[__WARPID]; after a __syncwarp() every
 * thread that reaches this point calls the task function with the task's
 * argument. When the task is done the warp master clears the current task
 * entry again.
 *
 * If tasking has not been initialized the function returns right away,
 * WITHOUT taking part in the final barrier.
 *
 * NOTE(review): the queue-empty test at the top of the loop is evaluated by
 * each thread on its own; threads of the same warp could disagree if the
 * count changes in between, so some would leave while others wait in
 * __syncwarp() -- confirm this cannot happen.
 * NOTE(review): nothing here resets the status of the executed pool node, so
 * nodes claimed by create_task() do not seem to be recycled -- confirm
 * whether this is handled elsewhere.
 */
__DEVQLFR
static void execute_any_task()
{
	ort_task_node_t *tnode = NULL;
	if (!tasking.initialized) return;
	
	while (true)
	{
		/* Nothing queued: stop looking for work */
		if (tasking.tasks.count <= 0) break;

		/* Only warp masters dequeue task functions from the tasks BQ.
		 * Then they broadcast them to the remaining warp threads
		 * through the warp's current_task entry.
		 */
		if (__ISWARPMASTER)
		{   
			/* Lock the queue */
			dev_lock(&(tasking.lock));
			tasking.current_task[__WARPID] = dequeue_task();
			dev_unlock(&(tasking.lock));			
		}
		
		/* Wait until the warp master has published the dequeued task */
		__syncwarp();
		
		/* No tasks left; exit. */
		if (!tasking.current_task[__WARPID]) break;
		
		/* Every thread that got here runs the task function */
		tnode = tasking.current_task[__WARPID];
		if (tnode->func)
		{
			(tnode->func)(tnode->funcarg);
		}
		
		/* Nobody may clear the entry before all threads are done with it */
		__syncwarp();
		
		/* We have finished the task, reset the state */
		if (__ISWARPMASTER)
			tasking.current_task[__WARPID] = NULL;
			
		/* Make sure the reset has happened before the next round */
		__syncwarp();
	}
	
	/* Wait on BARRIER_SLOT_2, using the number of threads reported by
	 * dev_get_parallel_active_threads() */
	dev_barrier_wait(BARRIER_SLOT_2, dev_get_parallel_active_threads());
}

/* Task synchronization point: executes the pending tasks.
 *
 * Does nothing when the caller is already executing a task. For how >= 2
 * the threads first synchronize (through BARRIER_SLOT_2 in the master-worker
 * scheme, through __syncthreads() otherwise) and then execute the queued
 * tasks; for smaller values of `how' the tasks are executed directly.
 */
__DEVQLFR 
void _ort_taskwait(int how) 
{
	if (_dev_is_executing_task())
		return;

	if (how >= 2)
	{
#if PARALLEL_SCHEME == SCHEME_MASTERWORKER
		dev_barrier_wait(BARRIER_SLOT_2, dev_get_parallel_active_threads());
#else
		__syncthreads();
#endif
	}

	execute_any_task();
}