/*
  OMPi OpenMP Compiler
  == Copyright since 2001 the OMPi Team
  == Dept. of Computer Science & Engineering, University of Ioannina

  This file is part of OMPi.

  OMPi is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or
  (at your option) any later version.

  OMPi is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with OMPi; if not, write to the Free Software
  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/

/* parallel.c
 * Handle parallel regions within a CUDA kernel
 */

#include <stdio.h>
#include "globals.h"
#include "locks.h"
#include "parallel.h"
#include "worksharing.h"
#include "barrier.h"
#include "tasks.h"

/*
 * Parallel control variables.
 *
 * Where they are placed is selected at compile time by PARALLEL_MEMTYPE:
 *  - MEMTYPE_SHARED: one `parblock' plus 16 `warpblocks', all declared
 *    with __SHAREDQLFR;
 *  - anything else:  MAX_PARALLEL_BLOCKS `parblocks' plus a 32x16 table
 *    of `warpblocks', all declared with __DEVQLFR.
 *
 * NOTE(review): the 16 presumably is the maximum number of warps per block
 * and the 32 a per-block dimension -- confirm against the __WARPBLOCK and
 * __PARBLOCK macro definitions.
 */
#if PARALLEL_MEMTYPE == MEMTYPE_SHARED
    __SHAREDQLFR ort_parallel_t parblock;
	__SHAREDQLFR ort_parallel_t warpblocks[16];
#else
    __DEVQLFR ort_parallel_t parblocks[MAX_PARALLEL_BLOCKS];
	__DEVQLFR ort_parallel_t warpblocks[32][16];
#endif

/*
 * Returns the `execthr' field of the parallel control block selected
 * by __MYPARBLOCK, i.e. the thread count last recorded for it.
 */
__DEVQLFR
int dev_get_parallel_active_threads(void)
{
	ort_parallel_t *blk = __MYPARBLOCK;

	__CHECKPARBLOCK(blk);

	return blk->execthr;
}

/*
 * Records `num_threads' in the `execthr' field of the parallel control
 * block selected by __MYPARBLOCK. No range checking is done here.
 */
__DEVQLFR
static void dev_set_parallel_active_threads(int num_threads)
{
	ort_parallel_t *blk = __MYPARBLOCK;

	__CHECKPARBLOCK(blk);

	blk->execthr = num_threads;
}

#if PARALLEL_SCHEME == SCHEME_IFMASTER
/*
 * If-master scheme: open a parallel region.
 *
 * The thread whose omp_get_thread_num() is 0 marks the control block
 * selected by __MYPARBLOCK as being at active level 1 and records the
 * team size: `num_threads' as given, or omp_get_num_threads() when the
 * caller passed -1. Every caller then meets at __syncthreads().
 */
__DEVQLFR
static void enter_gpu_parallel_region(int num_threads)
{
	ort_parallel_t *blk = __MYPARBLOCK;

	__CHECKPARBLOCK(blk);

	if (omp_get_thread_num() == 0)
	{
		/* The level is raised before the team size is queried */
		blk->activelevel = 1;

		if (num_threads == -1)
			blk->execthr = omp_get_num_threads();
		else
			blk->execthr = num_threads;
	}

	__syncthreads();
}


/*
 * If-master scheme: close a parallel region.
 *
 * The thread whose omp_get_thread_num() is 0 resets the control block
 * selected by __MYPARBLOCK to a single executing thread at active
 * level 0; every caller then meets at __syncthreads().
 * `num_threads' is accepted but not used.
 */
__DEVQLFR
static void leave_gpu_parallel_region(int num_threads)
{
	ort_parallel_t *blk = __MYPARBLOCK;

	__CHECKPARBLOCK(blk);

	if (omp_get_thread_num() == 0)
	{
		blk->execthr = 1;
		blk->activelevel = 0;
	}

	__syncthreads();
}
#endif


/*
 * Per-warp execution of parallel regions within a task.
 *
 *   func         outlined body of the parallel region
 *   shared       argument handed to func
 *   num_threads  requested team size; -1 asks for the whole warp
 *
 * The warp master sets up the warp's control block, the warp
 * synchronizes, and then every lane whose index within the warp is
 * below the recorded active-thread count calls func(shared). A second
 * __syncwarp() closes the region.
 *
 * Since lanes are selected with (thread number % warpSize), at most
 * warpSize lanes can ever run the region. Requests above warpSize are
 * therefore capped, so that the recorded count equals the number of
 * lanes that really execute func; the set of executing lanes is the
 * same as it would be without the cap.
 *
 * NOTE(review): the level is stored through __WARPBLOCK while the
 * thread count is stored through __MYPARBLOCK (inside
 * dev_set_parallel_active_threads) -- confirm both name the same block
 * when a task is being executed.
 */
__DEVQLFR
static void _task_parallel(void *(*func)(void *), void *shared, int num_threads)
{
	ort_parallel_t *p = &(__WARPBLOCK);
	__CHECKPARBLOCK(p);

	/* Warp masters initialize the parallel region CB */
	if (__ISWARPMASTER)
	{
		p->activelevel = (__PARBLOCK.activelevel + 1);

		/* -1 means "whole warp"; a warp cannot supply more than warpSize */
		if (num_threads == -1 || num_threads > warpSize)
			num_threads = warpSize;

		dev_set_parallel_active_threads(num_threads);
	}

	/* Make the master's bookkeeping visible to the rest of the warp */
	__syncwarp();

	if ((omp_get_thread_num() % warpSize) < dev_get_parallel_active_threads())
		func(shared);

	__syncwarp();
}


/*
 * Combined parallel regions, entered by all threads.
 *
 *   func         outlined body of the parallel region
 *   shared       argument handed to func
 *   num_threads  requested team size; -1 asks for the whole block (__NTHR)
 *
 * Three cases are distinguished:
 *   - the block's control block is already at active level 1: the
 *     current thread just calls func(shared) with its
 *     in_nested_parallel flag raised around the call;
 *   - the caller is executing a task: handed over to _task_parallel();
 *   - otherwise: thread 0 opens the region, all nthr threads meet on
 *     BARRIER_SLOT_DEFAULT, threads with an id below the active-thread
 *     count call func(shared), and a second barrier closes the region.
 *
 * Only threads with thrid < active count run func, so a request larger
 * than nthr can never be honoured. It is capped at nthr so that the
 * recorded count equals the number of threads that really execute the
 * region; the set of executing threads is the same as without the cap.
 */
__DEVQLFR
static void handle_combined_parallel(void *(*func)(void *), void *shared, int num_threads)
{
	thread_cb_t *cb = __MYCB;
	ort_parallel_t *p = &(__PARBLOCK);
	int thrid = __THRID, nthr = __NTHR;

	if (!cb) return;
	__CHECKPARBLOCK(p);

	/* Already inside a parallel region: run the nested one in place */
	if (p->activelevel == 1)
	{
		__MYCB->in_nested_parallel = true;
		func(shared);
		__MYCB->in_nested_parallel = false;
		return;
	}

	/* Parallel regions within tasks require special handling */
	if (_dev_is_executing_task())
	{
		_task_parallel(func, shared, num_threads);
		return;
	}

	if (thrid == 0)
	{
		p->activelevel = 1;

		/* -1 means "whole block"; the block cannot supply more than nthr */
		if (num_threads == -1 || num_threads > nthr)
			num_threads = nthr;

		dev_set_parallel_active_threads(num_threads);

		if (!tasking.initialized)
			tasking_init();
	}

	/* Everyone waits until thread 0 has finished the bookkeeping */
	dev_barrier_wait(BARRIER_SLOT_DEFAULT, nthr);

	if (thrid < dev_get_parallel_active_threads())
		func(shared);

	/* NOTE(review): thread 0 clears the level before the closing barrier,
	 * so threads still inside func may observe activelevel == 0 --
	 * confirm that this cannot affect nested regions they encounter.
	 */
	if (thrid == 0)
		p->activelevel = 0;

	dev_barrier_wait(BARRIER_SLOT_DEFAULT, nthr);
}


/**********************************************
 *                                            *
 *            MASTER-WORKER SCHEME            *
 *                                            *
 **********************************************/
 
/*
 * Master side of the master-worker scheme.
 *
 *   func         outlined body of the parallel region
 *   shared       argument handed to func
 *   num_threads  requested team size; -1 selects
 *                omp_get_num_threads() - warpSize
 *
 * If the control block is already at active level 1 the region is
 * nested and func(shared) is simply called by the current thread, with
 * its in_nested_parallel flag raised around the call. Otherwise the
 * master publishes the region (level, team size, func and its argument)
 * in the control block and goes through two barriers on
 * BARRIER_SLOT_DEFAULT; the master does not call func itself on this
 * path. Afterwards the block is reset to one thread at level 0.
 */
__DEVQLFR
static void execute_master(void *(*func)(void *), void *shared, int num_threads)
{
	thread_cb_t *mycb = __MYCB;
	ort_parallel_t *blk = __MYPARBLOCK;

	if (!mycb)
		return;
	__CHECKPARBLOCK(blk);

	/* Nested region: execute it in place and leave */
	if (blk->activelevel == 1)
	{
		__MYCB->in_nested_parallel = true;
		func(shared);
		__MYCB->in_nested_parallel = false;
		return;
	}

	/* Step 1: describe the encountered region in the control block */
	blk->activelevel = 1;
	if (num_threads == -1)
	{
		num_threads = omp_get_num_threads() - warpSize;
	}
	dev_set_parallel_active_threads(num_threads);
	blk->shvars = shared;
	blk->func = func;

	/* Step 2 (sync #1): release the threads waiting for work */
	dev_barrier_wait(BARRIER_SLOT_DEFAULT, omp_get_num_threads());

	/* Step 3 (sync #2): wait for them, then close the region */
	dev_barrier_wait(BARRIER_SLOT_DEFAULT, omp_get_num_threads());
	dev_set_parallel_active_threads(1);
	blk->activelevel = 0;
}


/*
 * Worker side of the master-worker scheme.
 *
 *   thrid  id compared against the active-thread count to decide
 *          whether this worker takes part in the published region
 *
 * Compiled to an empty function unless PARALLEL_SCHEME is
 * SCHEME_MASTERWORKER. Each iteration waits on BARRIER_SLOT_DEFAULT
 * (sync #1), runs the function published in the control block if this
 * worker is among the active threads, and waits again (sync #2).
 * A NULL function pointer marks the end of the target region and
 * terminates the loop.
 */
__DEVQLFR
void _dev_worker_loop(int thrid)
{
#if PARALLEL_SCHEME == SCHEME_MASTERWORKER
	ort_parallel_t *blk = __MYPARBLOCK;

	__CHECKPARBLOCK(blk);

	for (;;)
	{
		/* Sync #1: sleep here until the master publishes some work */
		dev_barrier_wait(BARRIER_SLOT_DEFAULT, omp_get_num_threads());

		/* Nothing published: the target region is over */
		if (blk->func == NULL)
			break;

		if (thrid < dev_get_parallel_active_threads())
			blk->func(blk->shvars);

		/* Sync #2: everybody leaves the region together */
		dev_barrier_wait(BARRIER_SLOT_DEFAULT, omp_get_num_threads());
	}
#endif
}

/**********************************************
 *                                            *
 *        END OF MASTER-WORKER SCHEME         *
 *                                            *
 **********************************************/

/*
 * Entry point for executing a parallel region.
 *
 *   func         outlined body of the parallel region
 *   shared       argument handed to func
 *   num_threads  requested team size, or -1 when none was requested
 *   iscombined   non-zero selects the combined-region path
 *                (master-worker scheme only)
 *   bind_req     not used by this implementation
 *
 * Master-worker scheme: dispatches to handle_combined_parallel() or
 * execute_master(). Otherwise: the region is opened, every thread whose
 * number is below num_threads (or every thread, for -1) calls
 * func(shared), and the region is closed.
 */
__DEVQLFR
void _ort_execute_parallel(void *(*func)(void *), void *shared, int num_threads,
	                      int iscombined, int bind_req)
{
#if PARALLEL_SCHEME == SCHEME_MASTERWORKER
	if (!iscombined)
	{
		execute_master(func, shared, num_threads);
		return;
	}

	handle_combined_parallel(func, shared, num_threads);
#else
	int tid;

	enter_gpu_parallel_region(num_threads);

	tid = omp_get_thread_num();
	if ((tid < num_threads) || (num_threads == -1))
	{
		func(shared);
	}

	leave_gpu_parallel_region(num_threads);
#endif
}

/*
 * Executes `func(shared)' as a serial region.
 *
 * If-master scheme:
 *     called by all threads; only the thread whose omp_get_thread_num()
 *     is 0 runs `func', and everybody meets at __syncthreads() afterwards.
 * Master/worker scheme:
 *     called only by the master thread, which publishes `func' and
 *     `shared' in the control block and then goes through the two
 *     barriers on BARRIER_SLOT_DEFAULT that frame a region.
 */
__DEVQLFR
void _ort_execute_serial(void *(*func)(void *), void *shared)
{
	ort_parallel_t *blk = __MYPARBLOCK;

	__CHECKPARBLOCK(blk);

#if PARALLEL_SCHEME == SCHEME_MASTERWORKER
	/* Publish the work for the waiting side */
	blk->shvars = shared;
	blk->func = func;

	/* Sync #1: wake up the one thread waiting to run the region */
	dev_barrier_wait(BARRIER_SLOT_DEFAULT, omp_get_num_threads());

	/* Sync #2: the serial region is about to be left */
	dev_barrier_wait(BARRIER_SLOT_DEFAULT, omp_get_num_threads());
#else
	if (omp_get_thread_num() == 0)
	{
		func(shared);
	}

	__syncthreads();
#endif
}
