/*
  OMPi OpenMP Compiler
  == Copyright since 2001 the OMPi Team
  == Dept. of Computer Science & Engineering, University of Ioannina

  This file is part of OMPi.

  OMPi is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or
  (at your option) any later version.

  OMPi is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with OMPi; if not, write to the Free Software
  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/

/* parallel.c
 * Handle parallel regions within a CUDA kernel
 */

#include <stdio.h>
#include "globals.h"
#include "locks.h"
#include "parallel.h"
#include "worksharing.h"
#include "barrier.h"

/* Parallel control variables.
 * The storage used for the parallel-region control block(s) is selected at
 * compile time through PARALLEL_MEMTYPE.
 */
#if PARALLEL_MEMTYPE == MEMTYPE_SHARED
    /* A single control block declared in __shared__ memory */
    __shared__ ort_parallel_t block_parallel_cb;
#else
    /* An array of MAX_PARALLEL_BLOCKS control blocks declared in __device__
     * memory (presumably one per thread block -- TODO confirm against the
     * definition of __MYPARBLOCK).
     */
    __device__ ort_parallel_t block_parallel_cbs[MAX_PARALLEL_BLOCKS];
#endif

/*
 * Returns the number of threads executing the current region:
 *   - the stored team size (execthr) while a parallel region is active,
 *   - 1 when no region is active and shdata.inmasterworker is set,
 *   - __NTHR in every other case.
 */
__device__
int dev_get_parallel_active_threads(void)
{
	ort_parallel_t *parblock = __MYPARBLOCK;
	__CHECKPARBLOCK(parblock);

	/* Inside an active parallel region: report the recorded team size */
	if (parblock->activelevel > 0)
		return parblock->execthr;

	/* No active region while in master-worker mode: a single thread */
	if (parblock->activelevel == 0 && shdata.inmasterworker)
		return 1;

	return __NTHR;
}

/*
 * Records `num_threads' as the team size (execthr) in the parallel
 * control block of the calling thread.
 */
__device__
static void dev_set_parallel_active_threads(int num_threads)
{
	ort_parallel_t *parblock = __MYPARBLOCK;

	__CHECKPARBLOCK(parblock);
	parblock->execthr = num_threads;
}

/*
 * Stores `level' as the active parallel level in the parallel control
 * block of the calling thread.
 */
__device__
void dev_set_active_level(int level)
{
	ort_parallel_t *parblock = __MYPARBLOCK;

	__CHECKPARBLOCK(parblock);
	parblock->activelevel = level;
}


#if PARALLEL_SCHEME == SCHEME_IFMASTER
/*
 * Opens a parallel region in the if-master scheme. Thread 0 marks the
 * region as active and records the team size (the value of __NTHR when
 * `num_threads' is -1); all threads then meet at a block-wide barrier.
 */
__device__ 
static void enter_gpu_parallel_region(int num_threads)
{
	if (omp_get_thread_num() == 0)
	{
		int team_size;

		dev_set_active_level(1);

		if (num_threads == -1)
			team_size = __NTHR;
		else
			team_size = num_threads;
		dev_set_parallel_active_threads(team_size);
	}

	/* Nobody proceeds before the bookkeeping above has been done */
	__syncthreads();
}


/*
 * Closes a parallel region in the if-master scheme. It is called by every
 * thread once it is done with the region body; thread 0 puts the control
 * variables back to their serial values (active level 0, team size 1).
 * `num_threads' is not used.
 */
__device__ 
static void leave_gpu_parallel_region(int num_threads)
{
	/* Every thread must have left the region body before the control
	 * variables are reset; otherwise a thread that is still inside the
	 * region could observe the serial-state values through
	 * dev_get_parallel_active_threads().
	 */
	__syncthreads();

	if (omp_get_thread_num() == 0)
	{
		dev_set_active_level(0);
		dev_set_parallel_active_threads(1);
	}

	/* Nobody continues before the reset has been done */
	__syncthreads();
}
#endif


/* Combined parallel regions, entered by all threads.
 *
 * func:        the outlined body of the parallel region
 * shared:      the argument handed to `func'
 * num_threads: the requested team size, or -1 for the whole block; a
 *              request larger than the block is cut down to the block size
 *
 * If a parallel region is already active, or the caller is executing a
 * task, the body is simply executed by the calling thread. Otherwise
 * thread 0 sets up the region, and the threads with an id below the team
 * size execute the body between block-wide barriers.
 */
__device__
static void handle_combined_parallel(void *(*func)(void *), void *shared, int num_threads)
{
	thread_cb_t *cb = __MYCB;
	ort_parallel_t *p = &(__PARBLOCK);
	int thrid = __THRID, nthr = __NTHR;
	
	if (!cb) return;
	__CHECKPARBLOCK(p);
	
	/* Nested region: serialize it on the encountering thread */
	if (p->activelevel == 1)
	{
		cb->in_nested_parallel = true;
		(*func)(shared);
		cb->in_nested_parallel = false;
		return;
	}

	/* Region encountered from within a task: execute it in place */
	if (cb->intask)
	{
		(*func)(shared);
		return;
	}
	
	if (thrid == 0)
	{
		p->activelevel = 1;
		p->combined = true;

		/* Default to the whole block, and never advertise a team that is
		 * larger than the number of threads that exist in the block.
		 */
		if (num_threads == -1 || num_threads > nthr)
			num_threads = nthr;
			
		dev_set_parallel_active_threads(num_threads);
	}
	
	/* Wait until thread 0 has set up the region */
	dev_barrier_wait(BARRIER_SLOT_DEFAULT, nthr);
	
	if (thrid < dev_get_parallel_active_threads())
		(*func)(shared);

	/* Every thread must be done with the team-size test and with the
	 * region body before the active level is cleared; clearing it earlier
	 * would let slower threads see the region as already closed.
	 */
	dev_barrier_wait(BARRIER_SLOT_DEFAULT, nthr);
		
	if (thrid == 0)
		p->activelevel = 0;
		
	/* Nobody continues before the region has been closed */
	dev_barrier_wait(BARRIER_SLOT_DEFAULT, nthr);
}


/*
 * Puts the parallel control block of the calling thread in its initial
 * state: no active parallel region and no combined region.
 */
__device__ void dev_init_block_parallel_cb(void)
{
	ort_parallel_t *parblock = __MYPARBLOCK;

	__CHECKPARBLOCK(parblock);
	parblock->combined = false;
	parblock->activelevel = 0;
}


/**********************************************************************
 *                                                                    *
 *                       MASTER-WORKER SCHEME                         *
 *                                                                    *
 **********************************************************************
 *
 * In this scheme, we always create a fixed number of 128 threads.
 * The last warp [96..127] acts as the master warp. In this specific 
 * warp, only the warp master (threadID = 96, aka master thread) is 
 * active; the remaining threads stay completely inactive. The master 
 * thread assigns parallel regions to the workers (see below), but 
 * does NOT execute any of those regions itself. 
 * 
 * The remaining warps [0..31], [32..63], and [64..95] serve as worker 
 * warps. Threads in these warps (96 in total) are responsible for 
 * executing the parallel regions assigned by the master thread.
 * 
 * This scheme is activated in the case of non-combined #target regions
 * that may contain multiple #parallel regions. If no #parallel regions 
 * exist, this scheme is activated again, and only thread 0 participates,
 * by executing the code serially.
 * 
 * If the user uses a `num_threads(X)' clause, then X out of 96 threads 
 * are used, with the remaining ones staying inactive.
 */

/*
 * Called by the master thread when a target region ends. In the
 * master-worker scheme it clears the function and shared-variables
 * pointers of the control block and then joins a block-wide barrier;
 * a worker that finds a NULL function after its barrier leaves
 * _cuda_dev_worker_loop(). In any other scheme it does nothing.
 */
__device__
void _cuda_dev_exit_target_region(void)
{
#if PARALLEL_SCHEME == SCHEME_MASTERWORKER
	ort_parallel_t *parblock = __MYPARBLOCK;

	__CHECKPARBLOCK(parblock);
	parblock->shvars = NULL;
	parblock->func = NULL;   /* NULL tells the workers to stop */

	__syncthreads();
#endif
}


/* 
 * The master function of the master-worker scheme.
 *
 * func:        the outlined body of the parallel region
 * shared:      the argument handed to `func'
 * num_threads: the requested team size, or -1 for all the workers; a
 *              request larger than the number of workers
 *              (__NTHR - warpSize) is cut down to that number
 *
 * When no parallel region is active, the caller (the master thread)
 * publishes the region in the control block, releases the workers with a
 * first barrier and waits for them at a second one; it does not execute
 * the body itself. When a region is already active (a nested parallel
 * region), the body is executed serially by the calling thread.
 */
__device__
static void execute_master(void *(*func)(void *), void *shared, int num_threads)
{
	thread_cb_t *cb = __MYCB;
	ort_parallel_t *p = __MYPARBLOCK;
	
	if (!cb) return;
	__CHECKPARBLOCK(p);
	
	/* Nested region: serialize it on the encountering thread */
	if (p->activelevel == 1)
	{
		cb->in_nested_parallel = true;
		(*func)(shared);
		cb->in_nested_parallel = false;
		return;
	}
	
	/* (1) Bookkeeping for the encountered parallel region */
	int nworkers = __NTHR - warpSize;

	/* Default to all the workers, and never advertise a team that is
	 * larger than the number of workers that exist.
	 */
	if (num_threads == -1 || num_threads > nworkers)
		num_threads = nworkers;

	dev_set_active_level(1);
	dev_set_parallel_active_threads(num_threads);
	p->shvars = shared;
	p->func = func;

	/* (2) Sync #1: activate the others */
	__syncthreads(); 

	/* (3) Sync #2: we are about to leave the parallel region */
	__syncthreads();
	dev_set_parallel_active_threads(1);
	dev_set_active_level(0); /* Back to serial execution */
}


/* 
 * The worker function of the master-worker scheme.
 *
 * thrid: the id of the calling worker thread
 *
 * The worker repeatedly waits at a barrier, executes the published
 * function if its id is below the current number of active threads, and
 * waits at a second barrier. It returns as soon as it finds a NULL
 * function after the first barrier. In any other scheme the function
 * does nothing.
 */
__device__
void _cuda_dev_worker_loop(int thrid)
{
#if PARALLEL_SCHEME == SCHEME_MASTERWORKER
	ort_parallel_t *parblock = __MYPARBLOCK;
	__CHECKPARBLOCK(parblock);

	/* One iteration per serial/parallel region of the target region */
	for (;;)
	{
		/* Sync #1: sleep until the master thread publishes some work */
		__syncthreads();

		/* A NULL function marks the end of the target region */
		if (parblock->func == NULL)
			return;

		/* Only the threads that belong to the team execute the region */
		if (thrid < dev_get_parallel_active_threads())
			parblock->func(parblock->shvars);

		/* Sync #2: everybody is done with the region */
		__syncthreads();
	}
#endif
}


/**********************************************
 *                                            *
 *        END OF MASTER-WORKER SCHEME         *
 *                                            *
 **********************************************/

/*
 * Entry point for the execution of a parallel region.
 *
 * func:        the outlined body of the parallel region
 * shared:      the argument handed to `func'
 * num_threads: the requested team size, or -1 if none was requested
 * iscombined:  non-zero for a combined parallel region
 * bind_req:    not used here
 *
 * Master-worker scheme: combined regions are passed to
 * handle_combined_parallel(), all the others to execute_master().
 * Otherwise: the region is opened, the body is executed by the threads
 * with an id below `num_threads' (by everyone if it is -1), and the
 * region is closed.
 */
__device__ 
void _ort_execute_parallel(void *(*func)(void *), void *shared, int num_threads,
                           int iscombined, int bind_req)
{
#if PARALLEL_SCHEME == SCHEME_MASTERWORKER
	if (!iscombined)
	{
		execute_master(func, shared, num_threads);
		return;
	}

	handle_combined_parallel(func, shared, num_threads);
#else
	enter_gpu_parallel_region(num_threads);

	{
		int myid = omp_get_thread_num();

		if (myid < num_threads || num_threads == -1)
			func(shared);
	}

	leave_gpu_parallel_region(num_threads);
#endif
}

/* Executes `func(shared)' serially.
 *
 * If-master scheme:
 * 		Called by all threads; only the thread with id 0 executes
 *   	`func', and everyone meets at a barrier afterwards.
 * Master/worker scheme:
 * 		Called only by the master thread, which publishes `func' and
 * 		`shared' in the control block and then joins two barriers; the
 * 		function itself is executed from within _cuda_dev_worker_loop().
 */
__device__ 
void _ort_execute_serial(void *(*func)(void *), void *shared)
{
	ort_parallel_t *parblock = __MYPARBLOCK;
	__CHECKPARBLOCK(parblock);

#if PARALLEL_SCHEME == SCHEME_MASTERWORKER
	/* Publish the work */
	parblock->func = func;
	parblock->shvars = shared;

	/* Sync #1: release the waiting threads */
	__syncthreads();

	/* Sync #2: the serial region has been completed */
	__syncthreads();
#else
	if (omp_get_thread_num() == 0)
	{
		func(shared);
	}

	__syncthreads();
#endif
}
