/*
  OMPi OpenMP Compiler
  == Copyright since 2001 the OMPi Team
  == Dept. of Computer Science & Engineering, University of Ioannina

  This file is part of OMPi.

  OMPi is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or
  (at your option) any later version.

  OMPi is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with OMPi; if not, write to the Free Software
  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/

/* parallel.c
 * Handle parallel regions within a CUDA kernel
 */

#include <stdio.h>
#include "globals.h"
#include "locks.h"
#include "parallel.h"
#include "worksharing.h"
#include "barrier.h"
#include "tasks.h"

/* Parallel control variables.
 *
 * The storage that holds the parallel-region bookkeeping (ort_parallel_t)
 * is selected at compile time through PARALLEL_MEMTYPE:
 *   - MEMTYPE_SHARED: a single `parblock', declared with __SHAREDQLFR;
 *   - otherwise:      an array `parblocks' of MAX_PARALLEL_BLOCKS entries,
 *                     declared with __DEVQLFR.
 * The functions in this file never name these variables directly; they go
 * through the __MYPARBLOCK / __PARBLOCK macros.
 *
 * NOTE(review): presumably the shared variant lives in per-block shared
 * memory and the array variant is indexed per CUDA block -- confirm against
 * the macro definitions.
 */
#if PARALLEL_MEMTYPE == MEMTYPE_SHARED
    __SHAREDQLFR ort_parallel_t parblock;
#else
    __DEVQLFR ort_parallel_t parblocks[MAX_PARALLEL_BLOCKS];
#endif

/* Read the `execthr' field of the parallel control block selected by
 * __MYPARBLOCK, i.e. the thread count recorded for the current region.
 */
__DEVQLFR
int get_parallel_active_threads(void)
{
	ort_parallel_t *parblk = __MYPARBLOCK;
	int active;

	__CHECKPARBLOCK(parblk);

	active = parblk->execthr;
	return active;
}

/* Store `num_threads' into the `execthr' field of the parallel control
 * block selected by __MYPARBLOCK. No synchronization is performed here;
 * callers are responsible for ordering this store with respect to readers.
 */
__DEVQLFR
void set_parallel_active_threads(int num_threads)
{
	ort_parallel_t *parblk = __MYPARBLOCK;

	__CHECKPARBLOCK(parblk);

	parblk->execthr = num_threads;
}

#if PARALLEL_SCHEME == SCHEME_IFMASTER
/* If-master scheme: open a parallel region.
 *
 * Thread 0 marks the region as active and records how many threads execute
 * it; a value of -1 for `num_threads' means "use omp_get_num_threads()".
 * Every caller then meets at a __syncthreads() so that the recorded state
 * is in place before anybody continues.
 */
__DEVQLFR 
static void enter_gpu_parallel_region(int num_threads)
{
	ort_parallel_t *region = __MYPARBLOCK;

	__CHECKPARBLOCK(region);

	if (omp_get_thread_num() == 0)
	{
		/* The level is raised before the default thread count is queried */
		region->activelevel = 1;

		if (num_threads == -1)
			region->execthr = omp_get_num_threads();
		else
			region->execthr = num_threads;
	}

	__syncthreads();
}

/* If-master scheme: close a parallel region.
 *
 * Must be called by the same threads that called
 * enter_gpu_parallel_region(), since it contains block-wide barriers.
 *
 * num_threads: unused; kept so the signature mirrors
 *              enter_gpu_parallel_region().
 */
__DEVQLFR
static void leave_gpu_parallel_region(int num_threads)
{
	ort_parallel_t *p = __MYPARBLOCK;
	__CHECKPARBLOCK(p);

	/* Wait until every thread has finished the region body. Without this
	 * barrier thread 0 could reset the bookkeeping below while other
	 * threads are still executing the region and reading it (the
	 * master-worker path likewise synchronizes before resetting).
	 */
	__syncthreads();

	if (omp_get_thread_num() == 0)
	{
		p->execthr = 1;
		p->activelevel = 0;
	}

	/* Make the reset state visible before anybody proceeds */
	__syncthreads();
}
#endif

/* Execute a combined parallel region.
 *
 * func:        the outlined region body.
 * shared:      argument handed to `func'.
 * num_threads: number of threads that should run `func'; -1 selects all
 *              threads reported by omp_get_num_threads().
 *
 * A region encountered while another one is active (activelevel == 1) or
 * from inside a task is simply executed by the encountering thread alone.
 * Otherwise thread 0 publishes the region state and the threads whose id is
 * below the active thread count execute `func'.
 */
__DEVQLFR 
static void combined_parallel_exec(void *(*func)(void *), void *shared, int num_threads)
{
	thread_cb_t *cb = __MYCB;
	ort_parallel_t *p = __MYPARBLOCK;
	int myid = omp_get_thread_num(), nthr = omp_get_num_threads();
	
	__CHECKPARBLOCK(p);
	if (!cb) return;
	
	/* NOTE(review): this read is not separated by a barrier from thread 0's
	 * write of activelevel below; confirm callers guarantee that all threads
	 * have passed this test before thread 0 raises the level.
	 */
	if (p->activelevel == 1)
	{
		/* Nested region: serialized on the encountering thread */
		__MYCB->in_nested_parallel = true;
		func(shared);
		__MYCB->in_nested_parallel = false;
		return;
	}
	else if (__MYCB->intask)
	{
		/* Region inside a task: serialized as well */
		func(shared);
		return;
	}
	
	/* Combined parallel regions do not rely on master/worker
	codegen, thus every thread will enter the function */
	if (myid == 0)
	{
		p->activelevel = 1;
		
		if (num_threads == -1)
			num_threads = nthr; /* whole block */
			
		set_parallel_active_threads(num_threads);
	}
	
	/* Publish the region state before anybody reads it */
	cudadev_namedbar_sync(0, nthr);
	
	if (myid < get_parallel_active_threads())
		func(shared);
		
	/* Wait for every thread to leave the region body *before* the level is
	 * reset; otherwise a thread still inside `func' that encounters a nested
	 * parallel region could observe activelevel == 0 and take the
	 * top-level path (with its block-wide barriers) by mistake.
	 */
	cudadev_namedbar_sync(0, nthr);
	
	if (myid == 0)
		p->activelevel = 0;
		
	/* Make the reset visible before the threads move on */
	cudadev_namedbar_sync(0, nthr);
}

/**********************************************
 *                                            *
 *            MASTER-WORKER SCHEME            *
 *                                            *
 **********************************************/

/* 
 * Master side of the master-worker scheme.
 *
 * Records the encountered parallel region (function, shared data, thread
 * count) in the parallel control block, releases the waiting threads with a
 * first barrier and waits for them on a second one, after which the
 * bookkeeping is reset. Regions met while one is already active, or from
 * within a task, are run directly by the calling thread.
 *
 * A `num_threads' of -1 defaults to omp_get_num_threads() - warpSize
 * (presumably everything but the master's warp -- confirm).
 */
__DEVQLFR 
static void cudadev_master_func(void *(*func)(void *), void *shared, int num_threads)
{
	thread_cb_t *mycb = __MYCB;
	ort_parallel_t *parblk = __MYPARBLOCK;

	if (!mycb)
		return;
	
	__CHECKPARBLOCK(parblk);
	
	/* Nested region: run it serially, flagged as nested */
	if (parblk->activelevel == 1)
	{
		__MYCB->in_nested_parallel = true;
		func(shared);
		__MYCB->in_nested_parallel = false;
		return;
	}

	/* Region encountered inside a task: run it serially */
	if (__MYCB->intask)
	{
		func(shared);
		return;
	}
	
	/* Step 1: describe the region for the workers */
	parblk->activelevel = 1;
	if (num_threads == -1)
		num_threads = omp_get_num_threads() - warpSize;

	set_parallel_active_threads(num_threads);
	parblk->shvars = shared;
	parblk->func = func;

	/* Step 2: first barrier -- wakes up the waiting threads */
	cudadev_namedbar_sync(0, omp_get_num_threads());

	/* Step 3: second barrier -- the region has been completed */
	cudadev_namedbar_sync(0, omp_get_num_threads());

	/* Step 4: back to the single-threaded state */
	set_parallel_active_threads(1);
	parblk->activelevel = 0;
}

/* 
 * The worker function of the master-worker scheme.
 *
 * thrid: id of the calling thread; it executes a published region only when
 *        thrid is below get_parallel_active_threads().
 *
 * The body exists only when PARALLEL_SCHEME == SCHEME_MASTERWORKER; in any
 * other configuration the function is empty. The worker loops over pairs of
 * barriers: the first one is where it waits for work to be published in the
 * parallel control block (p->func / p->shvars), the second one marks the end
 * of that piece of work. A NULL p->func after the first barrier terminates
 * the loop.
 */
__DEVQLFR
void cudadev_worker_func(int thrid)
{
#if PARALLEL_SCHEME == SCHEME_MASTERWORKER
	ort_parallel_t *p = __MYPARBLOCK;
	__CHECKPARBLOCK(p);

	while (true)
	{
		/* Wait until the master thread activates me for a
		 * serial/parallel region (sync #1)
		 */
		cudadev_namedbar_sync(0, omp_get_num_threads());

		/* p->func is read only after sync #1 */
		if (p->func == NULL) break; /* end of a target region */
		if (thrid < get_parallel_active_threads())
		{
			/* This thread takes part in the region: run its body */
			p->func(p->shvars);
		}

		/* Wait until everyone leaves the parallel region (sync #2).
		 * Threads that skipped the body still take part in this barrier.
		 */
		cudadev_namedbar_sync(0, omp_get_num_threads());
	}
#endif
}

/**********************************************
 *                                            *
 *        END OF MASTER-WORKER SCHEME         *
 *                                            *
 **********************************************/


/* Runtime entry point for a parallel region.
 *
 * func:        the outlined region body.
 * shared:      argument passed to `func'.
 * num_threads: requested number of threads, or -1 for the default.
 * iscombined:  non-zero for a combined parallel construct.
 * bind_req:    not used by this implementation.
 *
 * Master-worker scheme: combined regions, and regions met while another one
 * is active, go through combined_parallel_exec(); all other regions go
 * through cudadev_master_func().
 * Other schemes: the region is bracketed by enter/leave_gpu_parallel_region()
 * and `func' is run by the threads whose id is below `num_threads' (or by
 * all callers when num_threads is -1).
 */
__DEVQLFR 
void ort_execute_parallel(void *(*func)(void *), void *shared, int num_threads,
	                      int iscombined, int bind_req)
{
#if PARALLEL_SCHEME == SCHEME_MASTERWORKER
	if (!(iscombined) && (__PARBLOCK.activelevel != 1))
		cudadev_master_func(func, shared, num_threads);
	else
		combined_parallel_exec(func, shared, num_threads);
#else
	int participates;

	enter_gpu_parallel_region(num_threads);

	participates = (omp_get_thread_num() < num_threads) || (num_threads == -1);
	if (participates)
	{
		func(shared);
	}

	leave_gpu_parallel_region(num_threads);
#endif
}

/* Execute `func(shared)' as a serial region.
 *
 * If-master scheme:
 * 		every thread calls ort_execute_serial, but `func' is run by
 *   	thread 0 only; all callers then meet at a __syncthreads().
 * Master/worker scheme:
 * 		only the master thread calls ort_execute_serial. It publishes
 * 		`func' and `shared' in the parallel control block and passes
 * 		through two barriers: one to start the region, one to finish it.
 */
__DEVQLFR 
void ort_execute_serial(void *(*func)(void *), void *shared)
{
	ort_parallel_t *parblk = __MYPARBLOCK;

	__CHECKPARBLOCK(parblk);
	
#if PARALLEL_SCHEME == SCHEME_MASTERWORKER
	/* Describe the work to be done */
	parblk->shvars = shared;
	parblk->func = func;

	/* First barrier: hand the work over to the waiting thread */
	cudadev_namedbar_sync(0, omp_get_num_threads());

	/* Second barrier: the serial region is over */
	cudadev_namedbar_sync(0, omp_get_num_threads());
#else
	if (omp_get_thread_num() == 0)
	{
		func(shared);
	}

	__syncthreads();
#endif
}
