/*
  OMPi OpenMP Compiler
  == Copyright since 2001 the OMPi Team
  == Dept. of Computer Science & Engineering, University of Ioannina

  This file is part of OMPi.

  OMPi is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or
  (at your option) any later version.

  OMPi is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with OMPi; if not, write to the Free Software
  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/

#include "globals.h"
#include "worksharing.h"
#include "barrier.h"
#include "parallel.h"
#include <stdarg.h>
#include <stdlib.h>
#include <stdio.h>

// #define DEBUG

/*
 * Statically splits the niters iterations of a distribute loop among the
 * blocks of the grid and reports the share of the calling block.
 *
 *   niters : total number of iterations
 *   fiter  : (out) first iteration assigned to the caller
 *   liter  : (out) one past the last iteration assigned to the caller
 *
 * The number of participants is gridDim.x*gridDim.y*gridDim.z and the
 * caller's position is taken from __BLOCKID (presumably the linearized
 * block index -- confirm against its definition).
 *
 * When niters is not a multiple of the number of participants, the first
 * (niters mod N) participants get one extra iteration each.
 *
 * Returns non-zero if the assigned range is non-empty, 0 otherwise.
 *
 * All chunk arithmetic is carried out in unsigned long; using int here
 * truncated the quotient and overflowed in myid*chunksize for very large
 * iteration counts.
 */
__DEVQLFR 
int ort_get_distribute_chunk(unsigned long niters, 
	                         unsigned long *fiter, unsigned long *liter) {

	dist_div_t    dchunksize;
	unsigned long chunksize;      /* iterations in a (big) chunk */
	unsigned long nbig;           /* # participants that get a big chunk */
	int N = (gridDim.x*gridDim.y*gridDim.z);
	int myid = __BLOCKID;
	unsigned long me = (unsigned long) myid;
	
#if defined(DEBUG)
	printf("NITERS [%lu]\n", niters);
#endif
	
	if (N == 1)         /* single participant: gets everything */
	{
		*fiter = 0;
		*liter = niters;
#if defined(DEBUG)
		printf("my id is %d of %d, [%lu, %lu, %lu]\n", myid, N-1, niters, *fiter, *liter);
#endif

		return (*fiter != *liter);
	}
	
	if (niters <= (unsigned long) N)    /* no more iterations than participants */
	{
		/* At most one iteration each; ids >= niters get an empty range */
		*fiter = me;
		*liter = (me < niters) ? me + 1 : me;
#if defined(DEBUG)
		printf("my id is %d of %d, [%lu, %lu, %lu]\n", myid, N-1, niters, *fiter, *liter);
#endif
		return (*fiter != *liter);
	}

	dchunksize = dist_div(niters, N);
	chunksize = dchunksize.quot;
	nbig = dchunksize.rem;
	if (nbig) chunksize++;       /* first nbig participants get this chunksize */

	if (me < nbig || nbig == 0)             /* I get a full chunk */
	{
		*fiter = me * chunksize;
		*liter = *fiter + chunksize;
	}
	else                                    /* I get a smaller chunk */
	{
		/* Skip the nbig full chunks, then the smaller ones before mine */
		*fiter = nbig * chunksize + (me - nbig) * (chunksize - 1);
		*liter = *fiter + (chunksize - 1);
	}
#if defined(DEBUG)
	printf("(2nd) my id is %d of %d, [%lu, %lu, %lu]\n", myid, N-1, nbig, *fiter, *liter);
#endif
	return (*fiter != *liter);
}

/*
 * Hands out the next chunk of a dynamically scheduled loop.
 *
 *   niters    : total number of iterations
 *   chunksize : iterations per chunk; 0 makes the call return 0 at once
 *   monotonic : not used here
 *   fiter     : (out) first iteration of the chunk
 *   liter     : (out) one past the last iteration of the chunk
 *   ignored   : not used here
 *
 * With a single thread (omp_get_num_threads() == 1) the whole range
 * [0, niters) is returned. Otherwise the shared counter
 * workshare.forloopiter is advanced by chunksize under
 * workshare.forlooplock and the chunk is clipped to niters.
 *
 * Returns 1 if a chunk was assigned. Returns 0 when there is nothing left;
 * in that case *fiter is set to niters + 1 (except for chunksize == 0,
 * where the outputs are left untouched).
 */
__DEVQLFR 
int ort_get_dynamic_chunk(unsigned long niters, unsigned long chunksize,
	                      int monotonic, unsigned long *fiter, unsigned long *liter,
	                      int *ignored)
{
	int nactive, nthrtosync;

	if (chunksize == 0) return 0;

	if (omp_get_num_threads() == 1)
	{
		*fiter = 0; /* Get just 1 chunk: all iterations */
		*liter = niters;
		return (1);
	}

	/* Cheap unlocked test to avoid taking the lock when the loop is over */
	if (workshare.forloopiter >= niters) { *fiter = niters + 1; return (0); } /* done */

	/* Number of warps spanned by the active threads, rounded up. The
	 * division must round up *before* truncation: ceilf() applied to an
	 * integer quotient never rounds and gives 0 for < warpSize threads.
	 * NOTE(review): assumes the 3rd lock argument is a warp count -- confirm
	 * against the CUDA_SET_LOCK_PARTSYNC definition.
	 */
	nactive = get_parallel_active_threads();
	nthrtosync = (nactive + warpSize - 1) / warpSize;

	CUDA_SET_LOCK_PARTSYNC(workshare.forlooplock.mutex, 1, nthrtosync);
	*fiter = workshare.forloopiter;
	workshare.forloopiter += chunksize;
	CUDA_UNSET_LOCK_PARTSYNC(workshare.forlooplock.mutex);

	/* Others may have consumed the remaining iterations between the unlocked
	 * test and the lock; report "done" instead of an empty/inverted range.
	 */
	if (*fiter >= niters) { *fiter = niters + 1; return (0); }

	*liter = *fiter + chunksize;
	if (*liter > niters)
		*liter = niters;
	return (1);
}

/*
 * Hands out the next chunk of a guided-scheduled loop.
 *
 *   niters    : total number of iterations
 *   chunksize : minimum chunk size; 0 makes the call return 0 at once
 *   monotonic : not used here
 *   fiter     : (out) first iteration of the chunk
 *   liter     : (out) one past the last iteration of the chunk
 *   ignored   : not used here
 *
 * With a single thread the whole range [0, niters) is returned. Otherwise,
 * under workshare.forlooplock, the chunk is the number of remaining
 * iterations divided (rounding up) by omp_get_num_threads(), but never
 * less than chunksize; if no more than chunksize iterations remain, all of
 * them are taken. The shared counter workshare.forloopiter is advanced
 * accordingly.
 *
 * Returns non-zero if a non-empty chunk was assigned, 0 otherwise.
 *
 * Sizes are kept in unsigned long: an int chunk truncated large remaining
 * counts and mixed signed/unsigned comparisons.
 */
__DEVQLFR 
int ort_get_guided_chunk(unsigned long niters, unsigned long chunksize,
	                     int monotonic, unsigned long *fiter, unsigned long *liter,
	                     int *ignored)
{
	unsigned long ch, nthr;
	int nactive, nthrtosync;

	if (chunksize == 0) return 0;

	if (omp_get_num_threads() == 1)
	{
		*fiter = 0;              /* Get just 1 chunk: all iterations */
		*liter = niters;
		return (1);
	}

	/* Cheap unlocked test to avoid taking the lock when the loop is over */
	if (workshare.forloopiter >= niters) { *fiter = niters + 1; return (0); } /* done */
	
	/* Number of warps spanned by the active threads, rounded up. The
	 * division must round up *before* truncation: ceilf() applied to an
	 * integer quotient never rounds and gives 0 for < warpSize threads.
	 * NOTE(review): assumes the 3rd lock argument is a warp count -- confirm
	 * against the CUDA_SET_LOCK_PARTSYNC definition.
	 */
	nactive = get_parallel_active_threads();
	nthrtosync = (nactive + warpSize - 1) / warpSize;

	CUDA_SET_LOCK_PARTSYNC(workshare.forlooplock.mutex, 1, nthrtosync);
	*fiter = workshare.forloopiter;
	/* Remaining iterations; guarded so the unsigned subtraction cannot wrap */
	ch = (*fiter < niters) ? (niters - *fiter) : 0;
	if (ch > chunksize)
	{
		nthr = (unsigned long) omp_get_num_threads();
		ch = (ch + nthr - 1) / nthr;      /* ceil(remaining / nthr) */
		if (ch < chunksize)
			ch = chunksize;
	}
	workshare.forloopiter += ch;
	CUDA_UNSET_LOCK_PARTSYNC(workshare.forlooplock.mutex);

	*liter = *fiter + ch;
	return (ch != 0);
}

/*
 * Statically splits the niters iterations of a loop among the threads of
 * the team (static schedule without a chunk size) and reports the share of
 * the calling thread.
 *
 *   niters : total number of iterations
 *   fiter  : (out) first iteration assigned to the caller
 *   liter  : (out) one past the last iteration assigned to the caller
 *
 * The number of participants N is get_parallel_active_threads(), or __NTHR
 * when that is 0; the caller's position is taken from __THRID.
 *
 * When niters is not a multiple of N, the first (niters mod N) threads get
 * one extra iteration each.
 *
 * Returns non-zero if the assigned range is non-empty, 0 otherwise.
 *
 * All chunk arithmetic is carried out in unsigned long; using int here
 * truncated the quotient and overflowed in myid*chunksize for very large
 * iteration counts.
 */
__DEVQLFR 
int ort_get_static_default_chunk(unsigned long niters, 
	                             unsigned long *fiter, unsigned long *liter) {
	dist_div_t    dchunksize;
	unsigned long chunksize;      /* iterations in a (big) chunk */
	unsigned long nbig;           /* # threads that get a big chunk */
	unsigned long me;
	int N, myid = __THRID;
	
	if (!(N = get_parallel_active_threads()))
		N = __NTHR;
	me = (unsigned long) myid;
		
#if defined(DEBUG)
	printf("STATIC - NITERS [%lu]\n", niters);
#endif
	
	if (N == 1)         /* single participant: gets everything */
	{
		*fiter = 0;
		*liter = niters;
#if defined(DEBUG)
		printf("STATIC - my id is %d - N is %d - my loop [%lu, %lu, %lu]\n", myid, N, niters, *fiter, *liter);
#endif
		return (*fiter != *liter);
	}
	
	if (niters <= (unsigned long) N)    /* no more iterations than threads */
	{
		/* At most one iteration each; ids >= niters get an empty range */
		*fiter = me;
		*liter = (me < niters) ? me + 1 : me;
#if defined(DEBUG)
		printf("STATIC - my id is %d - N is %d - my loop [%lu, %lu, %lu]\n", myid, N, niters, *fiter, *liter);
#endif
		return (*fiter != *liter);
	}

	dchunksize = dist_div(niters, N);
	chunksize = dchunksize.quot;
	nbig = dchunksize.rem;
	if (nbig) chunksize++;       /* first nbig threads get this chunksize */

	if (me < nbig || nbig == 0)             /* I get a full chunk */
	{
		*fiter = me * chunksize;
		*liter = *fiter + chunksize;
	}
	else                                    /* I get a smaller chunk */
	{
		/* Skip the nbig full chunks, then the smaller ones before mine */
		*fiter = nbig * chunksize + (me - nbig) * (chunksize - 1);
		*liter = *fiter + (chunksize - 1);
	}
	
#if defined(DEBUG)
	printf("STATIC - my id is %d - N is %d - my loop [%lu, %lu, %lu]\n", myid, N, nbig, *fiter, *liter);
#endif
	return (*fiter != *liter);
}

/* 
 * Static schedule whose chunk size is only known at run time. Meant to be
 * called repeatedly; each call yields the caller's next chunk.
 *
 *   niters    : total number of iterations
 *   chunksize : iterations per chunk, or 0 if no chunk size was given
 *   monotonic : not used here
 *   fiter     : (out) first iteration of the chunk
 *   liter     : (out) one past the last iteration of the chunk
 *   chunkid   : (in/out) per-thread state kept between calls
 *
 * *chunkid MUST be negative before the first call: a negative value is
 * what marks "no chunk taken yet" (with one thread, any value >= 0 is
 * treated as "already done").
 *
 * - One thread: the first call returns [0, niters), later calls return 0.
 * - chunksize == 0: the first call delegates to
 *   ort_get_static_default_chunk(), later calls return 0.
 * - Otherwise chunks are dealt round-robin: the thread with id myid gets
 *   chunks myid, myid + nthr, myid + 2*nthr, ...
 *
 * Returns non-zero if a chunk was assigned, 0 when the caller has no more
 * iterations.
 */
__DEVQLFR
int ort_get_runtimestatic_chunk(unsigned long niters, unsigned long chunksize, int monotonic,
                                unsigned long *fiter, unsigned long *liter, int *chunkid)
{
	int nthr, myid = __THRID;
	
	if (!(nthr = get_parallel_active_threads()))
		nthr = __NTHR;
	
	if (nthr == 1)
	{
		if (*chunkid >= 0) { *fiter = niters + 1; return (0); } /* Only 1 chunk */
		*chunkid = 1;
		*fiter = 0;                    /* Get just 1 chunk: all iterations */
		*liter = niters;
		return (1);
	}

	if (chunksize == 0)  /* No chunksize given */
	{
		if (*chunkid == 1) { *fiter = niters + 1; return (0); } /* Only 1 chunk */
		*chunkid = 1;
		return ( ort_get_static_default_chunk(niters, fiter, liter) );
	}

	/* chunksize given (non-zero from here on) */
	if (*chunkid < 0)    /* my very first chunk */
		*chunkid = myid;
	else
		(*chunkid) += nthr;  /* my next chunk in the round-robin order */

	*fiter = chunksize * (*chunkid);
	if (*fiter >= niters)
		return (0);
	*liter = *fiter + chunksize;
	if (*liter > niters)
		*liter = niters;     /* last chunk may be shorter */
	return (1);
}

#if defined(DEBUG)
#undef DEBUG
#endif
