/*
  OMPi OpenMP Compiler
  == Copyright since 2001 the OMPi Team
  == Dept. of Computer Science & Engineering, University of Ioannina

  This file is part of OMPi.

  OMPi is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or
  (at your option) any later version.

  OMPi is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with OMPi; if not, write to the Free Software
  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/

/* oclgpudev.c -- thin layer above OpenCL utils to handle gpu devices */

/* Define for debugging */
//#define DBGPRN_FORCE
//#define ORT_DEBUG_LEVEL 1

#include <unistd.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "oclgpudev.h"
#include "rt_common.h"

#if ORT_DEBUG_LEVEL > 0
	#include <sys/time.h>
	/* Seconds elapsed (as a floating point value) between two struct timeval
	 * samples: t1 is the earlier one, t2 the later one.
	 */
	#define SECPASSED(t2,t1) \
            (((t2).tv_sec - (t1).tv_sec) + ((t2).tv_usec - (t1).tv_usec)*1.0e-6)
#endif

/* Some hard-coded values to make the code work on older hardware;
 * taken from the Khronos OpenCL 3.0 header.
 */
/* 2.0 */
#ifndef CL_DEVICE_SVM_CAPABILITIES
	#define CL_DEVICE_SVM_CAPABILITIES 0x1053
	typedef cl_bitfield cl_device_svm_capabilities;
	#define CL_DEVICE_SVM_COARSE_GRAIN_BUFFER (1 << 0)
	#define CL_DEVICE_SVM_FINE_GRAIN_BUFFER   (1 << 1)
	#define CL_DEVICE_SVM_FINE_GRAIN_SYSTEM   (1 << 2)
	#define CL_DEVICE_SVM_ATOMICS             (1 << 3)
#endif
/* 3.0 */
#ifndef CL_DEVICE_WORK_GROUP_COLLECTIVE_FUNCTIONS_SUPPORT
	#define CL_DEVICE_WORK_GROUP_COLLECTIVE_FUNCTIONS_SUPPORT 0x1068
#endif
#ifndef CL_DEVICE_GENERIC_ADDRESS_SPACE_SUPPORT
	#define CL_DEVICE_GENERIC_ADDRESS_SPACE_SUPPORT 0x1069
#endif


/* Number of GPUs; if not already positive, oclgd_init() fills it in through
 * ocl_discover_num_gpus(). Reset to 0 by oclgd_finish().
 */
int ocl_num_gpus;
/* Table of ocl_num_gpus device descriptors; allocated (zero-filled) by
 * oclgd_init() and released by oclgd_finish().
 */
static ocl_gpu_t *ocl_gpus;
/* Lock taken while a program is added to a device's program cache;
 * initialized by oclgd_init() only when it is called with full == true.
 */
static void *_oclgpu_lock;


/* Returns 1 if the host stores the most significant byte of a 32-bit word
 * at the lowest address (big endian), 0 otherwise.
 */
static int is_big_endian(void)
{
	const uint32_t probe = 0x01000000;
	const unsigned char *first_byte = (const unsigned char *) &probe;

	return *first_byte;
}


/* Returns 1 if host pointers are 8 bytes wide, 0 otherwise. */
static int is_64bit(void)
{
	if (sizeof(void *) != 8)
		return 0;
	return 1;
}


/* What to do when the requested number of threads exceeds the device limit;
 * oclgd_init() may override the default through the
 * OMPI_MODULE_OPENCL_XLIMIT environment variable.
 */
enum xlb_e ocl_xceed_limits_behavior = XLB_WARN;
/* Set to true by oclgd_init() when the OMPI_MODULE_OPENCL_ZEROCOPY
 * environment variable is "yes" or "true".
 */
bool ocl_use_host_ptr = false;


/* Module start-up: discovers the GPUs (unless ocl_num_gpus is already
 * positive), allocates the zero-filled device table and reads the module's
 * environment variables. Does nothing if the device table already exists.
 *
 * If full is false the initialization is simply for getting info about the
 * devices; otherwise we are on full hm_initialize() and the module lock is
 * initialized as well.
 *
 * NOTE(review): because of the early return, a full initialization that
 * follows an info-only one (without an oclgd_finish() in between) never
 * initializes the lock -- confirm that callers cannot do this.
 */
void oclgd_init(bool full)
{
	char *value;

	if (ocl_gpus != NULL)
		return;

	if (ocl_num_gpus <= 0)
		ocl_num_gpus = ocl_discover_num_gpus(-1);
	ocl_gpus = (ocl_gpu_t *) calloc(ocl_num_gpus, sizeof(ocl_gpu_t));
	if (full)
		init_lock(&_oclgpu_lock, ORT_LOCK_SPIN);

	/* TODO: we need a mechanism to inform the user about these environment
	 * variables and their possible values.
	 * E.g. we could have a function call like:
	 *   env_var(name, space, possibleValues, defaultValue, description);
	 * where space = MODULE | EELIB | RUNTIME
	 *       possibleValues is a string with the possible values separated by '|'
	 *       description could be a larger string that explains the variable
	 *       and the values and could be shown only if the user asks for
	 *       detailed information.
	 * env_var() would be the central bookkeeper of all OMPi's environment
	 *       variable descriptions to be displayed to the user.
	 */

	/* Behavior when limits are exceeded; unknown values are ignored */
	value = getenv("OMPI_MODULE_OPENCL_XLIMIT");
	if (value != NULL)
	{
		if (!strcmp(value, "warn"))
			ocl_xceed_limits_behavior = XLB_WARN;
		else if (!strcmp(value, "nowarn"))
			ocl_xceed_limits_behavior = XLB_NOWARN;
		else if (!strcmp(value, "exit"))
			ocl_xceed_limits_behavior = XLB_EXIT;
		else if (!strcmp(value, "exitsilent"))
			ocl_xceed_limits_behavior = XLB_EXITMUTE;
	}

	/* Zero-copy is only ever switched on here, never off */
	value = getenv("OMPI_MODULE_OPENCL_ZEROCOPY");
	if (value != NULL && (!strcmp(value, "yes") || !strcmp(value, "true")))
		ocl_use_host_ptr = true;
}


void oclgd_finish()
{
	int i;
	
	if (!ocl_gpus) return;
	for (i = 0; i < ocl_num_gpus; i++)
	{
		free(ocl_gpus[i].device_name);
		free(ocl_gpus[i].device_vendor);
		free(ocl_gpus[i].device_driver_version);
	}
	free(ocl_gpus);
	ocl_gpus = NULL;
	ocl_num_gpus = 0;
}


/* Returns the descriptor of the device with the given id, or NULL if the
 * device table has not been allocated or dev_id is outside
 * [0, ocl_num_gpus-1].
 */
ocl_gpu_t *oclgd_get(int dev_id)
{
	if (!ocl_gpus)
		return NULL;
	/* Valid ids end at ocl_num_gpus-1; the table has ocl_num_gpus entries */
	if (dev_id < 0 || dev_id >= ocl_num_gpus)
		return NULL;
	return &(ocl_gpus[dev_id]);
}


/* Check if a particular OpenCL C major version is among the versions
 * recorded for the device (dev->cvers[0 .. num_cvers-1]).
 *   majver > 0 : returns 1 if some recorded version has that major number,
 *                0 otherwise.
 *   majver <= 0: returns the maximum recorded major version (0 if none);
 *                the lookup value is then 0, which is not expected to match
 *                any recorded version.
 */
int ocl_opencl_c_majver_supported(ocl_gpu_t *dev, int majver)
{
	const int wanted = (majver > 0) ? majver : 0;
	int idx, major, highest = 0;

	for (idx = 0; idx < dev->num_cvers; idx++)
	{
		major = VERMAJ(dev->cvers[idx]);
		if (major == wanted)
			return 1;
		if (major > highest)
			highest = major;
	}

	if (wanted > 0)
		return 0;
	return highest;
}


/* Gather OpenCL info and fill our device fields, then create the compute
 * context and the command queue of the device.
 * The device has already been checked if it fulfils the requirements.
 *
 *   dev    : our descriptor to fill; messages identify the device by its
 *            position relative to the ocl_gpus table
 *   cldev  : the OpenCL device id
 *   clplat : the OpenCL platform of the device
 *
 * Returns 0 on success and -1 if the context or the command queue cannot be
 * created. On failure the strings allocated here are freed and their
 * pointers are reset to NULL, so that oclgd_finish() does not free them a
 * second time; the context pointer is reset as well.
 *
 * Possibly useful infos for the future:
 *   CL_DEVICE_ADDRESS_BITS (= 32 or 64)
 *   CL_DEVICE_MAX_MEM_ALLOC_SIZE ( = min(1/4 GlobalMemSize, 1GB) or 32MB )
 *   CL_DEVICE_KERNEL_CLOCK_CAPABILITIES_KHR & Kernel Clock Functions
 *   CL_DEVICE_WORK_GROUP_COLLECTIVE_FUNCTIONS_SUPPORT
 */
int oclgd_prepare(ocl_gpu_t *dev, cl_device_id cldev, cl_platform_id clplat)
{
	char     devinfo[OCL_QUERY_SIZE];
	cl_uint  dev_uint;
	cl_bool  dev_bool;
	cl_int   err = 0;
	cl_device_svm_capabilities svm_caps;
	long     devidx = (long) (dev - ocl_gpus);   /* device index for messages */

	dev->clplatform = clplat;
	dev->cldevice = cldev;

	/* Platform and device (hardware) versions. The buffer is cleared before
	 * every string query so that a failed query leaves an empty string
	 * instead of indeterminate or stale contents; the +7 skips the leading
	 * "OpenCL " of the version strings.
	 */
	memset(devinfo, 0, sizeof(devinfo));
	clGetPlatformInfo(clplat, CL_PLATFORM_VERSION, OCL_QUERY_SIZE, devinfo, NULL);
	dev->pver = ocl_version_from_string(devinfo + 7);
	memset(devinfo, 0, sizeof(devinfo));
	clGetDeviceInfo(cldev, CL_DEVICE_VERSION, OCL_QUERY_SIZE, devinfo, NULL);
	dev->dver = ocl_version_from_string(devinfo + 7);
	/* OpenCL C versions supported; this needs work depending on the version */
	ocl_openclc_versions(cldev, dev->cvers, &dev->num_cvers);
	/* Driver version - vendor specific (useless for us) */
	memset(devinfo, 0, sizeof(devinfo));
	clGetDeviceInfo(cldev, CL_DRIVER_VERSION, OCL_QUERY_SIZE, devinfo, NULL);
	dev->device_driver_version = strdup(devinfo);

	clGetDeviceInfo(cldev, CL_DEVICE_MAX_COMPUTE_UNITS,
	                sizeof(cl_uint), &(dev->device_max_computeinits), NULL);
	clGetDeviceInfo(cldev, CL_DEVICE_MAX_WORK_GROUP_SIZE,
	                sizeof(size_t), &(dev->device_team_max_threads), NULL);
	clGetDeviceInfo(cldev, CL_DEVICE_GLOBAL_MEM_SIZE,
	                sizeof(cl_ulong), &(dev->device_global_mem_size), NULL);
	memset(devinfo, 0, sizeof(devinfo));
	clGetDeviceInfo(cldev, CL_DEVICE_NAME, OCL_QUERY_SIZE, devinfo, NULL);
	dev->device_name = strdup(devinfo);
	clGetDeviceInfo(cldev, CL_DEVICE_LOCAL_MEM_SIZE,
	                sizeof(cl_ulong), &(dev->device_local_mem_size), NULL);
	memset(devinfo, 0, sizeof(devinfo));
	clGetDeviceInfo(cldev, CL_DEVICE_VENDOR, OCL_QUERY_SIZE, devinfo, NULL);
	dev->device_vendor = strdup(devinfo);
	clGetDeviceInfo(cldev, CL_DEVICE_DOUBLE_FP_CONFIG,
	                sizeof(cl_device_fp_config), &(dev->doublesupport), NULL);
	/* strdup() may have failed, so guard the vendor test */
	if (dev->device_vendor != NULL &&
	    strstr(dev->device_vendor, "Advanced Micro Devices") != NULL)
		dev->device_optimal_team_threads = 64;
	else
		dev->device_optimal_team_threads = 32; /* Maybe this should be = max? */
	dev->svm_coarse = dev->svm_fine_buffer = dev->svm_fine_system = 0;
	if (clGetDeviceInfo(cldev, CL_DEVICE_SVM_CAPABILITIES,
	                    sizeof(svm_caps), &svm_caps, NULL) == CL_SUCCESS)
	{
		dev->svm_coarse      = (svm_caps & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER);
		dev->svm_fine_buffer = (svm_caps & CL_DEVICE_SVM_FINE_GRAIN_BUFFER);
		dev->svm_fine_system = (svm_caps & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM);
	}
	if (clGetDeviceInfo(cldev, CL_DEVICE_HOST_UNIFIED_MEMORY,
	                    sizeof(cl_bool), &dev_bool, NULL) == CL_SUCCESS)
		dev->unified_memory_12 = dev_bool;

	if (clGetDeviceInfo(cldev, CL_DEVICE_WORK_GROUP_COLLECTIVE_FUNCTIONS_SUPPORT,
	                    sizeof(cl_bool), &dev_bool, NULL) == CL_SUCCESS)
		dev->haswgcollectives = dev_bool;
	if (clGetDeviceInfo(cldev, CL_DEVICE_GENERIC_ADDRESS_SPACE_SUPPORT,
	                    sizeof(cl_bool), &dev_bool, NULL) == CL_SUCCESS)
		dev->hasgenericmemspace = dev_bool;
	/* Only warn about host/device mismatches; they are not treated as errors */
	if (clGetDeviceInfo(cldev, CL_DEVICE_ENDIAN_LITTLE, sizeof(cl_bool),
	                    &dev_bool, NULL) == CL_SUCCESS)
		if ((dev_bool == CL_TRUE && is_big_endian()) ||
		    (dev_bool == CL_FALSE && !is_big_endian()))
			fprintf(stderr, "[opencl module]: warning: GPU %ld has different "
			                "endianness from the CPU\n", devidx);
	if (clGetDeviceInfo(cldev, CL_DEVICE_ADDRESS_BITS,
	                    sizeof(cl_uint), &dev_uint, NULL) == CL_SUCCESS)
		if ((dev_uint == 32 && is_64bit()) || (dev_uint == 64 && !is_64bit()))
			fprintf(stderr, "[opencl module]: warning: GPU %ld has different memory "
			                "address width from the CPU\n", devidx);

	/* Create a compute context */
	dev->context = clCreateContext(0, 1, &(dev->cldevice), NULL, NULL, &err);
	if (!dev->context || err)
	{
		fprintf(stderr, "[opencl module]: warning: cannot create compute context "
		                "for GPU %ld (%s)\n", devidx, ocl_errstr(err));
		goto FREEMEM;
	}

	/* Create a command queue; if needed enable profiling */
#if ORT_DEBUG_LEVEL > 0
	{
		struct timeval t1, t2;
		gettimeofday(&t1, NULL);

		cl_command_queue_properties props[3]= {
					CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0
				};
		dev->commands = clCreateCommandQueueWithProperties(dev->context,
		                          dev->cldevice, props, &err);
	
		gettimeofday(&t2, NULL);
		DBGPRN1((stderr, "\t[%s] : \n", __func__));
		DBGPRN1((stderr, "\t\tclCreateCommandQueueWithProperties time: %.3lf sec\n", 
		                SECPASSED(t2,t1)));
	}
#else
	dev->commands = clCreateCommandQueueWithProperties(dev->context,
	                          dev->cldevice, NULL, &err);
#endif

	if (!dev->commands || err)
	{
		fprintf(stderr, "[opencl module]: warning: cannot create command qeueue "
		                "for GPU %ld (%s)\n", devidx, ocl_errstr(err));
		clReleaseContext(dev->context);
		dev->context = NULL;            /* do not leave a released handle behind */
		goto FREEMEM;
	}
	
	dev->devrt = NULL;
	dev->pcentries = 0;
	return 0;

FREEMEM:
	/* Reset the pointers: oclgd_finish() frees these fields for every device */
	free(dev->device_name);
	dev->device_name = NULL;
	free(dev->device_vendor);
	dev->device_vendor = NULL;
	free(dev->device_driver_version);
	dev->device_driver_version = NULL;
	return -1;
}


/* Slot that lies c positions before position x in the circular program cache */
#define PCACHE_MODSUB(x,c) (((x)+OCL_PCACHE_SIZE-(c)) % OCL_PCACHE_SIZE)

/**
 * Find and return a cached program object. The cache is scanned from the
 * most recently added entry towards the oldest one that is still stored.
 * @param gpu the device
 * @param hostfunc the host function (caching is indexed on this)
 * @return the program object or NULL if not found
 */
ocl_program_t *
oclgd_get_cached_program(ocl_gpu_t *gpu, void *(*hostfunc)(void *))
{
	int back, slot, stored;

	/* pcentries counts every addition; at most OCL_PCACHE_SIZE are kept */
	stored = gpu->pcentries;
	if (stored > OCL_PCACHE_SIZE)
		stored = OCL_PCACHE_SIZE;

	for (back = 1; back <= stored; back++)
	{
		slot = PCACHE_MODSUB(gpu->pcentries, back);
		if (gpu->pcache[slot].hostfunc == hostfunc)
			return &gpu->pcache[slot];
	}
	return NULL;
}


/**
 * Adds a program object to the device cache. The cache is circular: once
 * OCL_PCACHE_SIZE programs have been added, every new one replaces the
 * oldest entry, whose file paths are freed and whose program is released.
 * The whole operation runs under the module lock.
 * @param gpu the device
 * @param hf the host function (caching is indexed on this)
 * @param fp the kernel file path the program was created from; the cache
 *           stores the pointer and frees it when the entry is replaced
 * @param sfp the path of the kernel sources; stored and freed like fp
 * @param pg the program object to cache
 * @return the cache entry that now holds the program
 */
ocl_program_t *oclgd_add_cached_program(ocl_gpu_t *gpu, void *(*hf)(void *),
                                        char *fp, char *sfp, cl_program pg)
{
	int e;
	
	lock(&_oclgpu_lock);
	
	e = gpu->pcentries % OCL_PCACHE_SIZE;
	/* Slots 0..OCL_PCACHE_SIZE-1 are used for the first time while
	 * pcentries < OCL_PCACHE_SIZE; from then on slot e holds an older entry.
	 */
	if (gpu->pcentries < OCL_PCACHE_SIZE)       /* Initialize program lock */
		init_lock(&gpu->pcache[e].lock, ORT_LOCK_NORMAL);
	else                             /* Release resources before replacing */
	{
		free(gpu->pcache[e].filepath);
		free(gpu->pcache[e].filepath_sources); 
		clReleaseProgram(gpu->pcache[e].program);
	}
	gpu->pcache[e].hostfunc = hf;
	gpu->pcache[e].filepath = fp;
	gpu->pcache[e].filepath_sources = sfp;
	gpu->pcache[e].program = pg;
	gpu->pcentries++;
	
	unlock(&_oclgpu_lock);
	return &gpu->pcache[e];
}


/* Releases the command queue and the context of the device and empties its
 * program cache (file paths are freed, programs are released).
 * NOTE(review): dev->devrt and dev->pcentries are left untouched here, and
 * the per-entry locks are not destroyed -- confirm whether that is intended.
 */
void olcgd_close(ocl_gpu_t *dev)
{
	int i, used;
	
	clReleaseCommandQueue(dev->commands);
	dev->commands = NULL;
	clReleaseContext(dev->context);
	dev->context = NULL;
	
	/* pcentries counts all additions ever made and can exceed the capacity
	 * of the circular cache; only OCL_PCACHE_SIZE slots actually exist.
	 */
	used = dev->pcentries;
	if (used > OCL_PCACHE_SIZE)
		used = OCL_PCACHE_SIZE;

	/* Clear the program cache */
	for (i = 0; i < used; i++)
	{
		free(dev->pcache[i].filepath);
		free(dev->pcache[i].filepath_sources); 
		clReleaseProgram(dev->pcache[i].program);
		dev->pcache[i].filepath = NULL;
		dev->pcache[i].filepath_sources = NULL;
		dev->pcache[i].program = NULL;
		/* Should also free the lock */
	}
}


/* Loads the device runtime library ("devpart") binary of the given device
 * and creates an OpenCL program from it. The result is remembered in
 * gpu->devrt, so later calls return it immediately.
 *
 *   gpu        : the device
 *   libdirpath : directory that contains the "opencl" subdirectory
 *   MALflavor  : if non-empty, the file is "<flavor>-devpart.clbin.<id>",
 *                otherwise "devpart.clbin.<id>"; <id> is the position of
 *                gpu relative to the ocl_gpus table
 *
 * Returns the program, or NULL (after printing a message) if the file
 * cannot be opened or read completely, if memory runs out, or if the
 * program cannot be created. All temporary resources (path, file
 * descriptor, binary image) are released on every path.
 */
cl_program oclgd_load_devpart(ocl_gpu_t *gpu, char *libdirpath, char *MALflavor)
{
	cl_int     err;
	cl_program program = NULL;          // compute program
	char      *binary = NULL;
	char      *libpath = NULL;
	size_t     pathsize, filesize = 0, total = 0;
	ssize_t    got;
	off_t      end;
	int        libfd = -1;
	long       devidx = (long) (gpu - ocl_gpus);

	if (gpu->devrt)
		return (gpu->devrt);

	/* Get the binary; 64 bytes cover the fixed parts of the name, any
	 * decimal long and the terminator.
	 */
	pathsize = strlen(libdirpath) + strlen(MALflavor) + 64;
	libpath = (char *) malloc(pathsize);
	if (libpath == NULL)
	{
		fprintf(stderr, "Failed to allocate memory for the opencl devpart path\n");
		goto DONE;
	}
	if (*MALflavor)
		snprintf(libpath, pathsize, "%s/opencl/%s-devpart.clbin.%ld", libdirpath,
		         MALflavor, devidx);
	else
		snprintf(libpath, pathsize, "%s/opencl/devpart.clbin.%ld", libdirpath,
		         devidx);
	if ((libfd = open(libpath, O_RDONLY)) < 0)
	{
		fprintf(stderr, "Failed to open opencl devpart (%s)\n", libpath);
		goto DONE;
	}

	/* Find the binary size, rewind and allocate memory for the image */
	end = lseek(libfd, 0, SEEK_END);
	if (end >= 0 && lseek(libfd, 0, SEEK_SET) == 0)
	{
		filesize = (size_t) end;
		binary = (char *) calloc(filesize + 1, sizeof(char));
	}

	/* Read the file; read() may legitimately return fewer bytes than asked */
	while (binary != NULL && total < filesize)
	{
		got = read(libfd, binary + total, filesize - total);
		if (got < 0 && errno == EINTR)
			continue;
		if (got <= 0)
			break;
		total += (size_t) got;
	}
	if (binary == NULL || total != filesize)
	{
		fprintf(stderr, "Failed to read devpart.clbin.%ld\n", devidx);
		goto DONE;
	}

	/* Create program */
	program = clCreateProgramWithBinary(gpu->context, 1, &(gpu->cldevice),
	                     &filesize, (const unsigned char **) &binary, NULL, &err);
	if (err != CL_SUCCESS)
	{
		fprintf(stderr, "Failed to create the library program\n\t%s\n",
		        ocl_errstr(err));
		program = NULL;
		goto DONE;
	}
	gpu->devrt = program;

DONE:
	if (libfd >= 0)
		close(libfd);
	free(binary);
	free(libpath);
	return program;
}


/* Because of OpenCL C restrictions, exploiting local memory is very
 * difficult. To be able to exploit it, we take a large portion of local
 * memory and manage it by ourselves. This portion can be obtained in two
 * ways:
 *   a) by declaring a __local uchar array within the kernel function, or
 *   b) by creating a buffer object in local memory and passing it to the
 *      kernel function; in this case it is passed as argument 0.
 * ARGSTART is the index of the first kernel argument that follows the
 * local memory buffer (if passed) and the extra-info buffer.
 */
#if OCL_PASS_LOCALMEM_AS_KERNEL_ARG
	#define ARGSTART 2
	
	/* Case (b): passes a local memory area of 1024 cl_ulong's as the first
	 * (0) argument of the kernel; exits the program on failure.
	 */
	static void passlocalmem(ocl_gpu_t *dev, cl_kernel kernel)
	{
		if (clSetKernelArg(kernel, 0, sizeof(cl_ulong)*1024, NULL) == CL_SUCCESS)
			return;
		fprintf(stderr, "\t%s failed!!\n", __func__);
		exit(1);
	}
#else
	#define ARGSTART 1
	#define passlocalmem(a,b) 
#endif


/* Pass a buffer with extra info for the devpart; it always occupies the
 * kernel argument right before ARGSTART. Exits the program on failure.
 */
static void passxtrainfo(ocl_gpu_t *dev, cl_kernel kernel, cl_mem xtra)
{
	cl_int rc;

	rc = clSetKernelArg(kernel, ARGSTART-1, sizeof(cl_mem), &xtra);
	if (rc == CL_SUCCESS)
	{
		DBGPRN1((stderr, "\t\t>> arg #%d is xtrainfo\n",ARGSTART-1));
		return;
	}
	fprintf(stderr, "\t%s failed (%s)\n", __func__, ocl_errstr(rc));
	exit(1);
}


/* Checks the outcome of setting a kernel argument: returns silently on
 * CL_SUCCESS, otherwise reports the failure and exits the program.
 *   dev    : the device (consulted only to name the call that was used)
 *   status : the status returned when the argument was set
 *   msg    : the kind of the argument, for the message
 *   argnum : the number of the argument within its kind; if negative, no
 *            number is printed
 */
void checkArgStatus(ocl_gpu_t *dev, cl_int status, char *msg, int argnum)
{
	const char *setter = "clSetKernelArg()";

	if (status == CL_SUCCESS)
		return;

#ifdef OCL_USE_CL2_SVM_ARGS
	if (dev->svm_coarse)
		setter = "clSetKernelArgSVMPointer()";
#endif
	fprintf(stderr, "\t[opencl]: %s failed on passing %s kernel argument",
	                setter, msg);
	if (argnum >= 0)
		fprintf(stderr, " #%d: %s\n", argnum, ocl_errstr(status));
	else
		fprintf(stderr, ": %s\n", ocl_errstr(status));
	exit(1);
}


/* Read arguments from pointer list and pass them to the OpenCL kernel.
 *
 *   dev      : the device
 *   kernel   : the kernel whose arguments are set
 *   num_args : three counters: [0] declare-target arguments,
 *              [1] firstprivate arguments, [2] entries for mapped arguments
 *   args     : the argument values, in the above order; the mapped entries
 *              alternate between a pointer (even positions) and its offset
 *              (odd positions), the offset being stored in the pointer slot
 *   xtrainfo : buffer with extra info for the devpart
 *
 * Kernel arguments are laid out as: [local memory buffer, if configured],
 * xtrainfo, a NULL dev_data pointer (at index ARGSTART), then the
 * declare-target, firstprivate and mapped arguments.
 * Any failure terminates the program (see checkArgStatus()).
 */
void oclgd_pass_args_to_kernel(ocl_gpu_t *dev, cl_kernel kernel, 
                               int *num_args, void **args, cl_mem xtrainfo)
{
	int i, argid = ARGSTART, offset = 0;
	cl_int status;
	cl_ulong offval;
	/* Not named "nullptr": that is a keyword as of C23 */
	void *ptr_var, *nullarg = NULL;

	// TODO: Maybe add some error checking for `num_args`'s length here?
	int ndeclargs   = num_args[0]; // DECLARE_ARGS
	int nfpargs     = num_args[1]; // FIRSTPRIV_ARGS
	int nmappedargs = num_args[2]; // MAPPED_ARGS

	DBGPRN1((stderr, "\t[%s] :\n", __func__));

	/* Start with the dev_data pointer which is ignored (=> NULL) */
#ifdef OCL_USE_CL2_SVM_ARGS
	if (dev->svm_coarse)
		status = clSetKernelArgSVMPointer(kernel, argid++, NULL);
	else
#endif
		status = clSetKernelArg(kernel, argid++, sizeof(cl_mem), &nullarg);
	DBGPRN1((stderr, "\t\t>> arg #%d is always NULL\n", argid-1));
	checkArgStatus(dev, status, "dev_data", -1);

	/* Declare target and firstprivate variables handling */
	for (i = 0; i < ndeclargs + nfpargs; ++i)
	{
		ptr_var = args[offset++];
#ifdef OCL_USE_CL2_SVM_ARGS
		if (dev->svm_coarse)
			status = clSetKernelArgSVMPointer(kernel, argid++, ptr_var);
		else
#endif
			status = clSetKernelArg(kernel, argid++, sizeof(cl_mem), &ptr_var);
		checkArgStatus(dev, status, i < ndeclargs ? "decl" : "fip", 
		                            i < ndeclargs ? i : i-ndeclargs);
		DBGPRN1((stderr, "\t\t>> arg #%d is %s arg %d (%p)\n", argid-1,
		                 i < ndeclargs ? "decl" : "fip", 
		                 i < ndeclargs ? i : i-ndeclargs, ptr_var));
	}
	
	/* Pass the mapped variables */
	for (i = 0; i < nmappedargs; ++i, offset++, argid++)
	{
		ptr_var = args[offset];
		if (i % 2)   /* offset */
		{
			/* Widen explicitly: the kernel argument is sizeof(cl_ulong) bytes,
			 * which is more than a host pointer holds on 32-bit hosts.
			 */
			offval = (cl_ulong) (uintptr_t) ptr_var;
			status = clSetKernelArg(kernel, argid, sizeof(cl_ulong), &offval);
		}
		else
		{
#ifdef OCL_USE_CL2_SVM_ARGS
			if (dev->svm_coarse)
				status = clSetKernelArgSVMPointer(kernel, argid, ptr_var);
			else
#endif
				status = clSetKernelArg(kernel, argid, sizeof(cl_mem), &ptr_var);
		}
		checkArgStatus(dev, status, "mapped", i);
		DBGPRN1((stderr, "\t\t>> arg #%d is normal %s %d (%p)\n", argid,
		                 i % 2 ? "offset" : "arg", 
		                 i / 2, i % 2 ? "" : ptr_var));
	}

	passxtrainfo(dev, kernel, xtrainfo);
	passlocalmem(dev, kernel);   /* Create local memory buffer if needed */

	DBGPRN1((stderr, "\t\t>> all mappings done; about to execute the kernel\n"));
}


/* Fix thread limits and/or give default values if needed, for the given
 * kernel.
 *
 *   dev    : the device
 *   kernel : the kernel that is about to be launched
 *   numthr : in: the requested threads per team; if negative, the kernel's
 *            work group size is used (or the device's optimal team size if
 *            that query fails); 0 becomes 1. out: the value to use.
 *   thrlim : in: the thread limit; if <= 0 the device maximum is used.
 *            out: the value to use.
 *
 * The number of threads is capped by the thread limit. If it still exceeds
 * the device maximum, then, depending on ocl_xceed_limits_behavior, either
 * 1 is returned (XLB_EXIT, XLB_EXITMUTE; the outputs are not updated) or
 * the number is reduced to the device maximum (with a warning if XLB_WARN).
 * Returns 0 otherwise.
 */
int ocldg_setup_threading(ocl_gpu_t *dev, cl_kernel kernel, 
                         int *numthr, int *thrlim)
{
	int num_threads = *numthr, thread_limit = *thrlim;
	size_t wgsize = 0;

	/* Well, this should normally play with the device ICVs... */
	if (thread_limit <= 0)
		thread_limit = (int) dev->device_team_max_threads;
	if (num_threads < 0)
	{
		/* CL_KERNEL_WORK_GROUP_SIZE is returned as a size_t, so it must not
		 * be stored directly into an int.
		 */
		if (clGetKernelWorkGroupInfo(kernel, dev->cldevice, 
		                             CL_KERNEL_WORK_GROUP_SIZE, sizeof(wgsize), 
		                             &wgsize, NULL) == CL_SUCCESS)
			num_threads = (int) wgsize;
		else
			num_threads = dev->device_optimal_team_threads;
	}
	if (num_threads == 0)
		num_threads = 1;
	if (num_threads > thread_limit)  /* both positive; we can compare */
		num_threads = thread_limit;

	if (num_threads > 0 && (size_t) num_threads > dev->device_team_max_threads)
	{
		if (ocl_xceed_limits_behavior == XLB_EXIT || 
		    ocl_xceed_limits_behavior == XLB_EXITMUTE)
		{
			if (ocl_xceed_limits_behavior == XLB_EXIT)
				fprintf(stderr, "[opencl]: requested #threads (%d) exceeds device limit"
			          " (%ld).\n", num_threads, (long) dev->device_team_max_threads);
			return 1;
		}
		else
		{
			if (ocl_xceed_limits_behavior == XLB_WARN)
				fprintf(stderr, "[opencl]: requested #threads (%d) exceeds device limit"
		        "; using %ld threads\n", num_threads, 
		        (long) dev->device_team_max_threads);
			num_threads = (int) dev->device_team_max_threads;
		}
	}
	*numthr = num_threads;
	*thrlim = thread_limit;
	return 0;
}


/* Launches the kernel on the device as a 1-dimensional range of max_teams
 * work groups of num_threads work items each, and blocks until it finishes.
 *
 *   dev          : the device
 *   kernel       : the kernel, with all its arguments already set
 *   min_teams    : only normalized (>= 1, <= max_teams); not used otherwise
 *   max_teams    : the number of work groups; values <= 0 become 1
 *   num_threads  : work group size; if negative, the kernel's work group
 *                  size is used (or the device's optimal team size if that
 *                  query fails); 0 becomes 1
 *   thread_limit : upper bound for num_threads; if <= 0 the device maximum
 *                  is used
 *
 * Returns 0 on success, -1 if the kernel cannot be enqueued, -2 if the
 * queue cannot be flushed and -3 if waiting for completion fails.
 * The event of the launch is released before returning.
 */
int oclgd_kernel_execute(ocl_gpu_t *dev, cl_kernel kernel,
                int min_teams, int max_teams, int num_threads, int thread_limit)
{
	cl_int volatile result;
	size_t global_work = 0;
	size_t work_group_size;
	size_t wgsize = 0;
	cl_event kernevent = NULL;
	int rc = 0;

	/* User requested for teams of threads */
	if (thread_limit <= 0)
		thread_limit = (int) dev->device_team_max_threads;
	if (num_threads < 0)
	{
		/* CL_KERNEL_WORK_GROUP_SIZE is returned as a size_t, so it must not
		 * be stored directly into an int.
		 */
		if (clGetKernelWorkGroupInfo(kernel, dev->cldevice, 
		                             CL_KERNEL_WORK_GROUP_SIZE, sizeof(wgsize), 
		                             &wgsize, NULL) == CL_SUCCESS)
			num_threads = (int) wgsize;
		else
			num_threads = dev->device_optimal_team_threads;
	}
	if (num_threads == 0)
		num_threads = 1;
	if (num_threads > thread_limit)  /* both positive; we can compare */
		num_threads = thread_limit;
	work_group_size = num_threads;
	
	if (min_teams <= 0)
		min_teams = 1;
	if (max_teams <= 0)
		max_teams = 1;
	if (min_teams > max_teams)
		min_teams = max_teams;
	global_work = (size_t) max_teams * work_group_size;
	
	DBGPRN((stderr, "\t[%s] : teams: %d, workgroup size: %ld\n",
	        __func__, max_teams, (long) work_group_size));
	
	result = clEnqueueNDRangeKernel(dev->commands, kernel, 1, NULL,
	           (const size_t *) &global_work, (const size_t *) &work_group_size,
	           0, NULL, &kernevent);
	if (result != CL_SUCCESS)
	{
		fprintf(stderr, "[%s] clEnqueueNDRangeKernel failed: %s\n", __func__, 
		                ocl_errstr(result));
		return -1;                        /* no event was created */
	}
	
	result = clFlush(dev->commands);    /* issue all pending commands */
	if (result != CL_SUCCESS)
	{
		fprintf(stderr, "[%s] clFlush failed to issue kernel: %s\n", __func__, 
		                 ocl_errstr(result));
		rc = -2;
		goto RELEASE;
	}
	DBGPRN1((stderr, "\t\t>> kernel enqued; about to block...\n"));

	result = clFinish(dev->commands);
	if (result != CL_SUCCESS)
	{
		fprintf(stderr, "[%s] clFinish failed: %s\n", __func__, ocl_errstr(result));
		rc = -3;
		goto RELEASE;
	}

#if ORT_DEBUG_LEVEL > 0
	{
		cl_ulong start, end;

		fprintf(stderr, "\t\t>> kernel completed (execution time: ");
		if (clGetEventProfilingInfo(kernevent, CL_PROFILING_COMMAND_START, 
		                            sizeof(cl_ulong), &start, NULL) ||
		    clGetEventProfilingInfo(kernevent, CL_PROFILING_COMMAND_END, 
		                            sizeof(cl_ulong), &end, NULL))
			fprintf(stderr, " - (clGetEventProfilingInfo() returned error)\n");
		else
			DBGPRN1((stderr, "%.3lfsec)\n", (end - start) * 1.0e-9));
	}
#endif

RELEASE:
	/* The event returned by the enqueue is ours and must be released */
	clReleaseEvent(kernevent);
	return rc;
}

#undef DEBUG_LEVEL
