/*
  OMPi OpenMP Compiler
  == Copyright since 2001 the OMPi Team
  == Dept. of Computer Science & Engineering, University of Ioannina

  This file is part of OMPi.

  OMPi is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or
  (at your option) any later version.

  OMPi is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with OMPi; if not, write to the Free Software
  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/

#include "config.h"
#include "stddefs.h"
#include "cudautils.h"
#include "ptx.h"
#include "context.h"
#include "stream.h"
#include "set.h"

// #define DBGPRN_FORCE
// #define DBGPRN_BLOCK
#define DBGPRN_FILTER DBG_DEVICES

int available_cuda_gpus;     /* Number of CUDA-capable GPUs with SM >= 3.5 */
cuda_dev_t *cuda_gpus;    /* Global GPU bookkeeping array */

/**
 * Reports a failed CUDA driver API call and terminates the process.
 *
 * Does nothing when `err` is CUDA_SUCCESS. Otherwise prints the module
 * name, the calling function, the source location and a description of
 * the error to stderr, optionally tears down the context of `cuda_gpu`
 * (only when `destroy_context` is non-zero and `cuda_gpu` is not NULL)
 * and exits with status 1.
 *
 * @param func            name of the function that issued the call
 * @param err             result code returned by the driver API call
 * @param file            source file of the call site
 * @param line            source line of the call site
 * @param destroy_context non-zero to destroy the GPU context before exiting
 * @param cuda_gpu        GPU whose context should be destroyed (may be NULL)
 */
void __check_cuda_errors(const char *func, CUresult err, const char *file, const int line, 
						int destroy_context, cuda_dev_t *cuda_gpu)
{
	const char *error_desc = NULL;

	if (err == CUDA_SUCCESS)
		return;

	/* cuGetErrorString() sets the string to NULL for codes it does not
	 * recognize; never hand a NULL pointer to "%s". */
	if (cuGetErrorString(err, &error_desc) != CUDA_SUCCESS || error_desc == NULL)
		error_desc = "unknown error";

	fprintf(stderr,
			"[%s] %s: %s:%i: CUDA error: %s; exiting.\n",
			modulename, func, file, line, error_desc );
	if (destroy_context && cuda_gpu)
		cuda_ctx_test_destroy(cuda_gpu);
	exit(1);
}

/**
 * Duplicates a memory region on the heap.
 *
 * Allocates `bytes` bytes with malloc() and fills them with the first
 * `bytes` bytes found at `src`. The process is terminated with exit
 * status 1 if the allocation fails. The caller owns the returned buffer.
 */
static
void *malloc_memcpy(void *src, size_t bytes)
{
	void *dup = malloc(bytes);

	if (!dup)
	{
		fprintf(stderr, "[malloc_memcpy]: allocation failed\n");
		exit(1);
	}

	/* memcpy() returns its destination, i.e. the fresh copy */
	return memcpy(dup, src, bytes);
}

/**
 * Returns the number of CUDA cores per multiprocessor for the compute
 * capability `major`.`minor`.
 *
 * The capability is looked up as `major * 10 + minor` in a fixed table.
 * When there is no exact match, the value of the last real table entry
 * (the one right before the terminating sentinel) is returned.
 */
static
int get_sm_num_cores(int major, int minor) {
	struct sm_cores_entry {
		int sm;
		int cores;
	};

	/* 
	 * https://en.wikipedia.org/wiki/CUDA
	 * Number of ALU lanes for integer and single-precision 
	 * floating-point arithmetic operations.
	 */
	static const struct sm_cores_entry cores_table[] = {
		/* Kepler */
		{ 30, 192 }, { 32, 192 }, { 35, 192 }, { 37, 192 },
		/* Maxwell */
		{ 50, 128 }, { 52, 128 }, { 53, 128 },
		/* Pascal */
		{ 60,  64 }, { 61, 128 }, { 62, 128 },
		/* Volta / Turing */
		{ 70,  64 }, { 72,  64 }, { 75,  64 },
		/* Ampere */
		{ 80,  64 }, { 86, 128 }, { 87, 128 },
		/* Ada */
		{ 89, 128 },
		/* Hopper */
		{ 90, 128 },
		/* Blackwell */
		{ 100, 128 }, { 120, 128 },
		/* Sentinel */
		{ -1, -1 }
	};
	const int wanted = major * 10 + minor;
	const struct sm_cores_entry *entry = cores_table;

	while (entry->sm != -1)
	{
		if (entry->sm == wanted)
			return entry->cores;
		entry++;
	}

	/* No exact match: use the last real entry of the table */
	return (entry - 1)->cores;
}


/**
 * Returns the architecture name for the compute capability
 * `major`.`minor`.
 *
 * The capability is looked up as `major * 10 + minor` in a fixed table.
 * When there is no exact match, the name of the last real table entry
 * (the one right before the sentinel) is returned; the name stored in
 * the sentinel entry itself is never returned by this lookup.
 */
static
char *get_sm_arch_name(int major, int minor) {
	struct sm_name_entry {
		int sm;
		char *name;
	};

	static const struct sm_name_entry names_table[] = {
		{ 30, "Kepler"  }, { 32, "Kepler"  }, { 35, "Kepler"  }, { 37, "Kepler" },
		{ 50, "Maxwell" }, { 52, "Maxwell" }, { 53, "Maxwell" },
		{ 60, "Pascal"  }, { 61, "Pascal"  }, { 62, "Pascal"  },
		{ 70, "Volta"   },
		{ 72, "Xavier"  },
		{ 75, "Turing"  },
		{ 80, "Ampere"  }, { 86, "Ampere"  }, { 87, "Ampere"  },
		{ 89, "Ada"     },
		{ 90, "Hopper"  },
		{ 100, "Blackwell" }, { 120, "Blackwell" },
		/* Sentinel */
		{ -1, "Graphics Device" }
	};
	const int wanted = major * 10 + minor;
	const struct sm_name_entry *entry = names_table;

	while (entry->sm != -1)
	{
		if (entry->sm == wanted)
			return entry->name;
		entry++;
	}

	/* No exact match: use the last real entry of the table */
	return (entry - 1)->name;
}


#if 0
/*
 * Translates a host address to the corresponding CUDA mediary one.
 *
 * Looks `hostaddr` up in the address map of `cuda_gpu` and returns the
 * value stored for it, or 0 when the map has no entry for that address.
 *
 * NOTE: this function sits inside an `#if 0` region and is currently
 * not compiled.
 */
static
CUdeviceptr get_host2med_addr(void *hostaddr, cuda_dev_t *cuda_gpu)
{
	setelem(hostmed_addrmap) e;

	/* No mapping registered for this host address */
	if ((e = set_get(cuda_gpu->addr_map, hostaddr)) == NULL)
		return 0;

	return e->value;
}


/**
 * Translates a CUDA mediary address to the corresponding host one.
 *
 * Walks the address map of `cuda_gpu` element by element, starting at
 * its `first` element, and returns the key of the first element whose
 * value equals `medaddr`. Returns NULL when no element matches.
 *
 * NOTE: this function sits inside an `#if 0` region and is currently
 * not compiled.
 */
static
void *get_med2host_addr(CUdeviceptr medaddr, cuda_dev_t *cuda_gpu)
{
	setelem(hostmed_addrmap) e;

	/* Reverse lookup: the map is keyed by host address, so scan the values */
	for (e = cuda_gpu->addr_map->first; e; e = e->next)
		if (e->value == medaddr)
			return e->key;

	return NULL;
}
#endif


/**
 * Allocates `size` bytes of CUDA device memory and enqueues a copy of
 * `size` bytes from `hostaddr` into it.
 *
 * The device address is written to the CUdeviceptr that `*devaddr`
 * points to. The copy is issued with cuMemcpyHtoDAsync() on the stream
 * returned by cuda_stream_get() for `cuda_gpu`.
 */
static
void cuda_memAlloc_copyHtoD(void *hostaddr, CUdeviceptr **devaddr, size_t size, 
	                        cuda_dev_t *cuda_gpu)
{
	CUstream *copy_stream = cuda_stream_get(cuda_gpu);
	CUdeviceptr *slot = *devaddr;   /* where the device address is stored */

	cuda_do(cuMemAlloc(slot, size));
	cuda_do(cuMemcpyHtoDAsync(*slot, hostaddr, size, *copy_stream));
}


#if 0
/**
 * Calculates the two optimal dimensions of the execution
 * grid and block, given a team and a thread size.
 *
 * Tries every i with i * i <= size; among the values that divide `size`
 * exactly, it keeps the pair (X = i, Y = size / i) whose difference
 * Y - X is the smallest. Z is always set to 1.
 *
 * If `size` is smaller than 1 the loop body never runs, so X and Y are
 * left untouched.
 *
 * NOTE: this function sits inside an `#if 0` region and is currently
 * not compiled.
 */
static
void set_2D_from_1D_kerneldims(int size, cuda_dim_t *dims)
{
	int min = INT_MAX, i;

	for (i = 1; i * i <= size; i++) 
	{
		/* Only exact divisors give a valid X * Y == size factorization */
		if (size % i == 0)
		{
			/* Prefer the factor pair that is closest to a square */
			if (size/i - i < min)
			{
				dims->X = i;
				dims->Y = size/i;
				min = (size/i) - i;
			}
		}
	}
		
	/* Currently we are 2D ... */
	dims->Z = 1;
}
#endif


/**
 * Fills a 3-component dimension structure from a packed dimension value.
 *
 * `dims` is unpacked into three components with _ull_decode3().
 *  - X takes the first component only when it is non-zero and at least
 *    one of the other two components is non-zero as well; in every other
 *    case X is set to `num_1d`.
 *  - Y and Z take their component, or 1 when that component is zero.
 */
static
void set_dims(cuda_dim_t *block_dims, int num_1d, unsigned long long dims)
{
	unsigned long dx, dy, dz;
	unsigned long first = num_1d;   /* default: plain 1D size */

	_ull_decode3(dims, &dx, &dy, &dz);

	/* An explicit X is honoured only for genuinely multi-dimensional requests */
	if (dx && (dy || dz))
		first = dx;
	if (dy == 0)
		dy = 1;
	if (dz == 0)
		dz = 1;

	block_dims->X = first;
	block_dims->Y = dy;
	block_dims->Z = dz;
}


/*
 * Counts the CUDA devices whose compute capability is at least 3.5.
 *
 * Initializes the CUDA driver API with cuInit(0), queries the number of
 * devices and inspects the compute capability of each one. Terminates
 * the process with exit status 1 when the driver reports no device at
 * all.
 *
 * Returns the number of devices that passed the capability test.
 */
int cuda_get_num_gpus(void)
{
	CUdevice dev;
	int ndevices = 0;
	int ordinal;
	int major = 0, minor = 0;
	int nusable = 0;

	cuda_do(cuInit(0));
	cuda_do(cuDeviceGetCount(&ndevices));

	if (ndevices == 0) {
		fprintf(stderr, 
			"cuda_get_num_gpus(): error: no CUDA-capable devices found\n");
		exit(1);
	}

	for (ordinal = 0; ordinal < ndevices; ordinal++)
	{
		cuda_do(cuDeviceGet(&dev, ordinal));
		cuda_do(cuDeviceGetAttribute(&major, 
		                            CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, 
		                            dev));
		cuda_do(cuDeviceGetAttribute(&minor, 
		                            CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, 
		                            dev));

		/* Pack major/minor as one hex digit each; 0x35 stands for SM 3.5 */
		if ((major << 4) + minor >= 0x35)
			nusable += 1;
	}

	return nusable;
}

/**
 * Initializes the information structure of every CUDA-capable GPU.
 */
void cuda_init(void)
{
	int i;

	if (cuda_gpus == NULL)
	{
		if (available_cuda_gpus == 0)
			available_cuda_gpus = cuda_get_num_gpus();
		cuda_gpus = (cuda_dev_t *) malloc(available_cuda_gpus*sizeof(cuda_dev_t));
		if (cuda_gpus == NULL)
		{
			fprintf(stderr, "[cuda_init]: allocation failed\n");
			exit(1);
		}
	}
	
	for (i = 0; i < available_cuda_gpus; i++)
	{
		/* Get device with offset `i` */
		cuda_do(cuDeviceGet(&(cuda_gpus[i].device), i));
		
		/* Get SM version major and minor */
		cuda_do(cuDeviceGetAttribute(&(cuda_gpus[i].device_sm_version_major), 
		                            CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, 
		                            cuda_gpus[i].device));

		cuda_do(cuDeviceGetAttribute(&(cuda_gpus[i].device_sm_version_minor), 
		                            CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, 
		                            cuda_gpus[i].device));
		
		/* Get maximum supported CUDA Toolkit version */
		cuda_do(cuDriverGetVersion(&(cuda_gpus[i].max_toolkit_version)));
							
		/* Get number of multiprocessors for the selected device */
		cuda_do(cuDeviceGetAttribute(&(cuda_gpus[i].device_num_mp), 
		                            CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, 
		                            cuda_gpus[i].device));
							
		/* Get maximum number of threads per block */
		cuda_do(cuDeviceGetAttribute(&(cuda_gpus[i].device_max_thread_block_size), 
		                            CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, 
		                            cuda_gpus[i].device));

		/* Get maximum number of blocks per grid */
		cuda_do(cuDeviceGetAttribute(&(cuda_gpus[i].device_max_blocks_per_grid), 
		                            CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, 
		                            cuda_gpus[i].device));									
												
		/* Get device global memory size */										
		cuda_do(cuDeviceTotalMem(&(cuda_gpus[i].device_global_mem_size), 
		                         cuda_gpus[i].device));

		/* Get device shared memory size */										
		cuda_do(cuDeviceGetAttribute(&(cuda_gpus[i].device_shared_mem_size), 
		                            CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, 
		                            cuda_gpus[i].device));	

		/* Check if device supports unified addressing */										
		cuda_do(cuDeviceGetAttribute(&(cuda_gpus[i].device_has_ua), 
		                            CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, 
		                            cuda_gpus[i].device));

		/* Check if device supports managed memory */										
		cuda_do(cuDeviceGetAttribute(&(cuda_gpus[i].device_has_mgmem), 
		                            CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, 
		                            cuda_gpus[i].device));

		/* Get device name */
		cuda_do(cuDeviceGetName(cuda_gpus[i].device_name, 256, 
		                        cuda_gpus[i].device));
		
		/* Get number of cores per multiprocessor, based on the SM */
		cuda_gpus[i].device_num_mp_cores = get_sm_num_cores(
		                                   cuda_gpus[i].device_sm_version_major, 
		                                   cuda_gpus[i].device_sm_version_minor);

		/* Get number of cores per multiprocessor, based on the SM */
		cuda_gpus[i].arch_name = get_sm_arch_name(cuda_gpus[i].device_sm_version_major, 
		                         cuda_gpus[i].device_sm_version_minor);

		cuda_gpus[i].status = DEVICE_UNINITIALIZED;
		pthread_key_create(&(cuda_gpus[i].ctx_key), NULL);
		pthread_key_create(&(cuda_gpus[i].stream_key), NULL);

#if defined(USE_CACHING)
		pthread_key_create(&(cuda_gpus[i].cache_key), NULL);
#endif
	}
}

/**
 * Clears global GPU bookkeeping array
 */
void cuda_clear(void)
{
	int i;
	if (cuda_gpus == NULL)
		return;

	for (i = 0; i < available_cuda_gpus; i++)
	{
		pthread_key_delete(cuda_gpus[i].ctx_key);
		pthread_key_delete(cuda_gpus[i].stream_key);
#if defined(USE_CACHING)
		pthread_key_delete(cuda_gpus[i].cache_key);
#endif
	}

	free(cuda_gpus);
	cuda_gpus = NULL;
	available_cuda_gpus = 0;
}

/* Computes (x - c) modulo CUDA_CACHE_SIZE. CUDA_CACHE_SIZE is added
 * before the subtraction so that the left operand of % stays
 * non-negative as long as c <= x + CUDA_CACHE_SIZE. */
#define MODSUB(x,c) (((x)+CUDA_CACHE_SIZE-(c)) % CUDA_CACHE_SIZE)

/**
 * Tests if a thread-specific kernel cache is set. If not,
 * it initializes and sets a new one.
 *
 * The new cache starts with zero kernels; every slot gets a NULL name
 * and zero arguments. Terminates the process with exit status 1 if the
 * cache cannot be allocated. Compiles to an empty function unless
 * USE_CACHING is defined.
 */
void cuda_cache_test_init(cuda_dev_t *cuda_gpu)
{
#if defined(USE_CACHING)
	int i;
	cuda_kernel_cache_t *cache = 
		(cuda_kernel_cache_t*) pthread_getspecific(cuda_gpu->cache_key);

	if (cache == NULL)
	{
		cache = (cuda_kernel_cache_t*) malloc(sizeof(cuda_kernel_cache_t));
		/* Same policy as the other allocations in this file: fail loudly
		 * instead of dereferencing a NULL pointer below. */
		if (cache == NULL)
		{
			fprintf(stderr, "[cuda_cache_test_init]: allocation failed\n");
			exit(1);
		}
		cache->nkernels = 0;
		for (i = 0; i < CUDA_CACHE_SIZE; i++)
		{
			cache->kernels[i].name = NULL;
			cache->kernels[i].nargs = 0;
		}
		pthread_setspecific(cuda_gpu->cache_key, cache);
	}
#endif
}


/**
 * Tests if a thread-specific kernel cache is set. If so,
 * it frees it and clears the thread-specific slot.
 *
 * Clearing the slot matters: otherwise the key keeps pointing to freed
 * memory, so a later cuda_cache_test_init() would treat the stale
 * pointer as a live cache and a second call to this function would free
 * it twice. Compiles to an empty function unless USE_CACHING is defined.
 *
 * NOTE(review): only the cache structure itself is released here; the
 * strings owned by the cached kernels (name, filename, libdevpart) are
 * not freed by this function - confirm whether they are released
 * elsewhere.
 */
void cuda_cache_test_destroy(cuda_dev_t *cuda_gpu)
{
#if defined(USE_CACHING)
	cuda_kernel_cache_t *cache = 
		(cuda_kernel_cache_t*) pthread_getspecific(cuda_gpu->cache_key);

	if (cache != NULL)
	{
		free(cache);
		/* Do not leave a dangling pointer behind in the thread-specific slot */
		pthread_setspecific(cuda_gpu->cache_key, NULL);
	}
#endif
}

/**
 * Copies `bytes` bytes from `src` to the global symbol named `symbol`
 * of the module currently loaded for `cuda_gpu` (used for symbols that
 * reside in CUDA constant memory).
 *
 * The symbol address is resolved with cuModuleGetGlobal() and the copy
 * is issued with cuMemcpyHtoDAsync() on the stream returned by
 * cuda_stream_get() for `cuda_gpu`.
 */
void cuda_memcpy_const(cuda_dev_t *cuda_gpu, char *symbol, void *src, size_t bytes)
{
	CUstream *copy_stream = cuda_stream_get(cuda_gpu);
	CUdeviceptr symbol_addr;

	/* The symbol size is not needed, hence the NULL second argument */
	cuda_do(cuModuleGetGlobal(&symbol_addr, NULL, cuda_gpu->module, symbol));
	cuda_do(cuMemcpyHtoDAsync(symbol_addr, src, bytes, *copy_stream));
}


/* 
 * Retrieves a CUDA kernel from the kernel cache,
 * otherwise creates & caches it.
 *
 * With USE_CACHING the calling thread's cache is searched for an entry
 * whose host function equals `host_func`. On a miss the slot
 * `nkernels % CUDA_CACHE_SIZE` is (re)used for the new kernel. Without
 * USE_CACHING the single kernel descriptor of `cuda_gpu` is always
 * re-created.
 *
 * Parameters:
 *   cuda_gpu                 GPU the kernel is created for
 *   host_func                host function that identifies the kernel
 *   kernel_sources_filename  kernel sources; used only with bundled sources
 *   kernel_binary_filename   kernel binary/ptx file (or bundled entry name)
 *   kernel_name              name of the kernel function inside the module
 *
 * Returns the id of the kernel (its cache slot; 0 without USE_CACHING).
 * Terminates the process on allocation failures, when a bundled kernel
 * cannot be found or when the ptx file cannot be read.
 */
int cuda_kernel_cached_create(cuda_dev_t *cuda_gpu, void *(*host_func)(void *), 
	                          char *kernel_sources_filename, char *kernel_binary_filename, 
                              char *kernel_name)
{
	int i, nkernels, kernel_id = 0;
	bool cached = false;
	cuda_kernel_t *kernel;
	bubin_t *entry = NULL;
	bundling_e bundlingtype = ort_bundling_type();
	
#if defined(USE_CACHING)
	cuda_kernel_cache_t *cache = 
		(cuda_kernel_cache_t*) pthread_getspecific(cuda_gpu->cache_key);
	nkernels = cache->nkernels;

	/* We suppose that the kernel is not cached, thus
	given a new id */
	kernel_id = nkernels % CUDA_CACHE_SIZE;

	if (nkernels <= CUDA_CACHE_SIZE)
	{
		/* The cache has not wrapped around yet: slots 0..nkernels-1 are in use */
		for (i = 0; i < nkernels; i++)
		{
			if (cache->kernels[i].host_func == host_func)
			{
				kernel_id = i;
				cached = true;
			}
		}
	}
	else
	{
		/* The cache has wrapped around: visit every slot, newest first */
		for (i = 1; i <= CUDA_CACHE_SIZE; i++)
		{
			if (cache->kernels[MODSUB(nkernels, i)].host_func == host_func)
			{
				kernel_id = MODSUB(nkernels, i);
				cached = true;
			}
		}
	}
		
	kernel = (cuda_kernel_t*) &(cache->kernels[kernel_id]);
#else
	kernel = cuda_gpu->kernel;
#endif

	if (!cached)
	{
		kernel->nargs = 0;
		kernel->name = strdup(kernel_name);
		kernel->filename = strdup(kernel_binary_filename);
		kernel->host_func = host_func;
		kernel->libdevpart = (char *) malloc(256); // TODO memory leak
		/* strdup() can fail just like malloc(); both strings are
		 * dereferenced later on, so check all three allocations. */
		if (kernel->name == NULL || kernel->filename == NULL 
		    || kernel->libdevpart == NULL)
		{
			fprintf(stderr, "[cuda_kernel_cached_create]: allocation failed\n");
			exit(1);
		}
#if defined(USE_CACHING)
		cache->nkernels++;
#endif
		/* Multiple scenarios here:
		 * (a) No kernel bundling is used, JIT compilation is enabled:
		 *     ptx kernels are loaded from disk, stored to strings
		 *     linked with libdevpart and loaded w/ cuModuleLoadData
		 * (b) No kernel bundling is used, kernels are .fatbins:
		 *     fatbin kernels are loaded with cuModuleLoad.
		 * (c) kernel bundling=SOURCES is used, JIT compilation is enabled:
		 *     ompicc hasn't produced any kernels, thus kernel sources are
		 *     created from within the executable and compiled to ptx on-the-fly.
		 *     Later they are linked with libdevpart and loaded w/ cuModuleLoadData.
		 * (d) kernel bundling=BINARIES is used, JIT compilation is enabled:
		 *     ptx kernels are loaded from within the executable and later
		 *     linked with libdevpart and loaded w/ cuModuleLoadData.
		 * (e) kernel bundling=SOURCES, kernels are .fatbins:
		 *     Again, kernel sources are created from within the executable and
		 *     compiled to fatbins on the fly and loaded w/ cuModuleLoad.
		 * (f) kernel bundling=BINARIES, kernels are .fatbins:
		 *     fatbin kernels are loaded from within the executable and loaded
		 *     w/ cuModuleLoadData.
		 */
		if (bundlingtype == BUNDLE_BINS)
		{
			if ((entry = ort_bubins_search(kernel->filename)) == NULL)
			{
				fprintf(stderr, "Could not retrieve bundled kernel %s; exiting.\n",
				                kernel->filename);
				exit(EXIT_FAILURE);
			}
		}
		if (bundlingtype == BUNDLE_SRCS)
			ort_bubins_unbundle_and_compile(kernel_sources_filename);

#if defined(CUDA_JIT_COMPILATION)
		char *ptx_source;
		unsigned int ptx_size;
		
		if (bundlingtype == BUNDLE_BINS)
		{
			ptx_size = entry->size;
			ptx_source = malloc(entry->size + 1);
			if (ptx_source == NULL)
			{
				fprintf(stderr, "[cuda_kernel_cached_create]: PTX allocation failed\n");
				exit(1);
			}
			memcpy(ptx_source, (char *) entry->data, entry->size);
			ptx_source[entry->size] = '\0'; // null-terminate it
	
			ptx_read_libdevpart_comment(ptx_source, &(kernel->libdevpart));
		}
		else
		{
			/* Read ptx file and store it to a str */
			ptx_source = ptx_read(kernel->filename, &(kernel->libdevpart));

			/* Must be tested BEFORE strlen(): a failed read would
			 * otherwise be dereferenced. */
			if (ptx_source == NULL)
			{
				fprintf(stderr, 
					"[%s] ptx_compile: error: could not read ptx file; exiting.\n",
					modulename);
				exit(1);
			}
			ptx_size = strlen(ptx_source);
		}
	
		ptx_compile_and_load(ptx_source, ptx_size, &(cuda_gpu->module), kernel);
#else
		if (bundlingtype == BUNDLE_BINS)
			cuda_do_crit(cuModuleLoadData(&(cuda_gpu->module),entry->data), cuda_gpu);
		else
			cuda_do_crit(cuModuleLoad(&(cuda_gpu->module),kernel->filename),cuda_gpu);
		cuda_do_crit(cuModuleGetFunction(&(kernel->function), cuda_gpu->module, 
				kernel->name), cuda_gpu); 
#endif
	}   /* if (!cached) */

	return kernel_id;
}

/**
 * Prepares CUDA kernel arguments for a GPU
 *
 * Builds the argument vector of the kernel `kernel_id` and stores it in
 * the kernel descriptor (`args`, `nargs`). Every argument is a small
 * heap copy of the value to pass:
 *   - argument 0 is the device address of the device data struct
 *     (a copy of `devdata` on the device, or a 1-byte dummy allocation
 *     when `devdata` is NULL);
 *   - then num_args[ARGS_NUMDECL] declare-target values taken from `args`;
 *   - then, only when `devdata` is not NULL, num_args[ARGS_NUMFIP]
 *     firstprivate values followed by num_args[ARGS_NUMMAPPED] values
 *     that alternate between an address (even positions) and an offset
 *     (odd positions).
 *
 * All arguments are staged in a fixed-size local array first, so the
 * argument counts are validated against its capacity before anything is
 * written. Terminates the process with exit status 1 on an invalid
 * kernel id, invalid/too large argument counts or allocation failures.
 *
 * NOTE(review): while handling the mapped values one more element of
 * `args` is read than is stored as an argument (the value fetched in the
 * last iteration is never used) - confirm that callers size `args`
 * accordingly.
 */ 
void cuda_kernel_set_args(cuda_dev_t *cuda_gpu, void *devdata, size_t devdata_size,
                          int kernel_id, int *num_args, void **args) 
{
	void *new_kernel_args[256];  /* Array used for storing kernel args temporarily */
	const int max_args = (int) (sizeof(new_kernel_args) / sizeof(new_kernel_args[0]));
	void *ptr_var;
	unsigned long offset_var=0;
	int ndeclargs, nfirstprivate = 0, nmapped = 0, i, argoffset;
	CUdeviceptr *device_struct;
	cuda_kernel_t *kernel;
	int off = 0;

#if defined(USE_CACHING)
	cuda_kernel_cache_t *cache = 
		(cuda_kernel_cache_t*) pthread_getspecific(cuda_gpu->cache_key);

	/* Valid ids are cache slots that have been handed out already */
	if (kernel_id < 0 || kernel_id >= CUDA_CACHE_SIZE 
	    || kernel_id >= cache->nkernels)
	{
		fprintf(stderr, "%s: error: invalid kernel ID", __FUNCTION__);
		exit(1);
	}
	kernel = (cuda_kernel_t*) &(cache->kernels[kernel_id]);
#else
	kernel = cuda_gpu->kernel;
#endif

	/* Argument counts; the firstprivate/mapped ones only matter (and are
	 * only read) when there is a device data struct. */
	ndeclargs = num_args[ARGS_NUMDECL];
	if (devdata != NULL)
	{
		nfirstprivate = num_args[ARGS_NUMFIP];
		nmapped = num_args[ARGS_NUMMAPPED];
	}

	/* The staging array lives on the stack: refuse counts that would
	 * index outside of it (slot 0 is reserved for the device struct).
	 * Each count is bounded first so that the sum cannot overflow. */
	if (ndeclargs < 0 || nfirstprivate < 0 || nmapped < 0 
	    || ndeclargs >= max_args || nfirstprivate >= max_args 
	    || nmapped >= max_args
	    || ndeclargs + nfirstprivate + nmapped + 1 > max_args)
	{
		fprintf(stderr, 
			"[cuda_kernel_set_args]: invalid number of kernel arguments "
			"(at most %d are supported)\n", max_args);
		exit(1);
	}

	device_struct = (CUdeviceptr*) malloc(sizeof(CUdeviceptr));
	if (device_struct == NULL)
	{
		fprintf(stderr, "[cuda_kernel_set_args]: allocation failed\n");
		exit(1);
	}

	/* Handle target declare arguments */
	for (i = 0; i < ndeclargs; i++, kernel->nargs++)
	{
		ptr_var = (void*) args[off++];
		new_kernel_args[i+1] = 
			(void*) malloc_memcpy(&ptr_var, sizeof(CUdeviceptr));
	}
	
	if (devdata != NULL)
	{
		/* Allocate memory for dev data struct and copy it to device */
		cuda_memAlloc_copyHtoD(devdata, &device_struct, devdata_size, cuda_gpu);

		/* Handle firstprivate arguments */
		for (i = ndeclargs; i < ndeclargs + nfirstprivate; i++, kernel->nargs++)
		{
			ptr_var = (void *) args[off++];
			new_kernel_args[i+1] = 
				(void*) malloc_memcpy(&ptr_var, sizeof(CUdeviceptr));
		}

		/* Prepare the rest of the kernel arguments */
		ptr_var = (void *) args[off++];
		for (i = 0; i < nmapped; i++, kernel->nargs++)
		{
			argoffset = ndeclargs + nfirstprivate + i + 1;
			if ((i % 2) == 1)
			{
				/* As far as offsets are concerned, we just need
				to pass them by value. Allocating memory on host and 
				copying the var arg is the minimum we can do here. */
				new_kernel_args[argoffset] = (void*) malloc_memcpy(&offset_var, 
					sizeof(unsigned long));
				ptr_var = (void*) args[off++];
			}
			else
			{
				/* Memory for the kernel arguments has already been allocated 
				in the device. ptr_var is a mediary address, however keeping
				a copy on host is necessary for offloading. */
				new_kernel_args[argoffset] = (void*) malloc_memcpy(&ptr_var, 
					sizeof(CUdeviceptr));
				offset_var = (unsigned long) args[off++];
			}
		}
	}
	else 
	{
		/* If there was no dev data struct, allocate 1 byte of memory
		and pass it to the kernel, as kernels do not allow NULL
		parameters. */
		cuda_do(cuMemAlloc(device_struct, 1));
	}

	new_kernel_args[0] = (void*) device_struct;
	kernel->nargs++;

	/* Copy new arguments */
	kernel->args = (void**) malloc(kernel->nargs*sizeof(void*));
	if (kernel->args == NULL)
	{
		fprintf(stderr, "[cuda_kernel_set_args]: kernel args allocation failed\n");
		exit(1);
	}

	for (i = 0; i < kernel->nargs; i++)
	{
		kernel->args[i] = new_kernel_args[i];
		new_kernel_args[i] = NULL;
	}
}

/* 
 * Launches a CUDA kernel
 *
 * Derives the grid dimensions from `num_teams`/`teamdims` and the block
 * dimensions from `num_threads`/`thrdims`, launches the kernel
 * `kernel_id` with the arguments prepared in its descriptor, waits for
 * the context to become idle and finally releases the host copies of
 * the kernel arguments. `thread_limit` is only reported in the debug
 * output.
 *
 * Returns 0 on success and 1 when `kernel_id` is not a valid kernel id.
 * A failed launch or a failure reported by the synchronization (e.g. a
 * fault raised while the kernel was executing) is handled through
 * cuda_do_crit(), like every other critical CUDA call in this file.
 *
 * NOTE(review): only the host-side copies of the arguments are freed
 * here; the device allocation whose address is stored in argument 0 is
 * not released by this function - confirm it is freed elsewhere.
 */
int cuda_kernel_launch(cuda_dev_t *cuda_gpu, int kernel_id, int num_teams,
                       int num_threads, unsigned long long teamdims, 
                       unsigned long long thrdims, int thread_limit)
{
	int i;
	cuda_kernel_t *kernel;
	cuda_dim_t *grid_dims, *block_dims;

#if defined(USE_CACHING)
	cuda_kernel_cache_t *cache = 
		(cuda_kernel_cache_t*) pthread_getspecific(cuda_gpu->cache_key);

	/* Valid ids are cache slots that have been handed out already */
	if (kernel_id < 0 || kernel_id >= CUDA_CACHE_SIZE 
	    || kernel_id >= cache->nkernels)
	{
		fprintf(stderr, "%s: error: invalid kernel ID", __FUNCTION__);
		return 1;
	}
	
	kernel = (cuda_kernel_t*) &(cache->kernels[kernel_id]);
#else
	kernel = cuda_gpu->kernel;
#endif

	grid_dims = &(kernel->grid_dims);
	block_dims = &(kernel->block_dims);
	
	set_dims(grid_dims, num_teams, teamdims);
	set_dims(block_dims, num_threads, thrdims);
 
	DBGPRN((stderr, "%s: num_teams %d - num_threads %d - thread_limit %d\n", 
	                __FUNCTION__, num_teams, num_threads, thread_limit));

	DBGPRN((stderr, "%s: grid size: %d x %d x %d\n", 
	                __FUNCTION__, grid_dims->X, grid_dims->Y, grid_dims->Z));

	DBGPRN((stderr, "%s: block size: %d x %d x %d\n", 
	                __FUNCTION__, block_dims->X, block_dims->Y, block_dims->Z));
		
	/* Launch kernel with specified grid and block dimensions */
#if 0
	if (cuda_gpu->device_sm_version_major >= 6)
		cuda_do_crit(cuLaunchCooperativeKernel(kernel->function, 
	             grid_dims->X, grid_dims->Y, grid_dims->Z,
	             block_dims->X, block_dims->Y, block_dims->Z,
	             CUDA_SHMEM_SIZE, 0, kernel->args), cuda_gpu);
	else
#else 
		cuda_do_crit(cuLaunchKernel(kernel->function, 
		             grid_dims->X, grid_dims->Y, grid_dims->Z,
		             block_dims->X, block_dims->Y, block_dims->Z,
		             CUDA_SHMEM_SIZE, 0, kernel->args, 0), cuda_gpu);
#endif

	/* Errors raised while the kernel executes are only reported by the
	 * next synchronizing call, so its result must not be dropped. */
	cuda_do_crit(cuCtxSynchronize(), cuda_gpu);

	for (i = 0; i < kernel->nargs; i++)
		free(kernel->args[i]);
	free(kernel->args);
	kernel->args = NULL;   /* do not keep a pointer to freed memory */
	kernel->nargs = 0;

	cuda_gpu->num_launched_kernels++;
	return 0;
}

#if defined(DEBUG)
#undef DEBUG
#endif
