/*
  OMPi OpenMP Compiler
  == Copyright since 2001 the OMPi Team
  == Dept. of Computer Science & Engineering, University of Ioannina

  This file is part of OMPi.

  OMPi is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or
  (at your option) any later version.

  OMPi is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with OMPi; if not, write to the Free Software
  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/

/* Define for debugging */
//#define DBGPRN_FORCE
//#define ORT_DEBUG_LEVEL 2

#include <string.h>
#include <assert.h>
#include <errno.h>
#include "oclgpudev.h"

#if ORT_DEBUG_LEVEL > 0
	#include <sys/time.h>
	#define SECPASSED(t2,t1) \
            (((t2).tv_sec - (t1).tv_sec) + ((t2).tv_usec - (t1).tv_usec)*1.0e-6)
#endif

/* Name of this module: set (through strdup) by hm_initialize() and released
 * by hm_finalize(); NULL while the module is not initialized. */
char *modulename = NULL;
int  hm_unified_medaddr = 1;   /* Internal/usable mediary addresses identical */
/* Global OpenMP id of the first device of this module; overwritten by
 * hm_initialize() with the value handed in by the caller. */
static int _global_omp_id_of_first_device = 400;


/* Pointers to functions of the host runtime.
 * init_lock, lock and unlock are registered by hm_initialize();
 * strprintf is registered by hm_register_str_printf().
 */
void (*init_lock)(void **lock, int type);
void (*lock)(void **lock);
void (*unlock)(void **lock);
int  (*strprintf)(str s, char *fmt, ...);


/**
 * Reports how many devices this module can offer.
 *
 * Runs oclgd_init(false) first and then reads the ocl_num_gpus counter.
 *
 * @return number of devices
 */
int hm_get_num_devices(void)
{
	int ndevices;

	oclgd_init(false);
	ndevices = ocl_num_gpus;
	return ndevices;
}


/**
 * Appends vendor-specific details of an OpenCL device to an info string.
 *
 * The extension string of the device is queried; depending on which of the
 * Intel / AMD / NVIDIA "device attribute query" extensions it mentions, the
 * corresponding vendor attributes are queried and printed through strprintf.
 * Nothing is printed if the extension string cannot be obtained.
 *
 * @param cldev the OpenCL device to query
 * @param info  the string the details are appended to
 */
void _opencl_print_device_extensions(cl_device_id cldev, str info)
{
	size_t  retsize = 0;
	char    *query;
	
	/* First ask for the size of the extension string, then for the string */
	if (clGetDeviceInfo(cldev, CL_DEVICE_EXTENSIONS, 0, NULL, &retsize)
	      != CL_SUCCESS || retsize == 0)
		return;
	if ((query = malloc(retsize + 1)) == NULL)
		return;
	if (clGetDeviceInfo(cldev, CL_DEVICE_EXTENSIONS, retsize, query, NULL)
	      != CL_SUCCESS)
	{
		free(query);
		return;
	}
	query[retsize] = '\0';   /* make sure strstr() sees a terminated string */
	
	/* Extra info on Intel GPUs */
#ifndef CL_DEVICE_NUM_SLICES_INTEL
	#define CL_DEVICE_NUM_SLICES_INTEL 0x4252
#endif
#ifndef CL_DEVICE_NUM_SUB_SLICES_PER_SLICE_INTEL
	#define CL_DEVICE_NUM_SUB_SLICES_PER_SLICE_INTEL 0x4253
#endif
#ifndef CL_DEVICE_NUM_EUS_PER_SUB_SLICE_INTEL
	#define CL_DEVICE_NUM_EUS_PER_SUB_SLICE_INTEL 0x4254
#endif
#ifndef CL_DEVICE_NUM_THREADS_PER_EU_INTEL
	#define CL_DEVICE_NUM_THREADS_PER_EU_INTEL 0x4255
#endif
	if (strstr(query, "cl_intel_device_attribute_query"))
	{
		cl_uint s;
		strprintf(info, "\n  Intel GPU details:\n");
		if (clGetDeviceInfo(cldev, CL_DEVICE_NUM_SLICES_INTEL,
		                    sizeof(cl_uint), &s, NULL) == CL_SUCCESS)
			strprintf(info, "    | %u slice(s)\n", s);
		if (clGetDeviceInfo(cldev, CL_DEVICE_NUM_SUB_SLICES_PER_SLICE_INTEL,
		                    sizeof(cl_uint), &s, NULL) == CL_SUCCESS)
			strprintf(info, "    | %u subslice(s) per slice (max)\n", s);
		if (clGetDeviceInfo(cldev, CL_DEVICE_NUM_EUS_PER_SUB_SLICE_INTEL,
		                    sizeof(cl_uint), &s, NULL) == CL_SUCCESS)
			strprintf(info, "    | %u EUs per subslice (max)\n", s);
		if (clGetDeviceInfo(cldev, CL_DEVICE_NUM_THREADS_PER_EU_INTEL,
		                    sizeof(cl_uint), &s, NULL) == CL_SUCCESS)
			strprintf(info, "    | %u threads per EU (max)\n", s);
	}

/* Extra info on AMD GPUs */
#ifndef CL_DEVICE_BOARD_NAME_AMD
	#define CL_DEVICE_BOARD_NAME_AMD 0x4038
#endif
#ifndef CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD
	#define CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD 0x4040
#endif
#ifndef CL_DEVICE_SIMD_WIDTH_AMD
	#define CL_DEVICE_SIMD_WIDTH_AMD 0x4041
#endif
#ifndef CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD
	#define CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD 0x4042
#endif
#ifndef CL_DEVICE_WAVEFRONT_WIDTH_AMD
	#define CL_DEVICE_WAVEFRONT_WIDTH_AMD 0x4043
#endif
#ifndef CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD
	#define CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD 0x4047
#endif
#ifndef CL_DEVICE_PREFERRED_WORK_GROUP_SIZE_AMD
	#define CL_DEVICE_PREFERRED_WORK_GROUP_SIZE_AMD 0x4030
#endif
#ifndef CL_DEVICE_MAX_WORK_GROUP_SIZE_AMD
	#define CL_DEVICE_MAX_WORK_GROUP_SIZE_AMD 0x4031
#endif

	if (strstr(query, "cl_amd_device_attribute_query"))
	{
		cl_uint s;
		char bn[129];

		strprintf(info, "\n  AMD GPU details:\n");
		if (clGetDeviceInfo(cldev, CL_DEVICE_BOARD_NAME_AMD,
		                    sizeof(bn), &bn, NULL) == CL_SUCCESS)
			strprintf(info, "    | Board name: %s\n", bn);
		if (clGetDeviceInfo(cldev, CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD,
		                    sizeof(cl_uint), &s, NULL) == CL_SUCCESS)
			strprintf(info, "    | %u SIMD unit(s) per compute unit (CU)\n", s);
		if (clGetDeviceInfo(cldev, CL_DEVICE_SIMD_WIDTH_AMD,
		                    sizeof(cl_uint), &s, NULL) == CL_SUCCESS)
			strprintf(info, "    | %u concurrent instructions per SIMD unit\n", s);
		if (clGetDeviceInfo(cldev, CL_DEVICE_WAVEFRONT_WIDTH_AMD,
		                    sizeof(cl_uint), &s, NULL) == CL_SUCCESS)
			strprintf(info, "    | %u workitems per wavefront\n", s);
		if (clGetDeviceInfo(cldev, CL_DEVICE_PREFERRED_WORK_GROUP_SIZE_AMD,
		                    sizeof(cl_uint), &s, NULL) == CL_SUCCESS)
			strprintf(info, "    | %u is the preferred workgroup size\n", s);
	}

	/* Extra info on NVIDIA GPUs */
#ifndef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
	#define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000
#endif
#ifndef CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV
	#define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 0x4001
#endif
#ifndef CL_DEVICE_REGISTERS_PER_BLOCK_NV
	#define CL_DEVICE_REGISTERS_PER_BLOCK_NV 0x4002
#endif
#ifndef CL_DEVICE_WARP_SIZE_NV
	#define CL_DEVICE_WARP_SIZE_NV 0x4003
#endif

	if (strstr(query, "cl_nv_device_attribute_query"))
	{
		cl_uint s, t;

		strprintf(info, "\n  NVIDIA GPU details:\n");
		/* Compute capability is printed only if both halves were obtained */
		if (clGetDeviceInfo(cldev, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,
		                    sizeof(cl_uint), &s, NULL) == CL_SUCCESS &&
		    clGetDeviceInfo(cldev, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,
		                    sizeof(cl_uint), &t, NULL) == CL_SUCCESS)
			strprintf(info, "    | %u.%u compute capability\n", s, t);
		if (clGetDeviceInfo(cldev, CL_DEVICE_WARP_SIZE_NV,
		                    sizeof(cl_uint), &s, NULL) == CL_SUCCESS)
			strprintf(info, "    | %u warp size\n", s);
		if (clGetDeviceInfo(cldev, CL_DEVICE_REGISTERS_PER_BLOCK_NV,
		                    sizeof(cl_uint), &s, NULL) == CL_SUCCESS)
			strprintf(info, "    | %u registers per block\n", s);
	}

	free(query);
}


/**
 * Returns information for this module and the available devices
 *
 * All devices are initialized through hm_dev_init(), their properties are
 * appended to `info' through strprintf, and each device is finalized again
 * with hm_dev_end(); oclgd_finish() is called at the end. A device that
 * did not reach the DEVICE_INITIALIZED state is reported as such instead
 * of printing its (unprepared) properties.
 *
 * @param devid_offset the id of the first device available from this module
 * @param info         the string the information is appended to
 */
void  hm_print_information(int devid_offset, str info)
{
	int i, j;
	ocl_gpu_t *dev;
	void *hm_dev_init(int, ort_icvs_t *, int *);
	void hm_dev_end(void *);

	oclgd_init(false);
	for (i = 0; i < ocl_num_gpus; i++)
		hm_dev_init(i, NULL, NULL);

	strprintf(info, "OMPi module for OpenCL devices\n");
	strprintf(info, "Available devices : %d\n\n", ocl_num_gpus);
	for (i = 0; i < ocl_num_gpus; i++)
	{
		dev = oclgd_get(i);
		strprintf(info, "device id < %d > { \n", devid_offset + i);

		/* hm_dev_init() marks a device as initialized only on success */
		if (dev == NULL || dev->status != DEVICE_INITIALIZED)
		{
			strprintf(info, "  (device could not be initialized)\n");
			strprintf(info, "}%s", i == ocl_num_gpus-1 ? "\n" : "\n\n");
			continue;
		}

		strprintf(info, "  GPU device name        : %s\n",
		        dev->device_name);
		strprintf(info, "  GPU device vendor      : %s\n",
		        dev->device_vendor);
		strprintf(info, "  Vendor device driver   : %s\n",
		        dev->device_driver_version);
		strprintf(info, "  OpenCL ver. (platform) : %d.%d\n",
		        VERMAJ(dev->pver), VERMIN(dev->pver));
		strprintf(info, "  OpenCL ver. (device)   : %d.%d\n",
		        VERMAJ(dev->dver), VERMIN(dev->dver));
		strprintf(info, "  OpenCL ver. (compiler) : ");
		for (j = 0; j < dev->num_cvers; j++)
			strprintf(info, "%d.%d%s", VERMAJ(dev->cvers[j]), VERMIN(dev->cvers[j]),
			                           (j == dev->num_cvers-1) ? "\n" : ", ");
		strprintf(info, "  Num of compute units   : %d\n",
		        dev->device_max_computeinits);
		strprintf(info, "  Max workgroup size     : %d\n",
		        dev->device_team_max_threads);
		strprintf(info, "  Local memory           : %ld KBytes\n",
		        (dev->device_local_mem_size) / 1024);
		strprintf(info, "  Global memory          : %.1lf GBytes\n",
		        ((double) dev->device_global_mem_size) / (1024*1024*1024));
		strprintf(info, "  Coarse grain SVM       : %s\n",
		        dev->svm_coarse ? "yes" : "no");
		strprintf(info, "  Fine grain buffer SVM  : %s\n",
		        dev->svm_fine_buffer ? "yes" : "no");
		strprintf(info, "  Fine grain system SVM  : %s\n",
		        dev->svm_fine_system ? "yes" : "no");
		strprintf(info, "  Unified host memory    : %s\n",
		        dev->unified_memory_12 ? "yes" : "no"); 
		strprintf(info, "  Supports \"double\" type : %s\n",
		        dev->doublesupport ? "yes" : "no");
		/*
		strprintf(info, "  Workgroup collectives  : %s\n",
		        dev->haswgcollectives ? "yes" : "no");
		 */
		_opencl_print_device_extensions(dev->cldevice, info);
		strprintf(info, "}%s", i == ocl_num_gpus-1 ? "\n" : "\n\n");

		hm_dev_end(dev);
	}

	oclgd_finish();
}


/**
 * Registers the host runtime function used for printing to a string.
 *
 * The pointer is stored in the module-wide `strprintf' variable, through
 * which the information-printing functions of this module emit their output.
 *
 * @param str_printf_in pointer to the function used for printing to a string
 */
void hm_register_str_printf(int  (*str_printf_in)(str s, char *fmt, ...))
{
	strprintf = str_printf_in;
}


/**
 * Initializes the OpenCL module
 *
 * Stores the host runtime lock functions and the global id of the first
 * device, keeps a private copy of the module name ("noname" if none is
 * given) and finally calls oclgd_init(true).
 *
 * @param modname the name of the module (may be NULL)
 * @param global_omp_id_of_first_device the global OpenMP ID of the first device
 * @param init_lock_in     pointer to the function used for initializing a
 *                         lock. Its parameters are the address of a "void *"
 *                         variable and one of the "ORT_LOCK_*" defines
 *                         denoting the type of the lock
 * @param lock_in          pointer to the function used for acquiring a lock
 * @param unlock_in        pointer to the function used for releasing a lock
 * @param hyield_in_ignore pointer to the function used for thread yield;
 *                         not used by this module
 *
 * @return                 the value of ocl_num_gpus after initialization
 */
int hm_initialize(char *modname, int global_omp_id_of_first_device,
                  void (*init_lock_in)(void **lock, int type),
                  void (*lock_in)(void **lock),
                  void (*unlock_in)(void **lock),
                  int  (*hyield_in_ignore)(void))
{
	const char *name = "noname";

	/* Hook up the lock primitives of the host runtime */
	unlock    = unlock_in;
	lock      = lock_in;
	init_lock = init_lock_in;

	/* Remember where this module's devices start in the global numbering */
	_global_omp_id_of_first_device = global_omp_id_of_first_device;

	/* Keep our own copy of the module name */
	if (modname != NULL)
		name = modname;
	modulename = strdup(name);

	/* Bring up OpenCL and report what was found */
	oclgd_init(true);
	return ocl_num_gpus;
}


/**
 * Finalizes the OpenCL module
 *
 * Every device is passed to hm_dev_end(), then oclgd_finish() is called and
 * the copy of the module name is released.
 */
void hm_finalize(void)
{
	void hm_dev_end(void *);
	int  devno = 0;

	/* Shut down the devices one by one */
	while (devno < ocl_num_gpus)
	{
		hm_dev_end(oclgd_get(devno));
		devno++;
	}

	/* Shut down OpenCL itself */
	oclgd_finish();

	/* Drop the module name; free() accepts NULL so no guard is needed */
	free(modulename);
	modulename = NULL;
}


/**
 * Initializes a device
 *
 * A device that is already marked DEVICE_INITIALIZED is handed back as-is,
 * without being prepared a second time.
 *
 * @param dev_num     the (local) id of the device to initialize
 *                    (0 <= dev_num < hm_get_num_devices())
 * @param ort_icv     pointer to a struct with initial values for the device
 *                    ICVs; may be NULL (hm_print_information() does so), in
 *                    which case the device ICVs are not touched.
 * @param sharedspace (ret) if non-NULL it receives 0, since for now no
 *                    device is treated as sharing the host address space
 *
 * @return device_info: arbitrary pointer that will be passed back in
 *         following calls (see below).
 *         NULL is returned only if the device failed to initialize.
 */
void *hm_dev_init(int dev_num, ort_icvs_t *ort_icv, int *sharedspace)
{
	ocl_gpu_t      *gpu;
	cl_platform_id cl_plat;
	cl_device_id   cl_dev;

#if ORT_DEBUG_LEVEL > 0
	struct timeval tstart, tfound, tready;
	DBGPRN((stderr, "[opencl module] : %s\n", __func__));
	gettimeofday(&tstart, NULL);
#endif

	oclgd_init(true);
	if (ocl_discover_ith_gpu(dev_num, &cl_dev, &cl_plat) != 0)
		return NULL;

#if ORT_DEBUG_LEVEL > 0
	gettimeofday(&tfound, NULL);
	DBGPRN((stderr, "\t>> oclgd_init+discover (%.3lf sec)\n", 
	        SECPASSED(tfound,tstart)));
#endif

	gpu = oclgd_get(dev_num);
	if (gpu->status != DEVICE_INITIALIZED)
	{
		gpu->devid = dev_num;
		if (oclgd_prepare(gpu, cl_dev, cl_plat) != 0)
			return NULL;

		/* ICVs are absent only when called from hm_print_information */
		if (ort_icv != NULL)
			gpu->dev_icvs = *ort_icv;

		/* For now, no device shares the host address space */
		gpu->sharedspace = 0;
		if (sharedspace != NULL)
			*sharedspace = 0;

#if ORT_DEBUG_LEVEL > 0
		gettimeofday(&tready, NULL);
		DBGPRN((stderr, "\t>> oclgd_prepare (%.3lf sec)\n", 
		        SECPASSED(tready,tfound)));
#endif

		gpu->status = DEVICE_INITIALIZED;
	}

	return gpu;
}


/**
 * Finalizes a device
 *
 * Devices that are not marked DEVICE_INITIALIZED are left alone, so calling
 * this more than once for the same device is harmless.
 *
 * @param device_info the device (the pointer returned by hm_dev_init())
 */
void hm_dev_end(void *device_info)
{
	ocl_gpu_t *gpu = (ocl_gpu_t *) device_info;

	if (gpu->status == DEVICE_INITIALIZED)
	{
		/* Flip the state first, then close the device */
		gpu->status = DEVICE_UNINITIALIZED;
		olcgd_close(gpu);
	}
}


/**
 * Builds the buffer that carries extra information to the devpart runtime.
 *
 * Besides the regular kernel arguments, some extra values are needed only by
 * the devpart runtime. They are stored as an array of integers in a buffer
 * which is passed to the kernel along with the other arguments.
 *
 * Ideally the buffer would live in constant memory: initializing it from the
 * host and enqueueing a write, as well as a ready-made buffer created with
 * clCreateBuffer(dev->context, CL_MEM_READ_ONLY|CL_MEM_HOST_NO_ACCESS|
 * CL_MEM_COPY_HOST_PTR, size, buffer, &err) were both tried; nothing worked
 * on recent NVIDIA GPUs, so global memory is used instead. It seems that
 * NVIDIA constant memory works only when initialized within a kernel.
 *
 * @param dev    the device whose context owns the buffer
 * @param devid  stored in the first integer of the buffer
 * @param thrlim stored in the second integer of the buffer
 *
 * @return the buffer, or NULL (after printing an error) if it could not be
 *         created
 */
cl_mem _pass_xtra_info(ocl_gpu_t *dev, int devid, int thrlim)
{
	cl_int words[32] = { 0 };   /* 32 integers; unused slots stay zero */
	cl_int rc;
	cl_mem mem;

	words[0] = devid;
	words[1] = thrlim;

	/* CL_MEM_READ_ONLY hints that the contents will not be changed */
	mem = clCreateBuffer(dev->context, 
	                     CL_MEM_READ_ONLY | CL_MEM_HOST_NO_ACCESS |
	                     CL_MEM_COPY_HOST_PTR, 
	                     sizeof(words), words, &rc);
	if (mem != NULL && rc == CL_SUCCESS)
		return mem;

	fprintf(stderr, "[%s]: %s failed on clCreateBuffer(): %s\n",
	        modulename, __func__, ocl_errstr(rc));
	return NULL;
}


/**
 * Offloads and executes a kernel file.
 *
 * @param device_info       the device (the pointer returned by hm_dev_init())
 * @param host_func pointer to offload function on host address space
 * @param dev_data pointer  to a struct containing kernel variables
 * @param decl_data pointer to a struct containing globally declared variables
 * @param kernel_filename   filename of the kernel (without the suffix)
 * @param max_teams         from the num_teams clause of the #teams construct
 * @param num_threads       num_threads clause from combined #parallel construct
 * @param thread_limit      thread_limit clause from #teams construct; a 
 *                          value <= 0 signifies no such clause was given.
 * @param teamdims          an unsigned long long that contains the dimensions
 *                          of the launched league, encoded as follows:
 *                          x: bits 0-20, y: bits 21-41, z: bits 42-62
 * @param thrdims           an unsigned long long that contains the
 *                          dimensions of each thread team, encoded as follows:
 *                          x: bits 0-20, y: bits 21-41, z: bits 42-62
 * @param num_args          an array that contains the number of declare vars,
 *                          firstprivates and mapped variables
 * @param args              the addresses of all target data and target
 *                          declare variables
 *
 * NOTE: `teamdims' and `thrdims' can be decoded using the _ull_decode3 function
 */
int hm_offload(void *device_info, void *(*host_func)(void *), void *dev_data,
               void *decl_data, char *kernel_filename, int max_teams,
               int num_threads, int thread_limit,
               unsigned long long teamdims, unsigned long long thrdims,
               int *num_args, void **args)
{
	ocl_gpu_t  *dev = (ocl_gpu_t *) device_info;
	ocl_program_t *oclprog;
	cl_program linkprogs[2];            // the kernel and library programs
	cl_program program;                 // the final compute program
	cl_kernel  kernel;                  // compute kernel
	cl_int     status;
	cl_mem     xtrainfo;
	char       *codesrc, *filepath, *filepath_sources, *info, flavor[64] = { 0 };
	int        hasdouble, haslonglong;
	/* TODO min_teams is missing from the hm_offload arguments.
	 * We introduce it here so as to be ready for the future
	 */
	int min_teams = max_teams;   /* ... */

#if ORT_DEBUG_LEVEL > 0
	struct timeval t1, t2, t3, t4, t5, t6, t7, t8;
	DBGPRN((stderr, "[opencl module] : %s\n", __func__));
	gettimeofday(&t1, NULL);
	t6 = t1;        /* In case we get a cached kernel */
#endif

	if ((oclprog = oclgd_get_cached_program(dev, host_func)) == NULL)
	{
		/* 1. Read the source file */
		filepath = (char *) malloc((strlen(kernel_filename) + 12) * sizeof(char));
		filepath_sources = (char *) 
		                   malloc((strlen(kernel_filename) + 11) * sizeof(char));
		filepath = strcpy(filepath, kernel_filename);
		filepath_sources = strcpy(filepath_sources, kernel_filename);
		filepath = strcat(filepath, "-opencl.out");
		filepath_sources = strcat(filepath_sources, "-opencl.cl");
		DBGPRN((stderr, "  -- kernel : %s (not cached)\n", filepath));

		switch (ort_bundling_type())
		{
			case BUNDLE_BINS:
				{
					bubin_t *b;
					if ((b = ort_bubins_search(filepath)) == NULL)
					{
						fprintf(stderr, "[%s] %s failed to read the kernel binary (%s)\n",
						        modulename, __func__, filepath);
						return 1;
					}
					codesrc = malloc(b->size + 1);
					memcpy(codesrc, (char *) b->data, b->size);
					codesrc[b->size] = '\0';
					break;
				}
			case BUNDLE_SRCS:
				ort_bubins_unbundle_and_compile(filepath_sources);
				/* fall through */
			default:    /* no bundling */
				if ((codesrc = ocl_read_src(filepath)) == NULL)
				{
					fprintf(stderr, "[%s] %s failed to read the kernel source (%s)\n",
					        modulename, __func__, filepath);
					return 1;
				}
		}
		
		if (strncmp(codesrc, "/* MALflavor: ", 14) == 0) /* Get MAL flavor */
		{
			sscanf(codesrc+14, "%63s", flavor);
			if (strcmp(flavor, "default") == 0)
				*flavor = 0;
		}
		/* Parse comments and check for illegal type usage */
		if ((info = strstr(codesrc, "$OCL_info:")) != NULL)
			sscanf(info+10, "%d,%d", &hasdouble, &haslonglong);
		if (haslonglong)
		{
			fprintf(stderr, "[%s module]: unsupported long long types in %s.\n",
			                modulename, filepath);
			return 1;
		}
		if (hasdouble && !dev->doublesupport)
		{
			fprintf(stderr, "[%s module]: %s uses 'double' type, which is\n"
			                "\t unsupported in device %s.\n",
			                modulename, filepath, dev->device_name);
			return 1;
		}

#if ORT_DEBUG_LEVEL > 0
		gettimeofday(&t2, NULL);
		DBGPRN((stderr, "\t>> kernel file loaded (%.3lf sec)\n", SECPASSED(t2,t1)));
#endif

		/* 2. Create the OpenCL program */
		if ((linkprogs[0] = clCreateProgramWithSource(dev->context, 1,
		                          (const char **) &codesrc, NULL, &status)) == NULL)
		{
			fprintf(stderr,"[%s] %s failed on clCreateProgramWithSource (%s)\n\t%s\n",
			        modulename, __func__, filepath, ocl_errstr(status));
			return 1;
		}
		free(codesrc);

#if ORT_DEBUG_LEVEL > 0
		gettimeofday(&t3, NULL);
		DBGPRN((stderr, "\t>> program created (%.3lf sec)\n", SECPASSED(t3,t2)));
#endif

		/* 3. Compile the OpenCL program */
		status = clCompileProgram(linkprogs[0], 1, &dev->cldevice,
		                          dev->svm_coarse ?
		                          "-I ./ -D__OMPI_CL_KERNEL__ -cl-std=CL2.0" :
		                           (ocl_opencl_c_majver_supported(dev, 0) == 3 ?
		                             "-I ./ -D__OMPI_CL_KERNEL__ -cl-std=CL3.0" :
		                             "-I ./ -D__OMPI_CL_KERNEL__"),
		                          0, NULL, NULL, NULL, NULL);
		if (status != CL_SUCCESS)
		{
			fprintf(stderr,"[%s] %s failed on clCompileProgram (%s):\n>> %s\n-----\n", 
			        modulename, __func__, filepath, ocl_errstr(status));
			ocl_show_clprogram_info(linkprogs[0], dev->cldevice, filepath);
			return 1;
		}

#if ORT_DEBUG_LEVEL > 0
		gettimeofday(&t4, NULL);
		DBGPRN((stderr, "\t>> program compiled (%.3lf sec)\n", SECPASSED(t4,t3)));
#endif
#if ORT_DEBUG_LEVEL > 1
		ocl_show_clprogram_info(linkprogs[0], dev->cldevice, filepath);
#endif

		/* Load ompi libraries */
		if ((linkprogs[1] = oclgd_load_devpart(dev, ModuleDir, flavor)) == NULL)
		{
			clReleaseProgram(linkprogs[0]);
			fprintf(stderr, "[%s] %s failed on oclgd_load_devpart (%s)\n",
			        modulename, __func__, filepath);
			return 1;
		}

#if ORT_DEBUG_LEVEL > 0
		gettimeofday(&t5, NULL);
		DBGPRN((stderr, "\t>> devpart loaded (%.3lf sec)\n", SECPASSED(t5,t4)));
#endif
#if ORT_DEBUG_LEVEL > 2
		ocl_show_clprogram_info(linkprogs[0], dev->cldevice, filepath);
		ocl_show_clprogram_info(linkprogs[1], dev->cldevice, "ompi_ocl_rtlib");
#endif

		/* Link user kernel with ompi lib to get final program.
		 * Important notice: we found out the hard way that for some platforms
		 * this works only if both programs were created using the same context.
		 */
		if ((program = clLinkProgram(dev->context, 1, &dev->cldevice, NULL,
		                             2, linkprogs, NULL, NULL, &status)) == NULL)
		{
			fprintf(stderr, "[%s] %s failed on clLinkProgram (%s):\n>>> %s\n---\n",
			       modulename, __func__, filepath, ocl_errstr(status));
			ocl_show_clprogram_info(linkprogs[0], dev->cldevice, filepath);
			ocl_show_clprogram_info(linkprogs[1], dev->cldevice, "ompi_ocl_rtlib");
			return 1;
		}

#if ORT_DEBUG_LEVEL > 0
		gettimeofday(&t6, NULL);
		DBGPRN((stderr,"\t>> kernel+devpart linked (%.3lfsec)\n",SECPASSED(t6,t5)));
#endif
#if ORT_DEBUG_LEVEL > 2
		ocl_show_clprogram_info(program, dev->cldevice, filepath);
#endif
		clReleaseProgram(linkprogs[0]);
		oclprog = oclgd_add_cached_program(dev, host_func, filepath, 
		                                   filepath_sources, program);
	}
	
	program = oclprog->program;
	filepath = oclprog->filepath;
	
	/* Create the opencl kernel */
	if ((kernel = clCreateKernel(program, "_kernel_wrapper_default_", &status)) 
	       == NULL)
	{
		fprintf(stderr, "[%s] %s failed on clCreateKernel (%s):\n\t%s\n",
		        modulename, __func__, filepath, ocl_errstr(status));
		return 1;
	}

#if ORT_DEBUG_LEVEL > 0
	gettimeofday(&t7, NULL);
	DBGPRN((stderr, "\t>> final kernel created (%.3lf sec)\n", SECPASSED(t7,t6)));
#endif
#if ORT_DEBUG_LEVEL > 1
	ocl_show_clkernel_info(kernel, dev->cldevice, kernel_filename);
#endif

	/* Make sure we have legal values */
	if (ocldg_setup_threading(dev, kernel, &num_threads, &thread_limit))
		return 1;
	/* Pass all kinds of extra stuff to the runtime through constant memory */
	xtrainfo = _pass_xtra_info(dev, dev->devid + _global_omp_id_of_first_device, 
	                           thread_limit);
	
	/* Handle kernel arguments (protect non-safe CL arg calls) */
	lock(&oclprog->lock);
	oclgd_pass_args_to_kernel(dev, kernel, num_args, args, xtrainfo);
	unlock(&oclprog->lock);

	/* ... and finally execute the kernel. */
	if (oclgd_kernel_execute(dev, kernel, min_teams, max_teams, num_threads, 
	                         thread_limit))
		return 1;
#if ORT_DEBUG_LEVEL > 0
	gettimeofday(&t8, NULL);
	DBGPRN((stderr, "\t>> kernel executed "
	                "(real time elapsed: %.3lf sec)\n", SECPASSED(t8,t1)));
#endif
	return 0;
}


/* Tells whether a host area qualifies for ocl_use_host_ptr: the address must
 * be a multiple of 4096 and the area size a multiple of 64.
 */
bool isAlignedForCL(void *ptr, unsigned int areasize)
{
	unsigned long int address = (unsigned long int) ptr;

	if ((address & 4095UL) != 0)     /* not on a 4096-byte boundary */
		return false;
	return (areasize & 63U) == 0;    /* size must be a multiple of 64 */
}


#ifndef CL_MEM_SVM_FINE_GRAIN_BUFFER
	#define CL_MEM_SVM_FINE_GRAIN_BUFFER (1 << 10)
#endif


/**
 * Allocates memory "on the device"
 *
 * @param device_info the device (the pointer returned by hm_dev_init())
 * @param size        the number of bytes to allocate
 * @param map_memory  when set, the memory is obtained with a plain malloc()
 *                    on the host instead of being allocated on the device.
 * @param hostaddr    the host address of the data; used only to try a
 *                    CL_MEM_USE_HOST_PTR buffer when ocl_use_host_ptr is set
 *                    and the area is suitably aligned. May be NULL, in which
 *                    case a regular buffer is created.
 * @param map_type    the mapping type that triggered this allocation
 *                    (to/from/tofrom/alloc); not used here
 * @return            pointer to the allocated space (internal mediary
 *                    address), or NULL if size is 0 or the allocation failed
 */
void *hm_dev_alloc(void *device_info, size_t size, int map_memory, void *hostaddr,
                   int map_type)
{
	ocl_gpu_t *dev = (ocl_gpu_t *) device_info;
	
	if (size == 0) return NULL;
	if (map_memory)
		return (malloc(size));  /* We will ignore it anyway... (FIXME: memleak )*/

#ifdef OCL_USE_CL2_SVM_ARGS
	if (dev->svm_coarse)
	{
		void *new_mem = clSVMAlloc(dev->context,
		                           dev->svm_fine_buffer ?
		                             CL_MEM_READ_WRITE|CL_MEM_SVM_FINE_GRAIN_BUFFER:
		                             CL_MEM_READ_WRITE, size, 0);
		if (new_mem == NULL)
			fprintf(stderr, "[%s]: warning: %s failed on clSVMAlloc().\n",
			        modulename, __func__);
		return (new_mem);
	}
	else   /* OpenCL 1.2 */
#endif
	{
		cl_mem buf;
		cl_int err = CL_SUCCESS;
		/* A NULL host address is trivially "aligned", but clCreateBuffer()
		 * rejects CL_MEM_USE_HOST_PTR without a host pointer; in that case
		 * a regular device buffer must be created instead.
		 */
		int    use_host = ocl_use_host_ptr && hostaddr != NULL &&
		                  isAlignedForCL(hostaddr, size);

		buf = use_host ?
		        clCreateBuffer(dev->context, /* Try to possibly use zero-copy */
		            CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, size, hostaddr, &err) :
		        clCreateBuffer(dev->context, CL_MEM_READ_WRITE, size, NULL, &err);
		if (buf == NULL || err != CL_SUCCESS)
			fprintf(stderr, "[%s]: %s failed on clCreateBuffer(): %s\n",
			        modulename, __func__, ocl_errstr(err));
		return (void *) buf;
	}
}


/**
 * Allocates & initializes memory "on the device" for a global variable
 *
 * @param device_info the device (the pointer returned by hm_dev_init())
 * @param global_id   the ID of the global variable
 * @param size        the number of bytes to allocate

 * @return            pointer to the allocated space (internal mediary address)
 */
void *hm_dev_init_alloc_global(void *device_info, void *initfrom, size_t size, 
                               int global_id, void *hostaddr)
{
	void hm_todev(void *device_info, void *hostaddr, size_t hostoffset,
	              void *imedaddr, size_t devoffset, size_t size);
	void *addr = hm_dev_alloc(device_info, size, 0, hostaddr, MAP_TYPE_IGNORE);

	if (ocl_use_host_ptr && isAlignedForCL(hostaddr, size))
	{
		cl_int err = clGetMemObjectInfo((cl_mem) addr, CL_MEM_HOST_PTR,
		                                sizeof(void *), &hostaddr, NULL);
		if (err == CL_SUCCESS && hostaddr != NULL)   /* no transfer then */
			return addr;
	}
	hm_todev(device_info, initfrom, 0L, addr, 0L, size);
	return addr;
}


/**
 * Frees data allocated with hm_dev_alloc
 *
 * @param device_info  the device (the pointer returned by hm_dev_init())
 * @param iaddr        pointer to the memory that will be released (internal
 *                     mediary address); NULL is ignored
 * @param unmap_memory when set, nothing is released at all (see the FIXME
 *                     below: the memory handed out by hm_dev_alloc() for
 *                     map_memory requests is currently never freed).
 */
void hm_dev_free(void *device_info, void *iaddr, int unmap_memory)
{
	if (unmap_memory || iaddr == NULL)  // dev_data
		return;                         /* FIXME: what about the alloc mem leak? */
#ifdef OCL_USE_CL2_SVM_ARGS
	ocl_gpu_t *dev = (ocl_gpu_t *) device_info;
	/* unmap_memory is always 0 here (see the early return above), so there
	 * is never anything to unmap before releasing the SVM area.
	 */
	if (dev->svm_coarse)
		clSVMFree(dev->context, iaddr);
	else
#endif
		clReleaseMemObject((cl_mem) iaddr);
}


/**
 * Frees a global variable allocated with hm_dev_init_alloc_global
 *
 * This simply forwards to hm_dev_free() with unmap_memory set to 0.
 *
 * @param device_info  the device (the pointer returned by hm_dev_init())
 * @param iaddr        pointer to the memory that will be released
 * @param global_id    the ID of the global variable that will be released
 *                     (not needed for releasing the memory)
 */
void hm_dev_free_global(void *device_info, void *iaddr, int global_id)
{
	(void) global_id;
	hm_dev_free(device_info, iaddr, 0);
}


/**
 * Transfers data from the host to a device
 *
 * With SVM, the data is copied with memcpy() (mapping/unmapping the area
 * unless fine-grain buffer SVM is available). Otherwise a blocking
 * clEnqueueWriteBuffer() is used, unless the buffer sits on top of host
 * memory, in which case it is only mapped and unmapped.
 *
 * @param device_info the device (the pointer returned by hm_dev_init())
 * @param hostaddr    the source memory
 * @param hostoffset  offset from hostaddr
 * @param imedaddr    the target memory (internal mediary address)
 * @param devoffset   offset from imedaddr
 * @param size        the number of bytes
 */
void hm_todev(void *device_info, void *hostaddr, size_t hostoffset,
              void *imedaddr, size_t devoffset, size_t size)
{
	ocl_gpu_t *dev = (ocl_gpu_t *) device_info;
	
#ifdef OCL_USE_CL2_SVM_ARGS
	if (dev->svm_coarse)
	{
		/* Byte-wise arithmetic must go through char *: arithmetic on
		 * void * is not part of ISO C.
		 */
		char *svmdst = (char *) imedaddr + devoffset;

		if (!dev->svm_fine_buffer)
		{
			/* 1. Map shared virtual memory for writing */
			if (clEnqueueSVMMap(dev->commands, CL_TRUE, CL_MAP_WRITE,
			                    svmdst, size, 0, NULL, NULL) != CL_SUCCESS)
			{
				fprintf(stderr, "[%s]: %s failed on clEnqueueSVMMap().\n",
				        modulename, __func__);
				exit(1);
			}
		}

		/* 2. Copy data to shared virtual memory */
		memcpy(svmdst, (char *) hostaddr + hostoffset, size);

		if (!dev->svm_fine_buffer)
			/* 3. Unmap shared virtual memory */
			clEnqueueSVMUnmap(dev->commands, svmdst, 0, NULL, NULL);
	}
	else
#endif
	{
		if (ocl_use_host_ptr && isAlignedForCL(hostaddr, 0))
		{
			/* Well, it is not clear from the specs but seems like buffers created 
			 * using CL_MEM_USE_HOST_PTR should be mapped for the host to read/write. 
			 * I believe that this only makes sense for buffers created with 
			 * CL_MEM_ALLOC_HOST_PTR. From experiments, Intel Xe and ATI Radeon R9
			 * do not need to have the buffer mapped but NVIDIA GT730 does (otherwise 
			 * the host memory is not updated with the device contents---which in my
			 * opinion also reveals that no zero-copy is implemented). Anyway, we will 
			 * map it in order to be on the safe side.
			 *
			 * Most probably the mapping should be done for all kernel arguments
			 * after kernel execution; this here won't work if there are multiple
			 * reads/writes to the same buffer (after the #target).
			 */
			void  *tmp;
			cl_int err = clGetMemObjectInfo((cl_mem) imedaddr, CL_MEM_HOST_PTR, 
			                                sizeof(void *), &tmp, NULL);
			if (err == CL_SUCCESS && tmp != NULL)      /* avoid transfers then! */
			{
				if (hostaddr != clEnqueueMapBuffer(dev->commands, (cl_mem) imedaddr, 
				                                   CL_TRUE, CL_MAP_READ|CL_MAP_WRITE, 
				                                   0, size, 0, NULL, NULL, NULL))
					fprintf(stderr, "[opencl] clCreateBuffer and hostaddr differ.\n");
				clEnqueueUnmapMemObject(dev->commands, (cl_mem) imedaddr, tmp, 0,
				                        NULL, NULL);
				return;
			}
		}
		
		if (clEnqueueWriteBuffer(dev->commands, (cl_mem) imedaddr, CL_TRUE,
		       devoffset, size, (char *) hostaddr + hostoffset, 
		       0, NULL, NULL) != CL_SUCCESS)
			fprintf(stderr, "[%s]: %s failed on clEnqueueWriteBuffer().\n",
			     modulename, __func__);
	}
}


/**
 * Transfers data from a device to the host
 *
 * With SVM, the data is copied with memcpy() (mapping/unmapping the area
 * unless fine-grain buffer SVM is available). Otherwise a blocking
 * clEnqueueReadBuffer() is used, unless the buffer sits on top of host
 * memory, in which case it is only mapped and unmapped.
 *
 * @param device_info the device (the pointer returned by hm_dev_init())
 * @param hostaddr    the target memory
 * @param hostoffset  offset from hostaddr
 * @param imedaddr    the source memory (internal mediary address)
 * @param devoffset   offset from imedaddr
 * @param size        the number of bytes
 */
void hm_fromdev(void *device_info, void *hostaddr, size_t hostoffset,
                void *imedaddr, size_t devoffset, size_t size)
{
	ocl_gpu_t *dev = (ocl_gpu_t *) device_info;

#ifdef OCL_USE_CL2_SVM_ARGS
	if (dev->svm_coarse)
	{
		/* Byte-wise arithmetic must go through char *: arithmetic on
		 * void * is not part of ISO C.
		 */
		char *svmsrc = (char *) imedaddr + devoffset;

		if (!dev->svm_fine_buffer)
		{
			/* 1. Map shared virtual memory for reading */
			if (clEnqueueSVMMap(dev->commands, CL_TRUE, CL_MAP_READ,
			                    svmsrc, size, 0, NULL, NULL) != CL_SUCCESS)
			{
				fprintf(stderr, "[%s]: %s failed on clEnqueueSVMMap().\n",
				        modulename, __func__);
				exit(1);
			}
		}

		/* 2. Copy data from shared virtual memory */
		memcpy((char *) hostaddr + hostoffset, svmsrc, size);

		if (!dev->svm_fine_buffer)
			/* 3. Unmap shared virtual memory */
			clEnqueueSVMUnmap(dev->commands, svmsrc, 0, NULL, NULL);
	}
	else
#endif
	{
		if (ocl_use_host_ptr && isAlignedForCL(hostaddr, 0))
		{
			/* Buffers that sit on top of host memory are only mapped and
			 * unmapped, which lets the host see the device contents without
			 * an explicit transfer.
			 */
			void  *tmp;
			cl_int err = clGetMemObjectInfo((cl_mem) imedaddr, CL_MEM_HOST_PTR, 
			                                sizeof(void *), &tmp, NULL);
			if (err == CL_SUCCESS && tmp != NULL)      /* avoid transfers then! */
			{
				if (hostaddr != clEnqueueMapBuffer(dev->commands, (cl_mem) imedaddr, 
				                                   CL_TRUE, CL_MAP_READ|CL_MAP_WRITE, 
				                                   0, size, 0, NULL, NULL, NULL))
					fprintf(stderr, "[opencl] clCreateBuffer and hostaddr differ.\n");
				clEnqueueUnmapMemObject(dev->commands, (cl_mem) imedaddr, tmp, 0, 
				                        NULL, NULL);
				return;
			}
		}
		
		if (clEnqueueReadBuffer(dev->commands, (cl_mem) imedaddr, CL_TRUE,
		       devoffset, size, (char *) hostaddr + hostoffset, 
		       0, NULL, NULL) != CL_SUCCESS)
			fprintf(stderr, "[%s]: %s failed on clEnqueueReadBuffer().\n",
			     modulename, __func__);
	}
}


/**
 * Converts an internal mediary address to a usable mediary address
 *
 * For this module the two kinds of addresses are identical, so the address
 * is handed back unchanged.
 *
 * @param device_info the device (the pointer returned by hm_dev_init());
 *                    not consulted
 * @param imedaddr    allocated memory from hm_dev_alloc
 *
 * @return usable mediary address to pass to a kernel
 */
void *hm_imed2umed_addr(void *device_info, void *imedaddr)
{
	void *umedaddr = imedaddr;   /* internal = usable mediary address */

	(void) device_info;
	return umedaddr;
}


/**
 * Converts a usable mediary address to an internal mediary address
 *
 * For this module the two kinds of addresses are identical, so the address
 * is handed back unchanged.
 *
 * @param device_info the device (the pointer returned by hm_dev_init());
 *                    not consulted
 * @param umedaddr    allocated memory from hm_dev_alloc
 *
 * @return internal mediary address to be used by ORT
 */
void *hm_umed2imed_addr(void *device_info, void *umedaddr)
{
	void *imedaddr = umedaddr;   /* internal = usable mediary address */

	(void) device_info;
	return imedaddr;
}


/* new-hostpart-func.sh:funcdef */
