/*
  OMPi OpenMP Compiler
  == Copyright since 2001 the OMPi Team
  == Dept. of Computer Science & Engineering, University of Ioannina

  This file is part of OMPi.

  OMPi is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or
  (at your option) any later version.

  OMPi is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with OMPi; if not, write to the Free Software
  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/

/* oclutils -- handy OpenCL utilities */

#include "oclutils.h"
#include <string.h>

/* Capacities of the fixed-size platform and device arrays that
 * ocl_iterate_gpus() keeps on its stack. */
#define MAX_OPENCL_PLATFORMS 10     /* Arbitrary... */
#define MAX_OPENCL_GPUS      100


/**
 * Converts a textual "<major>.<minor>..." version into a packed version.
 * @param str the string, which must start at the major number
 * @return MAKEVER(major, minor, 0); the minor part is taken as 0 when the
 *         string contains no '.'
 */
cl_uint ocl_version_from_string(char *str)
{
	int   major = atoi(str);          /* parsing stops at the first non-digit */
	int   minor = 0;
	char *dot = strchr(str, '.');

	if (dot != NULL)
		minor = atoi(dot + 1);
	return MAKEVER(major, minor, 0);
}


/**
 * @brief Finds all OpenCL C versions supported by the device compiler and
 *        determines the highest one.
 *
 * The CL_DEVICE_OPENCL_C_ALL_VERSIONS query is tried first; if it fails or
 * reports nothing, the single-version CL_DEVICE_OPENCL_C_VERSION query is
 * used instead.
 *
 * @param dev = the OpenCL device ID
 * @param vers = the array to store the supported versions (can be NULL);
 *               at most MAX_OPENCL_C_VERSIONS entries are written
 * @param nver = the number of supported versions (can be NULL); it is set
 *               to 0 when no version could be determined
 * @return the highest supported version; if 0, no compilation is supported.
 */
cl_uint ocl_openclc_versions(cl_device_id dev, cl_uint *vers, int *nver)
{
/* Just in case we are compiling against older OpenCL headers */
#ifndef CL_DEVICE_OPENCL_C_ALL_VERSIONS
	/* From the Khronos OpenCL 3.0 header */
	#define CL_DEVICE_OPENCL_C_ALL_VERSIONS 0x1066
	#define CL_NAME_VERSION_MAX_NAME_SIZE 64
	typedef struct _cl_name_version {
			cl_uint version;   /* cl_version actually, which is cl_uint */
			char    name[CL_NAME_VERSION_MAX_NAME_SIZE];
		} cl_name_version;
#endif
	/* The legacy version string is parsed starting at this offset, i.e.
	 * right after its 9-character "OpenCL C " prefix.
	 */
	enum { CLC_PREFIX_LEN = 9 };
	char _querybuf[OCL_QUERY_SIZE];
	size_t nbytes = 0;

	if (clGetDeviceInfo(dev, CL_DEVICE_OPENCL_C_ALL_VERSIONS, OCL_QUERY_SIZE,
	                    _querybuf, &nbytes) != CL_SUCCESS || nbytes == 0)
	{
		/* OpenCL < 3.0; query deprecated in v3.0 */
		cl_uint ver;

		/* A string shorter than the prefix (this includes the empty string)
		 * cannot be parsed; reading at the offset would run past its end.
		 */
		if (clGetDeviceInfo(dev, CL_DEVICE_OPENCL_C_VERSION, OCL_QUERY_SIZE,
		                    _querybuf, NULL) != CL_SUCCESS
		    || strlen(_querybuf) < CLC_PREFIX_LEN)
		{
			if (nver) *nver = 0;
			return 0;
		}

		ver = ocl_version_from_string(_querybuf + CLC_PREFIX_LEN);
		if (vers) *vers = ver;
		if (nver) *nver = 1;
		return ver;
	}
	else
	{
		/* OpenCL 3.0+ */
		cl_uint maxver = 0;
		int nv, i;

		nv = (int) (nbytes / sizeof(cl_name_version));
		if (nv > MAX_OPENCL_C_VERSIONS)       /* put a hard limit */
			nv = MAX_OPENCL_C_VERSIONS;
		if (nver) *nver = nv;
		for (i = 0; i < nv; i++)
		{
			cl_name_version entry;

			/* Copy each record out of the char buffer instead of casting the
			 * buffer pointer: the buffer has no alignment guarantee for
			 * cl_name_version.
			 */
			memcpy(&entry, _querybuf + (size_t) i * sizeof(entry), sizeof(entry));
			if (vers) vers[i] = entry.version;
			if (entry.version > maxver)
				maxver = entry.version;
		}
		return maxver;
	}
}


/**
 * Finds each accessible OpenCL (>= 1.2) GPU device and calls the user func.
 * We skip devices that do not have an OpenCL C compiler.
 * At most MAX_OPENCL_PLATFORMS platforms and MAX_OPENCL_GPUS GPUs per
 * platform are examined.
 * @param platidx  the index of the platform, or < 0 if all platforms needed
 * @param callback the user function to call for each discovered GPU; the
 *                 parameters are our sequential id, the CL device id and the
 *                 CL platform id; it can be NULL. If a callback returns
 *                 non-zero, the iterator stops.
 * @returns the number of devices found (including the one the callback
 *          stopped at) or < 0 on error; a platidx that does not correspond
 *          to an examined platform is an error
 */
int ocl_iterate_gpus(int platidx,
                     int (*callback)(int, cl_device_id, cl_platform_id))
{
	cl_uint        did, pid, endpid, numplats, numdevs;
	cl_bool        dev_bool;
	cl_platform_id plats[MAX_OPENCL_PLATFORMS];
	cl_device_id   devs[MAX_OPENCL_GPUS];
	int            oclgpus = 0;

	if (clGetPlatformIDs(0, 0, &numplats) != CL_SUCCESS) return -1;
	if (numplats > MAX_OPENCL_PLATFORMS)
	{
		numplats = MAX_OPENCL_PLATFORMS;
		fprintf(stderr, "[opencl module]: excessive number of platforms\n");
	}
	if (clGetPlatformIDs(numplats, plats, NULL) != CL_SUCCESS) return -1;

	if (platidx < 0)    /* No specific platform requested */
	{
		pid = 0;                /* starting index */
		endpid = numplats;      /* ending index */
	}
	else
	{
		/* Never index plats[] beyond the entries that were actually filled */
		if ((cl_uint) platidx >= numplats)
			return -1;
		pid = (cl_uint) platidx;    /* starting index */
		endpid = pid + 1;           /* ending index */
	}

	for (; pid < endpid; pid++)
	{
		numdevs = 0;
		if (clGetDeviceIDs(plats[pid], CL_DEVICE_TYPE_GPU,
		                   MAX_OPENCL_GPUS, devs, &numdevs) != CL_SUCCESS)
			continue;
		/* numdevs is the number of matching devices the platform has, which
		 * may exceed the number of entries stored in devs[]
		 */
		if (numdevs > MAX_OPENCL_GPUS)
			numdevs = MAX_OPENCL_GPUS;

		for (did = 0; did < numdevs; did++)
		{
			/* Skip GPUs that do not have an OpenCL C compiler */
			dev_bool = CL_FALSE;
			if (clGetDeviceInfo(devs[did], CL_DEVICE_COMPILER_AVAILABLE,
			                    sizeof(cl_bool), &dev_bool, NULL) != CL_SUCCESS
			       || dev_bool == CL_FALSE)
				continue;
			/* Count only GPUs that support OpenCL C >= 1.2 */
			if (ocl_openclc_versions(devs[did], NULL, NULL) < MAKEVER(1,2,0))
				continue;
			if (callback && (*callback)(oclgpus, devs[did], plats[pid]))
				return (oclgpus + 1);    /* We should stop */
			oclgpus++;
		}
	}
	return oclgpus;
}


/* Callback machinery for ocl_discover_ith_gpu(): the request (wanted id and
 * output locations) and the outcome are exchanged through these statics.
 */
static int _d_gpuid;                 /* sequential id being looked for */
static int _d_success;               /* set to 1 once the id was matched */
static cl_device_id   *_d_cldev;     /* where to store the device (or NULL) */
static cl_platform_id *_d_clplat;    /* where to store the platform (or NULL) */

/* Iterator callback: stores the device/platform of the wanted GPU and asks
 * the iterator to stop (non-zero return); otherwise lets it continue.
 */
static
int _discover_ith_gpu_cb(int gpuid, cl_device_id cldev, cl_platform_id clplat)
{
	if (gpuid != _d_gpuid)
		return 0;                    /* not the one; keep iterating */

	if (_d_cldev != NULL)
		*_d_cldev = cldev;
	if (_d_clplat != NULL)
		*_d_clplat = clplat;
	_d_success = 1;
	return 1;                        /* stop here */
}


/**
 * Locates the GPU with sequential id gpuid among the accessible
 * OpenCL (>= 1.2) GPU devices of all platforms and hands it back through
 * cldev/clplat.
 * @param gpuid  the sequential id of the GPU
 * @param cldev  receives the discovered CL device id
 * @param clplat receives the discovered CL platform id
 * @return 0 if OK, non-zero on error
 */
int ocl_discover_ith_gpu(int gpuid, cl_device_id *cldev,
                         cl_platform_id *clplat)
{
	/* Set up the request for the callback */
	_d_success = 0;
	_d_clplat = clplat;
	_d_cldev = cldev;
	_d_gpuid = gpuid;

	/* Walk all platforms; the callback records whether the id was found */
	(void) ocl_iterate_gpus(-1, _discover_ith_gpu_cb);

	return _d_success ? 0 : 1;
}


/**
 * Reads a whole file into a newly allocated, NUL-terminated string.
 * @param path the path of the file to read
 * @return a freeable string containing the source code (the caller must
 *         free() it), or NULL if the file cannot be opened, its size cannot
 *         be determined, memory cannot be allocated or fewer bytes than the
 *         measured size could be read
 */
char *ocl_read_src(char *path)
{
	FILE  *fp;
	long   fsz;        /* ftell() result; signed so that failure is visible */
	size_t len;
	char  *buf = NULL;

	if ((fp = fopen(path, "r")) == NULL)
		return NULL;

	/* Seek to the end to get the size */
	if (fseek(fp, 0L, SEEK_END) != 0 || (fsz = ftell(fp)) < 0)
		goto done;
	len = (size_t) fsz;
	rewind(fp);

	if ((buf = malloc(len + 1)) == NULL)
		goto done;

	if (fread(buf, 1, len, fp) != len)
	{
		free(buf);
		buf = NULL;
	}
	else
		buf[len] = '\0';   /* Terminate */

done:
	fclose(fp);            /* single exit so the stream is never leaked */
	return buf;
}


/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
 *                                                             *
 *    INFO/DEBUG                                               *
 *                                                             *
 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */


/**
 * Maps an OpenCL error code to the name of its symbolic constant.
 * @param err the OpenCL error code
 * @return a string literal (not to be modified or freed) with the name of
 *         the constant, or "Unknown OpenCL error code" for codes that are
 *         not listed here
 */
char *ocl_errstr(cl_int err)
{
#define CaseReturnString(x) case x: return #x;

	switch (err)
	{
			CaseReturnString(CL_SUCCESS)
			CaseReturnString(CL_DEVICE_NOT_FOUND)
			CaseReturnString(CL_DEVICE_NOT_AVAILABLE)
			CaseReturnString(CL_COMPILER_NOT_AVAILABLE)
			CaseReturnString(CL_MEM_OBJECT_ALLOCATION_FAILURE)
			CaseReturnString(CL_OUT_OF_RESOURCES)
			CaseReturnString(CL_OUT_OF_HOST_MEMORY)
			CaseReturnString(CL_PROFILING_INFO_NOT_AVAILABLE)
			CaseReturnString(CL_MEM_COPY_OVERLAP)
			CaseReturnString(CL_IMAGE_FORMAT_MISMATCH)
			CaseReturnString(CL_IMAGE_FORMAT_NOT_SUPPORTED)
			CaseReturnString(CL_BUILD_PROGRAM_FAILURE)
			CaseReturnString(CL_MAP_FAILURE)
			CaseReturnString(CL_MISALIGNED_SUB_BUFFER_OFFSET)
			CaseReturnString(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST)
			CaseReturnString(CL_COMPILE_PROGRAM_FAILURE)
			CaseReturnString(CL_LINKER_NOT_AVAILABLE)
			CaseReturnString(CL_LINK_PROGRAM_FAILURE)
			CaseReturnString(CL_DEVICE_PARTITION_FAILED)
			CaseReturnString(CL_KERNEL_ARG_INFO_NOT_AVAILABLE)
			CaseReturnString(CL_INVALID_VALUE)
			CaseReturnString(CL_INVALID_DEVICE_TYPE)
			CaseReturnString(CL_INVALID_PLATFORM)
			CaseReturnString(CL_INVALID_DEVICE)
			CaseReturnString(CL_INVALID_CONTEXT)
			CaseReturnString(CL_INVALID_QUEUE_PROPERTIES)
			CaseReturnString(CL_INVALID_COMMAND_QUEUE)
			CaseReturnString(CL_INVALID_HOST_PTR)
			CaseReturnString(CL_INVALID_MEM_OBJECT)
			CaseReturnString(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR)
			CaseReturnString(CL_INVALID_IMAGE_SIZE)
			CaseReturnString(CL_INVALID_SAMPLER)
			CaseReturnString(CL_INVALID_BINARY)
			CaseReturnString(CL_INVALID_BUILD_OPTIONS)
			CaseReturnString(CL_INVALID_PROGRAM)
			CaseReturnString(CL_INVALID_PROGRAM_EXECUTABLE)
			CaseReturnString(CL_INVALID_KERNEL_NAME)
			CaseReturnString(CL_INVALID_KERNEL_DEFINITION)
			CaseReturnString(CL_INVALID_KERNEL)
			CaseReturnString(CL_INVALID_ARG_INDEX)
			CaseReturnString(CL_INVALID_ARG_VALUE)
			CaseReturnString(CL_INVALID_ARG_SIZE)
			CaseReturnString(CL_INVALID_KERNEL_ARGS)
			CaseReturnString(CL_INVALID_WORK_DIMENSION)
			CaseReturnString(CL_INVALID_WORK_GROUP_SIZE)
			CaseReturnString(CL_INVALID_WORK_ITEM_SIZE)
			CaseReturnString(CL_INVALID_GLOBAL_OFFSET)
			CaseReturnString(CL_INVALID_EVENT_WAIT_LIST)
			CaseReturnString(CL_INVALID_EVENT)
			CaseReturnString(CL_INVALID_OPERATION)
			CaseReturnString(CL_INVALID_GL_OBJECT)
			CaseReturnString(CL_INVALID_BUFFER_SIZE)
			CaseReturnString(CL_INVALID_MIP_LEVEL)
			CaseReturnString(CL_INVALID_GLOBAL_WORK_SIZE)
			CaseReturnString(CL_INVALID_PROPERTY)
			CaseReturnString(CL_INVALID_IMAGE_DESCRIPTOR)
			CaseReturnString(CL_INVALID_COMPILER_OPTIONS)
			CaseReturnString(CL_INVALID_LINKER_OPTIONS)
			CaseReturnString(CL_INVALID_DEVICE_PARTITION_COUNT)
		/* Codes that exist only in newer or extension headers; each one is
		 * listed only if the headers in use define it.
		 */
#ifdef CL_INVALID_PIPE_SIZE
			CaseReturnString(CL_INVALID_PIPE_SIZE)
#endif
#ifdef CL_INVALID_DEVICE_QUEUE
			CaseReturnString(CL_INVALID_DEVICE_QUEUE)
#endif
#ifdef CL_INVALID_SPEC_ID
			CaseReturnString(CL_INVALID_SPEC_ID)
#endif
#ifdef CL_MAX_SIZE_RESTRICTION_EXCEEDED
			CaseReturnString(CL_MAX_SIZE_RESTRICTION_EXCEEDED)
#endif
#ifdef CL_PLATFORM_NOT_FOUND_KHR
			CaseReturnString(CL_PLATFORM_NOT_FOUND_KHR)
#endif
		default: return ("Unknown OpenCL error code");
	}
}


/* Printable description of a cl_program_binary_type value */
#define _bintype(t) (((t)==CL_PROGRAM_BINARY_TYPE_NONE) ? "none" :\
                     ((t)==CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT) ? "compiled" :\
                     ((t)==CL_PROGRAM_BINARY_TYPE_LIBRARY) ? "library" :\
                     ((t)==CL_PROGRAM_BINARY_TYPE_EXECUTABLE) ? "executable" :\
                     "unknown !!??")
/* Printable description of a cl_build_status value */
#define _bldstat(t) (((t)==CL_BUILD_SUCCESS) ? "successful" :\
                     ((t)==CL_BUILD_NONE) ? "not built" :\
                     ((t)==CL_BUILD_ERROR) ? "error" :\
                     ((t)==CL_BUILD_IN_PROGRESS) ? "in progress" :\
                     "unknown !!??")

/**
 * Prints build information of a program for a given device on stderr:
 * binary type, build status, build options, build log and, if the headers
 * and the device's OpenCL C version (>= 2.0) allow it, the total size of
 * the global variables. When a query fails, the name of the returned
 * OpenCL error is printed in place of the value.
 * @param program the CL program
 * @param cldev   the CL device the program was built for
 * @param msg     a label printed in the heading line
 */
void ocl_show_clprogram_info(cl_program program, cl_device_id cldev, char *msg)
{
	char buffer[OCL_QUERY_SIZE];
	cl_program_binary_type  pbt;
	cl_build_status bs;
	size_t size = 0;
	cl_int err;

	fprintf(stderr, "clProgram info for %s:\n", msg);

	err = clGetProgramBuildInfo(program, cldev, CL_PROGRAM_BINARY_TYPE,
	                            sizeof(pbt), &pbt, NULL);
	fprintf(stderr, "  CL_PROGRAM_BINARY_TYPE   = %s\n",
	                (err == CL_SUCCESS) ? _bintype(pbt) : ocl_errstr(err));

	err = clGetProgramBuildInfo(program, cldev, CL_PROGRAM_BUILD_STATUS,
	                            sizeof(bs), &bs, NULL);
	fprintf(stderr, "  CL_PROGRAM_BUILD_STATUS  = %s\n",
	                (err == CL_SUCCESS) ? _bldstat(bs) : ocl_errstr(err));

	err = clGetProgramBuildInfo(program, cldev, CL_PROGRAM_BUILD_OPTIONS,
	                            sizeof(buffer), buffer, NULL);
	fprintf(stderr, "  CL_PROGRAM_BUILD_OPTIONS = %s\n",
	                (err == CL_SUCCESS) ? buffer : ocl_errstr(err));

	/* Get the size of the log first; small logs go to the stack buffer */
	err = clGetProgramBuildInfo(program, cldev, CL_PROGRAM_BUILD_LOG, 0, NULL,
	                            &size);
	if (err != CL_SUCCESS)
		fprintf(stderr, "  CL_PROGRAM_BUILD_LOG     = %s\n", ocl_errstr(err));
	else
	{
		int    onheap = (size >= sizeof(buffer));
		size_t capacity = onheap ? size + 1 : sizeof(buffer);
		char  *blog = onheap ? malloc(capacity) : buffer;

		if (blog == NULL)
			fprintf(stderr, "  CL_PROGRAM_BUILD_LOG     = <out of memory>\n");
		else
		{
			blog[0] = '\0';
			err = clGetProgramBuildInfo(program, cldev, CL_PROGRAM_BUILD_LOG,
			                            capacity, blog, NULL);
			if (err != CL_SUCCESS)
				fprintf(stderr, "  CL_PROGRAM_BUILD_LOG     = %s\n",
				                ocl_errstr(err));
			else
			{
				blog[size] = '\0';   /* size < capacity; guarantees termination */
				fprintf(stderr,
				        "  CL_PROGRAM_BUILD_LOG     =\n--------\n%s---------\n",
				        blog);
			}
			if (onheap)
				free(blog);
		}
	}

#ifdef CL_PROGRAM_BUILD_GLOBAL_VARIABLE_TOTAL_SIZE
	if (ocl_openclc_versions(cldev,NULL,NULL) >= MAKEVER(2,0,0))
	{
		size_t mem = 0;

		err = clGetProgramBuildInfo(program, cldev,
		                            CL_PROGRAM_BUILD_GLOBAL_VARIABLE_TOTAL_SIZE,
		                            sizeof(size_t), &mem, NULL);
		if (err == CL_SUCCESS)
			fprintf(stderr,"  CL_PROGRAM_BUILD_GLOBAL_VARIABLE_TOTAL_SIZE = "
			               "%zu bytes\n", mem);
		else
			fprintf(stderr,"  CL_PROGRAM_BUILD_GLOBAL_VARIABLE_TOTAL_SIZE = "
			               "%s\n", ocl_errstr(err));
	}
#endif
}


/**
 * Prints work-group information of a kernel for a given device on stderr:
 * local and private memory sizes, preferred work-group size multiple and
 * work-group size. When a query fails, the name of the returned OpenCL
 * error is printed (in angle brackets) in place of the value.
 * @param k     the CL kernel
 * @param cldev the CL device
 * @param msg   a label printed in the heading line
 */
void ocl_show_clkernel_info(cl_kernel k, cl_device_id cldev, char *msg)
{
	cl_ulong mem = 0;
	size_t   n = 0;
	cl_int   err;

	fprintf(stderr, "clKernel info for %s:\n", msg);

	/* cl_ulong values are cast to unsigned long long so that the format
	 * specifier matches the argument on every platform.
	 */
	err = clGetKernelWorkGroupInfo(k, cldev, CL_KERNEL_LOCAL_MEM_SIZE,
	                               sizeof(cl_ulong), &mem, NULL);
	if (err == CL_SUCCESS)
		fprintf(stderr, "  CL_KERNEL_LOCAL_MEM_SIZE   = %llu\n",
		                (unsigned long long) mem);
	else
		fprintf(stderr, "  CL_KERNEL_LOCAL_MEM_SIZE   = <%s>\n",
		                ocl_errstr(err));

	err = clGetKernelWorkGroupInfo(k, cldev, CL_KERNEL_PRIVATE_MEM_SIZE,
	                               sizeof(cl_ulong), &mem, NULL);
	if (err == CL_SUCCESS)
		fprintf(stderr, "  CL_KERNEL_PRIVATE_MEM_SIZE = %llu\n",
		                (unsigned long long) mem);
	else
		fprintf(stderr, "  CL_KERNEL_PRIVATE_MEM_SIZE = <%s>\n",
		                ocl_errstr(err));

	err = clGetKernelWorkGroupInfo(k, cldev,
	                               CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
	                               sizeof(size_t), &n, NULL);
	if (err == CL_SUCCESS)
		fprintf(stderr,
		        "  CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE  = %zu\n", n);
	else
		fprintf(stderr,
		        "  CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE  = <%s>\n",
		        ocl_errstr(err));

	err = clGetKernelWorkGroupInfo(k, cldev, CL_KERNEL_WORK_GROUP_SIZE,
	                               sizeof(size_t), &n, NULL);
	if (err == CL_SUCCESS)
		fprintf(stderr, "  CL_KERNEL_WORK_GROUP_SIZE  = %zu\n", n);
	else
		fprintf(stderr, "  CL_KERNEL_WORK_GROUP_SIZE  = <%s>\n",
		                ocl_errstr(err));
}
