/*
  OMPi OpenMP Compiler
  == Copyright since 2001 the OMPi Team
  == Dept. of Computer Science & Engineering, University of Ioannina

  This file is part of OMPi.

  OMPi is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or
  (at your option) any later version.

  OMPi is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with OMPi; if not, write to the Free Software
  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/

#ifndef __OCLGPUDEV_H__
#define __OCLGPUDEV_H__

#include "oclutils.h"

/* If defined, coarse-grained buffer SVM will be used for the kernel arguments.
 * It is safer to leave it undefined since no noticeable improvement could be seen;
 * on the contrary, when moving a 32MB array back and forth, we witnessed
 *  - 4x slowdown (!) @ NVIDIA GT730
 *  - 3x (2x) slowdown @ ATI Radeon R9 280 for coarse (fine) grained allocations
 *
 * Here is the story:
 *  - OpenCL SVM is basically for having data **allocated by the OpenCL 
 *    runtime** shared between the host and the GPU. However, this is not of
 *    much use to us since the actual data have already been allocated by the
 *    host program. Consequently, even if SVM is supported, we still need
 *    to suffer copying the data to the GPU.
 *  - What would be useful to us is to have already allocated data (by the 
 *    host) be used directly by the GPU. In OpenCL 2.0 this is possible 
 *    if fine-grained **system** SVM is supported (and even that may not be 
 *    enough). And then there is OCL_USE_HOST_PTR below.
 */
#ifndef SVMFUNCS_UNKNOWN
	//#define OCL_USE_CL2_SVM_ARGS
#endif

/* If defined, then buffers passed as kernel arguments will be created using 
 * host memory (CL_MEM_USE_HOST_PTR), in an attempt to achieve zero-copy 
 * transfers if the implementation supports it and if requirements are 
 * fulfilled. In particular, the memory block must be page aligned (i.e.
 * starting at a multiple of 4096) and its size must be a multiple of 64 
 * (actually of the cache line size). Clearly, for this to work, the compiler 
 * (or the app programmer) should take care of allocating large data objects 
 * in a page-aligned manner.
 * The difference in performance can be significant; here are some results
 * when offloading an empty kernel with an aligned 256MB map()ed array:
 * ATI Radeon R9 280: 
 *   With CL_MEM_USE_HOST_PTR: 0.100 (to), 0.019 (from), 0.019 (tofrom)
 *     No CL_MEM_USE_HOST_PTR: 0.137 (to), 0.094 (from), 0.160 (tofrom)
 * Intel Iris Xe:
 *   With CL_MEM_USE_HOST_PTR: 0.024 (to), 0.021 (from), 0.021 (tofrom)
 *     No CL_MEM_USE_HOST_PTR: 0.077 (to), 0.075 (from), 0.092 (tofrom)
 * However, this is not always the case. The same test on an NVIDIA GT730
 * showed ~2x slowdown:
 *   With CL_MEM_USE_HOST_PTR: 0.350 (to), 0.341 (from), 0.503 (tofrom)
 *     No CL_MEM_USE_HOST_PTR: 0.174 (to), 0.201 (from), 0.269 (tofrom)
 *
 * Correctness (especially offsets) has not been exhaustively tested yet. 
 */
// #define OCL_USE_HOST_PTR

/* The number of OpenCL programs to cache; this is the capacity of the
 * per-device program cache array (pcache[] in ocl_gpu_t).
 */
#define OCL_PCACHE_SIZE 30

/* An entry of the program cache: one created OpenCL program, stored together
 * with its host function, the kernel file paths and a lock.
 */
typedef struct
{
	/* the host function */
	void *(*hostfunc)(void *);
	/* the full path of the kernel file */
	char *filepath;
	/* the full path of the kernel file sources */
	char *filepath_sources;
	/* the created CL program */
	cl_program program;
	/* to protect argument passing */
	void *lock;
} ocl_program_t;
	
/* Per-device record: the OpenCL handles, the queried capabilities and the
 * cached programs kept for a single GPU.
 */
typedef struct
{
	/* OpenCL platform / device handles */
	cl_platform_id clplatform;
	cl_device_id cldevice;                /* compute device id */

	/* Supported OpenCL versions */
	cl_uint pver;                         /* platform */
	cl_uint dver;                         /* device/hw */
	cl_uint cvers[MAX_OPENCL_C_VERSIONS];
	int num_cvers;          /* presumably # of used cvers[] entries; TODO confirm */

	/* Device limits and sizes */
	cl_uint device_max_computeinits;
	size_t device_team_max_threads;
	size_t device_optimal_team_threads;
	cl_ulong device_global_mem_size;
	cl_ulong device_local_mem_size;

	/* Descriptive strings */
	char *device_name;
	char *device_vendor;
	char *device_driver_version;          /* vendor specific */

	/* Capability flags */
	int svm_coarse;
	int svm_fine_buffer;
	int svm_fine_system;
	int unified_memory_12;                /* 1.2 unified memory (deprecated) */
	cl_bool haswgcollectives;
	cl_bool hasgenericmemspace;
	cl_device_fp_config doublesupport;

	/* Execution state */
	cl_context context;                   /* compute context */
	cl_command_queue commands;            /* compute command queue */
	ort_icvs_t dev_icvs;                  /* ICV initial values */

	/* Program caching */
	cl_program devrt;                     /* cache the device runtime */
	ocl_program_t pcache[OCL_PCACHE_SIZE]; /* also keep a program cache */
	int pcentries;                        /* If filled, do FIFO replacement */

	/* Miscellaneous */
	int sharedspace;                /* Shared address space with the host (0/1) */
	int devid;                            /* device id (0, 1, ...) */
	devicestatus_e status;
} ocl_gpu_t;


/* The values that ocl_xceed_limits_behavior can take */
enum xlb_e
{
	XLB_WARN,
	XLB_NOWARN,
	XLB_EXIT,
	XLB_EXITMUTE
};

/* Globals of this layer; only declared here, defined elsewhere */
extern int ocl_num_gpus;
extern bool ocl_use_host_ptr;   /* try to achieve zero-copy buffers */
extern enum xlb_e ocl_xceed_limits_behavior;

/* The lock routines of the host runtime, reached through function pointers.
 * (The parameter is named lockptr so that it does not shadow the global
 * `lock' pointer declared below.)
 */
extern void (*init_lock)(void **lockptr, int type);
extern void (*lock)(void **lockptr);
extern void (*unlock)(void **lockptr);

/* init/finalize layer.
 * oclgd_finish is declared with (void) so that it is a true prototype: with
 * an empty parameter list (pre-C23) calls passing arguments go unchecked.
 */
void       oclgd_init(bool full);
void       oclgd_finish(void);

/* Device access and setup */
ocl_gpu_t *oclgd_get(int dev_id);         /* get the i-th device */
int        oclgd_prepare(ocl_gpu_t *d, cl_device_id cldevice,
                         cl_platform_id clplatform);            /* init */
/* NOTE(review): the prefix here is spelled "olcgd_", unlike the "oclgd_"
 * siblings; left unchanged because renaming would alter the public interface.
 */
void       olcgd_close(ocl_gpu_t *dev);   /* relinquish device */
int        ocl_opencl_c_majver_supported(ocl_gpu_t *dev, int majver);

/* Program cache and device runtime */
ocl_program_t *oclgd_get_cached_program(ocl_gpu_t *d, void *(*hostfunc)(void*));
ocl_program_t *oclgd_add_cached_program(ocl_gpu_t *d, void *(*hostfunc)(void *),
                                        char *filepath, char *filepath_sources, 
                                        cl_program program);
cl_program oclgd_load_devpart(ocl_gpu_t *device_info, char *libdirpath, 
                              char *MALflavor);

/* Kernel argument passing and launching */
void       oclgd_pass_args_to_kernel(ocl_gpu_t *dev, cl_kernel kernel, 
                                   int *num_args, void **args, cl_mem xtrainfo);
/* NOTE(review): the prefix here is spelled "ocldg_", unlike the "oclgd_"
 * siblings; left unchanged because renaming would alter the public interface.
 */
int        ocldg_setup_threading(ocl_gpu_t *dev, cl_kernel kernel,
                                 int *nthr, int *thlim);
int        oclgd_kernel_execute(ocl_gpu_t *dev, cl_kernel kernel,
                                int min_teams, int max_teams, 
                                int num_threads, int thread_limit);

#endif /* __OCLGPUDEV_H__ */
