/*
  OMPi OpenMP Compiler
  == Copyright since 2001 the OMPi Team
  == Dept. of Computer Science & Engineering, University of Ioannina

  This file is part of OMPi.

  OMPi is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or
  (at your option) any later version.

  OMPi is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with OMPi; if not, write to the Free Software
  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/

#ifndef __CUDAUTILS_H__
#define __CUDAUTILS_H__

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <unistd.h>
#include <time.h>

#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/fb.h>
#include <sys/mman.h>

#include <stdint.h>
#include <limits.h>
#include <cuda.h>
#include <errno.h>
#include <stdarg.h>
#include <assert.h>
#include <pthread.h>
#include "../../../../common/set.h"
#include "modmain.h"
#include "stddefs.h"

#include <sys/syscall.h>

/* Uncomment/comment to turn caching on/off */
/* When defined, cuda_kernel_cache_t is declared and cuda_dev_t carries a
 * per-device kernel cache (kernel_cache, cache_key); otherwise cuda_dev_t
 * holds a single cuda_kernel_t pointer (kernel). */
#define USE_CACHING

/* Capacity of the kernels[] array inside cuda_kernel_cache_t */
#define CUDA_CACHE_SIZE 256

/* NOTE(review): presumably the preferred number of threads per block for
 * kernel launches; not referenced in this header -- confirm in the launcher. */
#define CUDA_OPTIMAL_NUMTHREADS 128
/* NOTE(review): presumably the dynamic shared memory size (in bytes) used at
 * kernel launch; not referenced in this header -- confirm in the launcher. */
#define CUDA_SHMEM_SIZE         0

/*
 * Error-checking wrappers: both forward a CUresult to __check_cuda_errors()
 * together with the calling function's name, the source file and the line.
 *
 *   cuda_do(err)            - passes destroy_context = 0 and no device (NULL).
 *   cuda_do_crit(err, cdev) - passes destroy_context = 1 and the device `cdev`.
 *
 * __func__ (standard since C99/C++11) is used instead of the legacy
 * GCC-specific __FUNCTION__ alias; macro arguments are parenthesised so that
 * any expression can be passed safely.
 */
#define cuda_do(err) \
	__check_cuda_errors(__func__, (err), __FILE__, __LINE__, 0, NULL)
#define cuda_do_crit(err, cdev) \
	__check_cuda_errors(__func__, (err), __FILE__, __LINE__, 1, (cdev))

/* Instantiates the `hostmed_addrmap` set type through the SET_TYPE_DEFINE
 * macro; cuda_dev_t uses it for its addr_map field, which maps host addresses
 * to CUDA mediary ones.
 * NOTE(review): the arguments look like key type (void*), value type
 * (CUdeviceptr) and a table size (1031) -- confirm against SET_TYPE_DEFINE. */
SET_TYPE_DEFINE(hostmed_addrmap, void*, CUdeviceptr, 1031);

/*
 * Symbolic names for the constants 0, 1 and 2.
 * NOTE(review): presumably indices into the `num_args` array handed to
 * cuda_kernel_set_args() -- confirm against the implementation.
 */
typedef enum num_args_
{
	ARGS_NUMDECL   = 0,
	ARGS_NUMFIP    = 1,
	ARGS_NUMMAPPED = 2
} num_args_e;

/*
 * Triple of unsigned extents (X, Y, Z); cuda_kernel_t stores one of these
 * for the grid dimensions and one for the block dimensions of a kernel.
 */
typedef struct
{
	unsigned int X, Y, Z;
} cuda_dim_t;

/*
 * Descriptor of one CUDA kernel: its identity (name, file), the driver-API
 * handles needed to link and run it, its launch geometry and its arguments.
 */
typedef struct
{
	/* Kernel name */
	char *name;
	/* Kernel filename */
	char *filename;
	/* JIT linking state */
	CUlinkState linkstate;
	/* CUDA function (kernel) to be executed */
	CUfunction function;
	/* Kernel execution grid dimensions */
	cuda_dim_t grid_dims;
	/* Kernel execution block dimensions */
	cuda_dim_t block_dims;
	/* Kernel arguments */
	void **args;
	/* Number of kernel arguments */
	int nargs;
	/* Host function */
	void *(*host_func)(void *);
	/* NOTE(review): undocumented in the original; the name suggests a part of
	 * the device library associated with this kernel -- confirm */
	char *libdevpart;
} cuda_kernel_t;

#ifdef USE_CACHING
/*
 * Fixed-capacity kernel cache: an array of CUDA_CACHE_SIZE kernel
 * descriptors plus a count of how many of them are cached.
 * Only available when USE_CACHING is defined.
 */
typedef struct
{
	/* Number of cached kernels */
	int nkernels;
	/* CUDA kernels */
	cuda_kernel_t kernels[CUDA_CACHE_SIZE];
} cuda_kernel_cache_t;
#endif /* USE_CACHING */

/*
 * Per-device state of a CUDA GPU: static device properties, bookkeeping,
 * the host-to-mediary address map and the CUDA driver handles.
 * The kernel storage at the end depends on USE_CACHING.
 */
typedef struct
{
	/* NOTE(review): undocumented in the original; presumably the device
	 * index/identifier -- confirm */
	int id;
	/* Device name */
	char device_name[256];
	/* CUDA compute capability version (major) */
	int device_sm_version_major;
	/* CUDA compute capability version (minor) */
	int device_sm_version_minor;
	/* Number of device multiprocessors */
	int device_num_mp;
	/* Number of cores per multiprocessor */
	int device_num_mp_cores;
	/* Maximum threads per block */
	int device_max_thread_block_size;
	/* Maximum blocks per grid */
	int device_max_blocks_per_grid;
	/* Global memory size */
	size_t device_global_mem_size;
	/* Number of successfully launched kernels */
	int num_launched_kernels;
	/* Architecture name */
	char *arch_name;
	/* ICV initial values */
	ort_icvs_t dev_icvs;
	/* Maps host addresses to CUDA mediary ones */
	set(hostmed_addrmap) addr_map;

	/* ---- CUDA-related fields ---- */

	/* CUDA device (typically a GPU) */
	CUdevice device;
	/* Thread-specific CUDA context */
	CUcontext *context;
	/* Thread-specific CUDA stream */
	CUstream *stream;
	/* POSIX key for thread-specific CUDA context */
	pthread_key_t ctx_key;
	/* POSIX key for thread-specific CUDA stream */
	pthread_key_t stream_key;
	/* CUDA module */
	CUmodule module;
#ifdef USE_CACHING
	/* CUDA kernel cache */
	cuda_kernel_cache_t *kernel_cache;
	/* NOTE(review): undocumented in the original; by analogy with ctx_key and
	 * stream_key, presumably the POSIX key for a thread-specific kernel
	 * cache -- confirm */
	pthread_key_t cache_key;
#else
	/* NOTE(review): undocumented in the original; the single kernel
	 * descriptor used when caching is disabled -- confirm */
	cuda_kernel_t *kernel;
#endif
} cuda_dev_t;

/*
 * Plain record of five int values.
 * NOTE(review): the field names suggest team/thread limits, a thread count
 * and a processor count exchanged with the device -- the original carries
 * no documentation, so confirm against the code that fills this in.
 */
typedef struct
{
	int max_teams, max_threads;
	int nthr, thread_limit;
	int nprocs;
} cuda_thrinfo_t;

/* Module-wide globals; only declared here, defined in another translation unit. */
/* NOTE(review): presumably the name of this device module -- confirm */
extern char *modulename;
/* NOTE(review): presumably the number of usable CUDA GPUs (cf. cuda_get_num_gpus) -- confirm */
extern int available_cuda_gpus;
/* NOTE(review): presumably an array of available_cuda_gpus device descriptors -- confirm */
extern cuda_dev_t *cuda_gpus;

/* ---- Generic utility functions ---- */

/*
 * Variadic debugging hook taking a format string followed by its arguments.
 * NOTE(review): presumably printf-style formatting -- confirm.
 */
extern void __dbg(const char * format, ...);

/*
 * Back end of the cuda_do() / cuda_do_crit() macros, which supply the calling
 * function's name, the CUresult to examine, the source file and line, a
 * destroy_context flag (0 from cuda_do, 1 from cuda_do_crit) and the device
 * concerned (NULL from cuda_do).
 * NOTE(review): what happens on error is not visible in this header.
 */
extern void __check_cuda_errors(const char *func,
                                CUresult err,
                                const char *file,
                                const int line,
                                int destroy_context,
                                cuda_dev_t *cuda_gpu);

/* NOTE(review): presumably returns the number of CUDA GPUs found -- confirm */
extern int cuda_get_num_gpus(void);

/* NOTE(review): presumably module-wide CUDA set-up and tear-down -- confirm */
extern void cuda_init(void);
extern void cuda_clear(void);

/* ---- Memory-related functions ---- */

/*
 * Takes a device, a symbol name, a source buffer and a byte count.
 * NOTE(review): presumably copies `bytes` bytes from `src` to the device
 * symbol called `symbol` on `cuda_gpu` -- confirm against the implementation.
 */
extern void cuda_memcpy_const(cuda_dev_t *cuda_gpu,
                              char *symbol,
                              void *src,
                              size_t bytes);

/* ---- Kernel-related functions ---- */

/*
 * Takes a device, a host function, a kernel filename and a kernel name and
 * returns an int.
 * NOTE(review): the returned int is presumably the `kernel_id` expected by
 * cuda_kernel_set_args() and cuda_kernel_launch() -- confirm.
 */
extern int cuda_kernel_cached_create(cuda_dev_t *cuda_gpu,
                                     void *(*host_func)(void *),
                                     char *kernel_filename,
                                     char *kernel_name);

/*
 * Takes the device, a device-data block with its size, the kernel id, an
 * array of argument counts and the argument pointers.
 * NOTE(review): `num_args` is presumably indexed by the num_args_e
 * constants -- confirm.
 */
extern void cuda_kernel_set_args(cuda_dev_t *cuda_gpu,
                                 void *devdata,
                                 size_t devdata_size,
                                 int kernel_id,
                                 int *num_args,
                                 void **args);

/*
 * Takes the device, the kernel id, the requested numbers of teams and
 * threads, two unsigned long long dimension values and a thread limit.
 * NOTE(review): the encoding of `teamdims` / `thrdims` is not visible in
 * this header -- check the implementation before relying on it.
 */
extern void cuda_kernel_launch(cuda_dev_t *cuda_gpu,
                               int kernel_id,
                               int num_teams,
                               int num_threads,
                               unsigned long long teamdims,
                               unsigned long long thrdims,
                               int thread_limit);

/* NOTE(review): presumably set-up / tear-down helpers for testing the kernel
 * cache of a device -- confirm */
extern void cuda_cache_test_init(cuda_dev_t *cuda_gpu);
extern void cuda_cache_test_destroy(cuda_dev_t *cuda_gpu);

#endif /* __CUDAUTILS_H__ */
