/*
  OMPi OpenMP Compiler
  == Copyright since 2001 the OMPi Team
  == Dept. of Computer Science & Engineering, University of Ioannina

  This file is part of OMPi.

  OMPi is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or
  (at your option) any later version.

  OMPi is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with OMPi; if not, write to the Free Software
  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/

/* This is the host-side part of the module; it should be compiled
 * to a shared library. It is dynamically linked to the host runtime at runtime.
 */

// #define DBGPRN_FORCE
// #define DBGPRN_BLOCK
#define DBGPRN_FILTER DBG_DEVICES

#include "cudautils.h"
#include "context.h"
#include "stream.h"
#include "stddefs.h"
#include <cudaProfiler.h>
#include "set.h"
#include "ptx.h"

/* Instantiates the set implementation for the hostmed_addrmap element type;
 * hm_initialize() creates one such set per device through set_new().
 * NOTE(review): the macro presumably comes from "set.h" -- confirm.
 */
SET_TYPE_IMPLEMENT(hostmed_addrmap)

int hm_sharedspace = 0;     /* No sharing space with the host */
int hm_unified_medaddr = 1; /* Internal and usable mediary addresses are the same */

/* Number of initialized GPUs: incremented by hm_initialize() and decremented
 * by hm_finalize(), which calls cuda_clear() when it drops back to zero.
 */
static int initialized_gpus;

/* Pointers to functions of the host runtime. The lock and yield entries are
 * stored by hm_register_ee_calls(); strprintf is stored by
 * hm_register_str_printf().
 */
void (*init_lock)(void **lock, int type);
void (*lock)(void **lock);
void (*unlock)(void **lock);
int  (*hyield)(void);
int  (*strprintf)(str s, char *fmt, ...);

/* Private copy of the module name; set by hm_set_module_name() and used
 * when composing kernel file names.
 */
char *modulename;

#if defined(ENABLE_KERNEL_BUNDLING) && (KERNEL_BUNDLING_MODE == BUNDLE_SOURCES)

/* Bundling using mode 0 (sources) requires creating & compiling the
 * kernel file from within the application.
 *
 * The kernel source name is composed as "<prefix>-<modulename>.c", looked
 * up in the kernel table and then handed to the kernel table routines that
 * create and compile the file. The process exits if the composed name does
 * not fit in the local buffer or if the kernel table has no such entry.
 *
 * @param kernel_filename_prefix filename of the kernel (without the suffix)
 */
static void create_and_compile_kernel_file(char *kernel_filename_prefix)
{
	char kernel_source[256];
	kernel_t *entry = NULL;
	int len;

	/* Bounded formatting: the prefix comes from the caller and may be long */
	len = snprintf(kernel_source, sizeof(kernel_source), "%s-%s.c",
	               kernel_filename_prefix,
	               modulename ? modulename : "noname");
	if (len < 0 || (size_t) len >= sizeof(kernel_source))
	{
		fprintf(stderr, "Kernel source name for %s is too long; exiting.\n",
		                kernel_filename_prefix);
		exit(EXIT_FAILURE);
	}

	entry = kerntab_search(kernel_source);
	if (!entry)
	{
		fprintf(stderr, "Could not retrieve kernel %s from kernel table; exiting.\n",
		                kernel_source);
		exit(EXIT_FAILURE);
	}

	kerntab_create_kernelfile(entry);
	kerntab_compile_kernelfile(entry);
}

#endif

/**
 * Stores a private copy of the module name handed over by the host.
 * When no name is given, the string "noname" is stored instead.
 *
 * @param modname the name of the module (may be NULL)
 *
 */
void  hm_set_module_name(char *modname)
{
	char *name = modname;

	if (name == NULL)
		name = "noname";

	modulename = strdup(name);
}

/**
 * Reports how many devices this module can serve; the value is whatever
 * cuda_get_num_gpus() returns.
 *
 * @return number of devices
 */
int hm_get_num_devices(void)
{
	int num_gpus = cuda_get_num_gpus();

	return num_gpus;
}


/**
 * Prints information for this module and its available devices, 
 * to a dedicated string (info). The information is printed using the 
 * registered strprintf function.
 * While the devices this module serves are numbered starting from 0 (local
 * device id), the global device ids are set by ORT; the devid_offset parameter
 * gives the global device id of local device 0.
 *
 * The module banner and the device count are printed once, followed by one
 * "device id < N > { ... }" section per available GPU.
 *
 * @param devid_offset  the global id of the 1st device served by this module
 * @param info          the string which all the information will be printed to
 */
void hm_print_information(int devid_offset, str info)
{
	cuda_dev_t *cuda_gpu;
	int i;

	cuda_init();

	/* Module-wide header: emitted once, not once per device */
	strprintf(info, "OMPi CUDA device module.\n");
	strprintf(info, "Available devices : %d\n\n", available_cuda_gpus);

	for (i = 0; i < available_cuda_gpus; i++)
	{
		cuda_gpu = &(cuda_gpus[i]);
		strprintf(info, "device id < %d > { \n", devid_offset + i);
		strprintf(info, "  name: %s (SM v%d.%d)\n", cuda_gpu->device_name, 
		                cuda_gpu->device_sm_version_major, 
		                cuda_gpu->device_sm_version_minor);
		strprintf(info, "  %d multiprocessors\n", 
		                cuda_gpu->device_num_mp);
		strprintf(info, "  %d cores per multiprocessor\n", 
		                cuda_gpu->device_num_mp_cores);
		strprintf(info, "  %d cores in total\n", 
		                cuda_gpu->device_num_mp * cuda_gpu->device_num_mp_cores);
		strprintf(info, "  %d maximum thread block size\n", 
		                cuda_gpu->device_max_thread_block_size);
		/* The size is widened first and then converted from bytes to Kbytes */
		strprintf(info, "  %llu Kbytes of device global memory\n", 
		                (unsigned long long) cuda_gpu->device_global_mem_size/1024);
		strprintf(info, "}\n");
	}

	/* NOTE(review): this clears the CUDA state unconditionally, whereas
	 * hm_finalize() only does so when no GPU remains initialized -- confirm
	 * that this function is never called while devices are in use.
	 */
	cuda_clear();
}


/**
 * Stores the host runtime callbacks for locking and yielding in the
 * module-level function pointers, so the rest of the module can call them.
 *
 * @param init_lock_in pointer to the function used for initializing a lock.
 *                     Its parameters are the address of a "void *" variable
 *                     and one of the "ORT_LOCK_*" defines denoting the type of
 *                     the lock
 * @param lock_in      pointer to the function used for acquiring a lock
 * @param unlock_in    pointer to the function used for releasing a lock
 * @param hyield_in    pointer to the function used for thread yield
 */
void hm_register_ee_calls(void (*init_lock_in)(void **lock, int type),
                          void (*lock_in)(void **lock),
                          void (*unlock_in)(void **lock),
                          int  (*hyield_in)(void))
{
	/* Thread yield */
	hyield = hyield_in;

	/* Lock handling */
	unlock = unlock_in;
	lock = lock_in;
	init_lock = init_lock_in;
}


/**
 * Registers the host runtime function this module uses for formatted
 * printing into a string (see hm_print_information()).
 *
 * @param str_printf_in pointer to the function used for printing to a string
 */
void hm_register_str_printf(int  (*str_printf_in)(str s, char *fmt, ...))
{
	int (*printer)(str s, char *fmt, ...) = str_printf_in;

	strprintf = printer;
}


/**
 * Initializes a device
 *
 * After cuda_init(), the device entry is looked up in cuda_gpus[], its id,
 * address map and launched-kernel counter are reset, the ICVs are copied (if
 * given) and the context, profiler and stream are set up. In builds without
 * USE_CACHING a kernel descriptor is allocated as well.
 *
 * @param dev_num the (local) id of the device to initialize
 *                (0 <= dev_num < hm_get_num_devices())
 * @param ort_icv Pointer to struct with
 *                initial values for the device ICVs.
 * @param argc    Pointer to main function's argc (not used here).
 * @param argv    Pointer to main function's argv (not used here).
 *
 * @return        devinfo: arbitrary pointer that will be passed back in
 *                following calls (see below).
 *                Return NULL only if it failed to initialize (device id out
 *                of range or, without USE_CACHING, out of memory).
 */
void *hm_initialize(int dev_num, ort_icvs_t *ort_icv, int *argc, char ***argv)
{
	cuda_dev_t *cuda_gpu;
#if !defined(USE_CACHING)
	cuda_kernel_t *kernel;
#endif

	DBGPRN((stderr, "%s: in\n", __FUNCTION__));

	cuda_init();

	/* cuda_gpus[] holds available_cuda_gpus entries; never index outside it */
	if (dev_num < 0 || dev_num >= available_cuda_gpus)
	{
		DBGPRN((stderr, "%s: invalid device id %d\n", __FUNCTION__, dev_num));
		if (initialized_gpus == 0)
			cuda_clear();
		return NULL;
	}

#if !defined(USE_CACHING)
	/* Allocate before touching any device state so that a failure needs
	 * no teardown of context/stream.
	 */
	kernel = (cuda_kernel_t*) malloc(sizeof(cuda_kernel_t));
	if (kernel == NULL)
	{
		DBGPRN((stderr, "%s: out of memory\n", __FUNCTION__));
		if (initialized_gpus == 0)
			cuda_clear();
		return NULL;
	}
#endif

	cuda_gpu = &(cuda_gpus[dev_num]);
	
	cuda_gpu->id = dev_num;
	cuda_gpu->addr_map = set_new(hostmed_addrmap);
	cuda_gpu->num_launched_kernels = 0;
	
	if (ort_icv != NULL)
		cuda_gpu->dev_icvs = *ort_icv;
	
	cuda_ctx_test_init(cuda_gpu);
	cuProfilerStart();
	cuda_stream_test_init(cuda_gpu);

#if defined(USE_CACHING)
	cuda_cache_test_init(cuda_gpu);
#else
	cuda_gpu->kernel = kernel;
#endif

	initialized_gpus++;

	DBGPRN((stderr, "%s: OK\n", __FUNCTION__));

	return (void*) cuda_gpu;
}


/**
 * Finalizes a device
 *
 * @param devinfo the pointer returned by hm_initialize()
 */
void hm_finalize(void *devinfo)
{
	CUresult err;
	cuda_dev_t *cuda_gpu = (cuda_dev_t*) devinfo;
	int i;

	DBGPRN((stderr, "%s: in\n", __FUNCTION__));
	cuProfilerStop();

	cuda_stream_test_destroy(cuda_gpu);
	DBGPRN((stderr, "%s: destroyed stream\n", __FUNCTION__));
	cuda_ctx_test_destroy(cuda_gpu);
	DBGPRN((stderr, "%s: destroyed context\n", __FUNCTION__));

#if defined(USE_CACHING)
	cuda_cache_test_destroy(cuda_gpu);
#endif

	set_free(cuda_gpu->addr_map);

	initialized_gpus--;
	if (initialized_gpus == 0)
		cuda_clear();

	DBGPRN((stderr, "%s: OK\n", __FUNCTION__));
}


/**
 * Offloads and executes a kernel file.
 *
 * The launch parameters are normalized first: num_teams of 0 becomes 1 and
 * is capped at the device's maximum blocks per grid; a non-positive
 * thread_limit becomes the device's maximum thread block size; a negative
 * num_threads becomes CUDA_OPTIMAL_NUMTHREADS, 0 becomes 1 and larger values
 * are capped at thread_limit. The kernel file name is composed as
 * "<prefix>-<modulename>.<CUDA_KERNEL_EXTENSION>"; the process exits if that
 * name does not fit in the local buffer.
 *
 * @param device_info         the pointer returned by hm_initialize()
 * @param host_func           pointer to offload function on host address space
 * @param devdata             pointer to a struct containing kernel variables
 * @param decldata            pointer to a struct containing globally declared
 *                            variables (not used here)
 * @param kernel_filename_prefix filename of the kernel (without the suffix)
 * @param num_teams           num_teams clause from "teams" construct
 * @param num_threads         num_threads clause from combined "parallel" constructs
 * @param thread_limit        thread_limit clause from "teams" construct
 * @param teamdims            an unsigned long long that contains the
 *                            dimensions of the launched league, encoded as follows:
 *                            x: bits 0-20, y: bits 21-41, z: bits 42-62 
 * @param thrdims             an unsigned long long that contains the
 *                            dimensions of each thread team, encoded as follows:
 *                            x: bits 0-20, y: bits 21-41, z: bits 42-62 
 * @param num_args            an array that contains the number of declare variables, 
 *                            firstprivates and mapped variables
 * @param args                the addresses of all target data and target
 *                            declare variables
 *
 * NOTE: `teamdims' and `thrdims' can be decoded using the _ull_decode3 function.
 */
void hm_offload(void *device_info, void *(*host_func)(void *), void *devdata,
				void *decldata, char *kernel_filename_prefix, int num_teams,
				int num_threads, int thread_limit, 
				unsigned long long teamdims, unsigned long long thrdims, 
				int *num_args, void **args)
{
	cuda_dev_t *cuda_gpu = (cuda_dev_t *) device_info;
	cuda_thrinfo_t thrinfo;
	int devdata_size = 0;

	char kernel_filename[256];
	int new_kernel_id;
	int len;
	
	if (num_teams == 0)
		num_teams = 1;
	else if (num_teams > cuda_gpu->device_max_blocks_per_grid)
		num_teams = cuda_gpu->device_max_blocks_per_grid;

	if (thread_limit <= 0)
		thread_limit = cuda_gpu->device_max_thread_block_size;

	if (num_threads < 0)
		num_threads = CUDA_OPTIMAL_NUMTHREADS;
	else if (num_threads == 0)
		num_threads = 1;
	else if (num_threads > thread_limit)
		num_threads = thread_limit;

	DBGPRN((stderr, "%s: init context\n", __FUNCTION__));

	cuda_ctx_test_init(cuda_gpu);
	cuda_stream_test_init(cuda_gpu);

#if defined(USE_CACHING)
	cuda_cache_test_init(cuda_gpu);
#endif

	DBGPRN((stderr, "%s: in - num_teams: %d - thread_limit: %d\n", 
	                __FUNCTION__, num_teams, thread_limit));

#if defined(ENABLE_KERNEL_BUNDLING) && (KERNEL_BUNDLING_MODE == BUNDLE_SOURCES)
	create_and_compile_kernel_file(kernel_filename_prefix);
#endif

	/* Bounded formatting: the prefix comes from the caller and may be long */
	len = snprintf(kernel_filename, sizeof(kernel_filename), "%s-%s.%s",
	               kernel_filename_prefix,
	               modulename ? modulename : "noname", CUDA_KERNEL_EXTENSION);
	if (len < 0 || (size_t) len >= sizeof(kernel_filename))
	{
		fprintf(stderr, "Kernel file name for %s is too long; exiting.\n",
		                kernel_filename_prefix);
		exit(EXIT_FAILURE);
	}

	new_kernel_id = cuda_kernel_cached_create(cuda_gpu, host_func, kernel_filename, 
	                                         "_kernelFunc_");

	DBGPRN((stderr, "%s: new kernel ID %d\n", __FUNCTION__, new_kernel_id));

	if (devdata)
		devdata_size = ort_mapped_get_size(devdata);

	cuda_kernel_set_args(cuda_gpu, devdata, devdata_size, new_kernel_id, num_args, args);

	/* Thread/team limits handed to the device side through the "thrinfo"
	 * constant
	 */
	thrinfo.thread_limit = thread_limit;
	thrinfo.max_threads = cuda_gpu->device_max_thread_block_size;
	thrinfo.max_teams = cuda_gpu->device_max_blocks_per_grid;
	thrinfo.nprocs = cuda_gpu->device_num_mp_cores;
	thrinfo.nthr = num_threads;

	cuda_memcpy_const(cuda_gpu, "thrinfo", &thrinfo, 
		sizeof(cuda_thrinfo_t));
	
	/* num_threads > 1, when having parallel regions */
	cuda_kernel_launch(cuda_gpu, new_kernel_id, num_teams, 
		num_threads, teamdims, thrdims, thread_limit);
	
	DBGPRN((stderr, "%s: OK\n", __FUNCTION__));
}


/**
 * Allocates memory "on the device"
 *
 * When map_memory is non-zero the space is obtained from ort_mapped_alloc()
 * instead of the device (devdata and decldata are kept on the host side and
 * copied to the device during offload); otherwise it is allocated with
 * cuMemAlloc().
 *
 * @param device_info the pointer returned by hm_initialize()
 * @param size        the number of bytes to allocate
 * @param map_memory  used in OpenCL, when set to 1 additionaly to the memory
 *                    allocation in shared virtual address space, the memory
 *                    is mapped with read/write permissions so the host cpu
 *                    can utilize it.
 * @param hostaddr    used in MPI to allocate #declare target link variables;
 *                    you can safely ignore this argument.
 * @return            pointer to the allocated space (internal mediary address)
 */
void *hm_dev_alloc(void *device_info, size_t size, int map_memory, void *hostaddr)
{
	CUresult err;
	CUdeviceptr allocated_memory;
	cuda_dev_t *cuda_gpu = (cuda_dev_t *) device_info;

	cuda_ctx_test_init(cuda_gpu);
	cuda_stream_test_init(cuda_gpu);

#if defined(USE_CACHING)
	cuda_cache_test_init(cuda_gpu);
#endif 

	DBGPRN((stderr, "%s: in\n", __FUNCTION__));

	/* Devdata and decldata are kept in host side and copied
	to device during offload */
	if (map_memory)
	{
		/* size is a size_t, so it is printed with %zu */
		DBGPRN((stderr, "%s: OK (host map), size=%zu\n", __FUNCTION__, size));
		return ort_mapped_alloc(size);
	}

	cuda_do(cuMemAlloc(&allocated_memory, size));
	/* CUdeviceptr is an integer type; convert it before printing with %p */
	DBGPRN((stderr, "%s: OK (device), size=%zu, hostaddr = %p, allocated=%p\n", 
	                __FUNCTION__, size, hostaddr, (void *) allocated_memory));

	return (void *) allocated_memory;
}


/**
 * Allocates & initializes memory "on the device" for a global variable:
 * the space is obtained through hm_dev_alloc() (never host-mapped) and is
 * then filled from `initfrom' through hm_todev().
 *
 * @param device_info the pointer returned by hm_initialize()
 * @param initfrom    host memory holding the initial contents
 * @param size        the number of bytes to allocate and copy
 * @param global_id   the ID of the global variable (not used here)
 * @param hostaddr    forwarded as is to hm_dev_alloc()
 *
 * @return            pointer to the allocated space (internal mediary address)
 */
void *hm_dev_init_alloc_global(void *device_info, void *initfrom, size_t size, int global_id,
                               void *hostaddr)
{
	void *imedaddr;

	/* Plain device allocation (map_memory = 0) */
	imedaddr = hm_dev_alloc(device_info, size, 0, hostaddr);

	/* Copy the whole initializer, starting at offset 0 on both sides */
	hm_todev(device_info, initfrom, 0L, imedaddr, 0L, size);

	return imedaddr;
}


/**
 * Frees data allocated with hm_dev_alloc
 *
 * Memory is released either with cuMemFree() or, when unmap_memory is
 * non-zero, with ort_mapped_free().
 *
 * @param device_info  the pointer returned by hm_initialize()
 * @param imedaddr     pointer to the memory that will be released
 * @param unmap_memory used in OpenCL, when set to 1 prior to the memory
 *                     deallocation, the memory is unmapped.
 */
void hm_dev_free(void *device_info, void *imedaddr, int unmap_memory)
{
	CUresult err;
	cuda_dev_t *cuda_gpu = (cuda_dev_t *) device_info;

	DBGPRN((stderr, "%s: in\n", __FUNCTION__));

	cuda_ctx_test_init(cuda_gpu);
	cuda_stream_test_init(cuda_gpu);

#if defined(USE_CACHING)
	cuda_cache_test_init(cuda_gpu);
#endif

	if (!unmap_memory)
	{
		/* Regular device memory */
		cuda_do(cuMemFree((CUdeviceptr) imedaddr));
		DBGPRN((stderr, "%s: OK (device)\n", __FUNCTION__));
	}
	else
	{
		/* Space that was handed out by the host-side mapped allocator */
		ort_mapped_free(imedaddr);
		DBGPRN((stderr, "%s: OK (host unmap)\n", __FUNCTION__));
	}
}


/**
 * Frees a global variable allocated with hm_dev_init_alloc_global; this
 * simply forwards to hm_dev_free() without unmapping.
 *
 * @param device_info  the device
 * @param iaddr        pointer to the memory that will be released
 * @param global_id    the ID of the global variable that will be released
 *                     (not used here)
 */
void hm_dev_free_global(void *device_info, void *iaddr, int global_id)
{
	const int unmap_memory = 0;   /* globals are plain device allocations */

	hm_dev_free(device_info, iaddr, unmap_memory);
}


/**
 * Transfers data from the host to a device
 *
 * The copy is issued with cuMemcpyHtoDAsync() on the stream returned by
 * cuda_stream_get(); this function does not synchronize the stream itself.
 *
 * @param device_info the pointer returned by hm_initialize()
 * @param hostaddr    the source memory
 * @param hostoffset  offset from hostaddr
 * @param imedaddr    the target memory (internal mediary address)
 * @param devoffset   offset from imedaddr
 * @param size        the size of the memory block
 */
void hm_todev(void *device_info, void *hostaddr, size_t hostoffset,
			   void *imedaddr, size_t devoffset, size_t size)
{
	CUresult err;
	CUstream *stream;
	cuda_dev_t *cuda_gpu = (cuda_dev_t *) device_info;

	cuda_ctx_test_init(cuda_gpu);
	cuda_stream_test_init(cuda_gpu);

#if defined(USE_CACHING)
	cuda_cache_test_init(cuda_gpu);
#endif

	stream = cuda_stream_get(cuda_gpu);

	/* Byte offsets are applied through char *, since arithmetic on void *
	 * is not standard C
	 */
	cuda_do(cuMemcpyHtoDAsync((CUdeviceptr) imedaddr + devoffset, 
				(char *) hostaddr + hostoffset, size, *stream));
	/* size is a size_t, so it is printed with %zu */
	DBGPRN((stderr, "%s: bytes = %zu, imedaddr = %p\n", __FUNCTION__, size, imedaddr));
	// set_put(cuda_gpu->addr_map, hostaddr)->value = (CUdeviceptr) imedaddr;
	DBGPRN((stderr, "%s: OK\n", __FUNCTION__));
}


/**
 * Transfers data from a device to the host
 *
 * The copy is issued with cuMemcpyDtoHAsync() on the stream returned by
 * cuda_stream_get() and is followed by a cuda_stream_sync() call before
 * returning.
 *
 * @param device_info the pointer returned by hm_initialize()
 * @param hostaddr    the target memory
 * @param hostoffset  offset from hostaddr
 * @param imedaddr    the source memory (internal mediary address)
 * @param devoffset   offset from imedaddr
 * @param size        the size of the memory block
 */
void hm_fromdev(void *device_info, void *hostaddr, size_t hostoffset,
				 void *imedaddr, size_t devoffset, size_t size)
{
	CUresult err;
	CUstream *stream;
	cuda_dev_t *cuda_gpu = (cuda_dev_t *) device_info;

	cuda_ctx_test_init(cuda_gpu);
	cuda_stream_test_init(cuda_gpu);

#if defined(USE_CACHING)
	cuda_cache_test_init(cuda_gpu);
#endif

	stream = cuda_stream_get(cuda_gpu);

	DBGPRN((stderr, "%s: in\n", __FUNCTION__));
	/* Byte offsets are applied through char *, since arithmetic on void *
	 * is not standard C
	 */
	cuda_do(cuMemcpyDtoHAsync((char *) hostaddr + hostoffset, 
				(CUdeviceptr) imedaddr + devoffset, size, *stream));
	cuda_stream_sync(cuda_gpu);

	DBGPRN((stderr, "%s: OK\n", __FUNCTION__));
}


/**
 * Translates an internal mediary address to a usable mediary address.
 * In this module the two coincide (see hm_unified_medaddr), so the address
 * is handed back untouched.
 *
 * @param device_info  the pointer returned by hm_initialize()
 * @param imedaddr     allocated memory from hm_dev_alloc
 *
 * @return usable mediary address to pass to a kernel
 */
void *hm_imed2umed_addr(void *device_info, void *imedaddr)
{
	void *umedaddr = imedaddr;   /* identity mapping */

	DBGPRN((stderr, "%s: in\n", __FUNCTION__));
	DBGPRN((stderr, "%s: addr = %p\n", __FUNCTION__, imedaddr));

	return umedaddr;
}


/**
 * Translates a usable mediary address back to the internal mediary address.
 * In this module the two coincide (see hm_unified_medaddr), so the address
 * is handed back untouched.
 *
 * @param device_info the device
 * @param umedaddr    allocated memory from hm_dev_alloc
 *
 * @return internal mediary address to be used by ORT
 */
void *hm_umed2imed_addr(void *device_info, void *umedaddr)
{
	void *imedaddr = umedaddr;   /* identity mapping */

	DBGPRN((stderr, "%s: in\n", __FUNCTION__));
	DBGPRN((stderr, "%s: addr = %p\n", __FUNCTION__, umedaddr));

	return imedaddr;
}
