/*
  OMPi OpenMP Compiler
  == Copyright since 2001 the OMPi Team
  == Dept. of Computer Science & Engineering, University of Ioannina

  This file is part of OMPi.

  OMPi is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or
  (at your option) any later version.

  OMPi is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with OMPi; if not, write to the Free Software
  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/

/* ort.c -- OMPi RunTime library */

#include "ort_prive.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <stdarg.h>
#include <assert.h>

#ifdef OMPI_REMOTE_OFFLOADING
	#include "remote/roff.h"
#endif

/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
 *                                                                   *
 * GLOBAL VARIABLES / DEFINITIONS / MACROS                           *
 *                                                                   *
 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */


/* All global variables ORT handles; if ee=proc, this is also placed
 * in shared memory.
 */
static ort_vars_t ort_globals;  /* All ORT globals stored here */
ort_vars_t        *ort;         /* Pointer to ort_globals (set in _ort_init();
                                   re-pointed by share_globals() if ee=proc) */

static int        ort_initd = 0;            /* Set to 1 when _ort_init() is done */
int             __ort_required  = 0;        /* Set by _ort_required() */
int             __ort_mods_required  = 0;   /* Set by _ort_modules_required() */

/* Remote devices snapshot; filled in by _ort_set_ompi_remote_devices() */
char *ompi_remote_devices = NULL;
unsigned long ompi_remote_devices_size = 0UL;

/* Execution entity (thread/process) control block.
 * With TLS the eecb pointer is kept directly in a thread-local variable;
 * otherwise it is stored under an ee-library key (created in
 * initialize_threading()).
 * NOTE(review): presumably __MYCB/__SETMYCB are built on top of these two;
 * their definitions are not in this file.
 */
#ifdef USE_TLS
	TLS_KEYWORD void *myeecb;
#else
	ee_key_t eecb_key;  /* Key for ort's "thread"-specific data; the actual
	                       data is a pointer to the eecb */
#endif

/* Handy macro: resets the given eecb to the state of a stand-alone initial
 * thread (level 0, team of 1, no parent, not bound to any place).
 * The place partition [pfrom, pto] is the whole place list; pto is an
 * inclusive bound, exactly as initialize_threading() sets it for the very
 * first initial thread.
 * The argument is evaluated many times; pass a plain pointer variable.
 */
#define initialize_eecb(eecb) {\
	(eecb)->mf              = NULL;\
	(eecb)->me_master       = NULL;\
	(eecb)->parent          = NULL;\
	(eecb)->sdn             = (eecb);\
	(eecb)->num_siblings    = 1;     /* We are just 1 thread! */\
	(eecb)->thread_num      = 0;\
	(eecb)->level           = 0;     /* The only one in level 0 */\
	(eecb)->activelevel     = 0;     /* The only one in level 0 */\
	(eecb)->shared_data     = 0;\
	(eecb)->mynextNWregion  = 0;\
	(eecb)->cgid            = 0;\
	(eecb)->ee_info         = NULL;  /* *Must* init to NULL */\
	(eecb)->tg_recycler     = NULL;  /* Recycler for taskgroup*/\
	(eecb)->currplace       = -1;    /* Not placed yet */\
	(eecb)->pfrom           = 0;\
	(eecb)->pto             = numplaces(ort->place_partition) - 1;\
}

/* Process id: the ee library's process id when execution entities are
 * processes, otherwise always 0.
 */
#if defined(EE_TYPE_PROCESS)
	#define __MYPID  ee_pid()
#else
	#define __MYPID  0
#endif

/* Timings (from omp_get_wtime()) gathered during initialization and printed
 * at the end of _ort_init(); debug builds only.
 */
#if (ORT_DEBUG & DBG_ORT)
	double threading_time, modules_time, env_time, ortinit_time;
#endif


/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
 *                                                                   *
 * INITIALIZATION / SHUTDOWN                                         *
 *                                                                   *
 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */


/* A node of the singly-linked list of auto-initializer functions;
 * nodes are pushed by _ort_initreqs_add() and consumed by initreqs_do().
 */
typedef struct initlist_s {
	void (*func)(void);         /* The initializer to call */
	struct initlist_s *next;    /* Next request (added earlier) */
} initlist_t;
static initlist_t *ort_initreqs;   /* The list of auto-initializers */


/* Call all initreqs functions */
static void initreqs_do()
{
	initlist_t *req;

	for (req = ort_initreqs; req != NULL; )
	{
		(req->func)();
		req = (ort_initreqs = req)->next;
		ort_free(ort_initreqs);
	}
	ort_initreqs = NULL;
}


/* Registers a function to be called by ort_init during startup; the new
 * request is pushed at the head of the list.
 * It is guaranteed by the parser that this is called *before* main() starts.
 */
void _ort_initreqs_add(void (*initfunc)(void))
{
	initlist_t *node;

	node = (initlist_t *) ort_calloc(sizeof(initlist_t));
	node->next = ort_initreqs;      /* Link in front of the older requests */
	node->func = initfunc;
	ort_initreqs = node;
}


/*
 * Last function called (just before exit).
 * "exitval" is what the original (user) main() returned.
 * Does nothing if the runtime was never marked as required; otherwise it
 * finalizes the devices and the ee library, flushes stdio and terminates
 * the process through _exit(), so it never returns in that case.
 */
void _ort_finalize(int exitval)
{
	if (!__ort_required) return;
	
	ort_finalize_devices();
#ifdef EE_CLUSTERIZED
	/* Process 0 runs ee_finalize() on the private stack context */
	if (!__MYPID)
	{
		makecontext(&private_stack_uc, (void *)ee_finalize, 1, exitval);
		swapcontext(&shared_stack_uc, &private_stack_uc);
	}
#endif

	ee_finalize(exitval);
#ifdef EE_TYPE_PROCESS
	thrtask_funcs_cleanup();    /* Release the thread/task function tables */
#endif
	fflush(stderr);       /* Because _exit may not flush stdio buffers */
	fflush(stdout);
	_exit(exitval);             /* Make sure nothing else is called */
}


/* Called only if the user calls exit(); registered through atexit() in
 * initialize_threading(). Finalizes the ee library with exit value -1.
 */
void _at_exit_call(void)
{
	ee_finalize(-1);
}

#ifdef EE_TYPE_PROCESS
static void share_globals(void);
#endif


/* Initializes the execution-entity (ee) library and everything that depends
 * on it: reconciles the user-requested ICVs with the library's capabilities,
 * initializes ORT's global locks, prepares the initial thread's eecb (binding
 * it if requested), initializes tasking and the initial league, and registers
 * the atexit() handler.
 * Terminates through ort_error() if the ee library cannot be initialized or
 * cannot provide the explicitly requested number of threads.
 */
static void initialize_threading(void)
{
	ort_eecb_t *initial_eecb;
	void league_initial();

	/* The ee library is asked for one thread less (the initial thread is
	 * not its business); the value is restored further below.
	 */
	if (ort->icvs.nthreads > 0) ort->icvs.nthreads--;  /* 1- for eelib */
	if (ee_initialize(ort->argc, ort->argv, &ort->icvs, &ort->eecaps) != 0)
		ort_error(1, "cannot initialize the thread library.\n");

	/* Check for conformance to user requirements */
	if (ort->icvs.nthreads == -1)  /* Let the eelib set the default */
		ort->icvs.nthreads = ort->eecaps.default_numthreads + 1;
	else                          /* user asked explicitly */
	{
		/* It is an error only if the request *exceeds* what the library
		 * supports and we are not allowed to adjust dynamically.
		 */
		if (ort->eecaps.max_threads_supported > -1 &&
		    ort->icvs.nthreads > ort->eecaps.max_threads_supported)
			if (!ort->icvs.dynamic || !ort->eecaps.supports_dynamic)
				ort_error(1, "the library cannot support the requested number (%d) "
				          "of threads.\n", ort->icvs.nthreads + 1);
		ort->icvs.nthreads++;        /* Restore value */
	}
	/* Fix discrepancies */
	if (ort->icvs.dynamic && !ort->eecaps.supports_dynamic)
		ort->icvs.dynamic = 0;
	if (ort->icvs.nested  && !ort->eecaps.supports_nested)
		ort->icvs.nested  = 0;
	check_nested_dynamic(ort->icvs.nested, ort->icvs.dynamic); /* is eelib ok? */

	/* OpenMP 3.0 stuff: clamp the limits to what the library supports */
	if (ort->eecaps.max_levels_supported != -1 &&
	    ort->eecaps.max_levels_supported < ort->icvs.levellimit)
		ort->icvs.levellimit = ort->eecaps.max_levels_supported;

	if (ort->eecaps.max_threads_supported != -1 &&
	    ort->eecaps.max_threads_supported < ort->icvs.threadlimit)
		ort->icvs.threadlimit = ort->eecaps.max_threads_supported;

	/* Initialize the 4 locks we need */
	ee_init_lock((ee_lock_t *) &ort->atomic_lock, ORT_LOCK_SPIN);
	ee_init_lock((ee_lock_t *) &ort->preparation_lock, ORT_LOCK_NORMAL);
	ee_init_lock((ee_lock_t *) &ort->eecb_rec_lock, ORT_LOCK_NORMAL);
	ee_init_lock((ee_lock_t *) &ort->host_kernel_lock, ORT_LOCK_NORMAL);

	/* Recycle bin of eecbs is empty */
	ort->eecb_recycler = NULL;

	ort->thrpriv_num = 0;

	/* The initial thread */
	initial_eecb = eecb_initial_prepare(NULL);

	/* If binding is enabled, bind initial thread to the first place of list */
	if (ort->icvs.proc_bind != omp_proc_bind_false && 
	    ort->eecaps.supports_proc_binding)
		initial_eecb->currplace = ee_bindme(ort->place_partition, 0);
	/* The place partition of the initial thread is the whole place list */
	initial_eecb->pfrom = 0;
	initial_eecb->pto   = numplaces(ort->place_partition) - 1;

#ifdef USE_TLS
#else
	ee_key_create(&eecb_key, 0);  /* This key stores a pointer to the eecb */
#endif
	__SETMYCB(initial_eecb);
	tasking_init();
	ort->initial_eecb = initial_eecb;

	league_initial();             /* This is the very initial league */
	atexit(_at_exit_call); /* Upon exit .. */
}


/* This is the ORT initialization part that comes after modules discovery.
 * It is a separate function because some esoteric modules may block during
 * their discovery/initialization; just before blocking they have a chance
 * to complete ORT initialization by calling this function explicitly.
 *
 * In order: initializes the declare-target structures (if modules are
 * required), reads the environment, sizes the task queue if it is dynamic,
 * initializes threading, shares the globals (ee=proc only) and finally
 * runs the registered auto-initializers.
 */
void ort_init_after_modules()
{
#if ORT_DEBUG & DBG_ORT
	double t0 = omp_get_wtime();
#endif
	
	/* The following need to know the number of discovered devices */
	if (__ort_mods_required)
		target_decltarg_initialize();  /* Initialize declared variables structure */

	env_get_environment();                       /* Get environmental variables */

#if ORT_DEBUG & DBG_ORT
	env_time = omp_get_wtime() - t0;
#endif

	/* Initialize OMPi's taskqueuesize, if requested */
	if (ort->dynamic_taskqueuesize)
		ort->taskqueuesize = 3 * (ort->icvs.ncpus);

	/* Initialize the execution entities */
	initialize_threading();

#if ORT_DEBUG & DBG_ORT
	threading_time = omp_get_wtime() - env_time - t0;
#endif

#if defined(EE_TYPE_PROCESS)
	#if defined(EE_CLUSTERIZED)
		//MPI_Bcast((void *)ort, sizeof(ort_vars_t), MPI_CHAR, 0, OMPI_WORLD);
	#endif
	share_globals();   /* From now on "ort" points into the shared area */
#endif

	initreqs_do();
}


#if defined(EE_CLUSTERIZED)

/* Clusterized entry point: a trampoline for the actual initialization code.
 * The #define below renames every later occurrence of _ort_init in this
 * file, so the full initializer defined further down is actually compiled
 * as _ort_onprvstack_initialize(). Process 0 runs it on the private stack
 * context; all other processes call it directly.
 * Note that both paths pass 0 for embedmode and nmodules, regardless of the
 * caller's arguments.
 * NOTE(review): no value is returned although the function is declared int.
 */
int _ort_init(int *argc, char ***argv, int embedmode, int nmodules, ...)
{
	#define _ort_init _ort_onprvstack_initialize
	int _ort_onprvstack_initialize(int *argc, char ***argv, int embedmode, int nmodules, ...);

#if ORT_DEBUG & DBG_ORT
	printf("[clompi] process %d initializing ORT\n", __MYPID);
#endif

	if (!__MYPID)
	{
		makecontext(&private_stack_uc, (void *)_ort_onprvstack_initialize, 4,
				argc, argv, 0, 0);
		swapcontext(&shared_stack_uc, &private_stack_uc);
	}
	else
		_ort_onprvstack_initialize(argc, argv, 0, 0);
}

#endif


/* Collects the next `size` char* arguments of `ap` into a newly allocated
 * array (obtained from ort_alloc()) and returns it; the caller is
 * responsible for releasing the array.
 */
static char **valist_to_array(int size, va_list ap)
{
	char **names = ort_alloc(size * sizeof(char*));
	char **slot = names;
	int  remaining;

	for (remaining = size; remaining > 0; remaining--)
		*slot++ = va_arg(ap, char *);

	return names;
}


/*
 * First function called.
 * The embedmode flag was added for special cases, e.g. when ORT is "embedded"
 * within another ORT (e.g. in the proc device module). There is nothing
 * special about it though. Basically used for suppressing multiple info
 * message printouts.
 *
 * The variadic arguments are nmodules module names (char *); they are only
 * consumed if the module functionality has been marked as required.
 * Returns 0 without doing anything if (in non-embedded mode) the runtime is
 * not required or is already initialized; returns 1 after initializing.
 * Note: in EE_CLUSTERIZED builds an earlier #define makes this definition
 * compile under the name _ort_onprvstack_initialize.
 */
int _ort_init(int *argc, char ***argv, int embedmode, int nmodules, ...)
{
	va_list ap;
	char **modnames;

#if ORT_DEBUG & DBG_ORT
	double t0 = omp_get_wtime(), t1;
#endif

	if ((!embedmode) && ((!__ort_required) || (ort_initd))) return 0;

	/* Some defaults */
	ort = &ort_globals;
	ort->icvs.ncpus           = sdeps_get_num_procs(); /* Default ICV values */
	ort->icvs.stacksize       = -1;                  /* OpenMP 3.0 */
	ort->icvs.threadlimit     = 1 << 30;             /* (unlimited) OpenMP 3.0 */
	ort->icvs.levellimit      = 1 << 30;             /* (unlimited) OpenMP 3.0 */
	ort->icvs.waitpolicy      = _OMP_ACTIVE;         /* OpenMP 3.0 */
	ort->icvs.nthreads        = -1;        /* per-task; no preference for now */
	ort->icvs.rtschedule      = omp_sched_auto;
	ort->icvs.rtchunk         = 0;
	ort->icvs.dynamic         = 1;
	ort->icvs.nested          = 0;
	ort->icvs.proc_bind       = omp_proc_bind_false; /* OpenMP 4.0 */
	ort->icvs.cancel          = 0;                   /* OpenMP 4.0 */
	/* Default = the first non-host device or the host if it is the only device */
	ort->icvs.def_ompdevid    = 0;                   /* OpenMP 4.0 & 5.1 */
	ort->place_partition      = NULL;                /* OpenMP 4.0 */
	/* Call of places_get_default() was moved to env_get_environment() in
	 * order to avoid a redundant (second) topology detection.
	 */
	ort->num_devices          = 0;                   /* OpenMP 4.0 */
	ort->icvs.max_task_prio   = 0;                   /* OpenMP 4.5 */
	ort->icvs.display_affinity = 0;                  /* OpenMP 5.0 */
	ort->icvs.affinity_format = ort_get_default_affinity_format();/* OpenMP 5.0 */
	ort->icvs.targetoffload   = OFFLOAD_DEFAULT;     /* OpenMP 5.0 */
	ort->icvs.nteams          = 0;                   /* OpenMP 5.1 */
	ort->icvs.teams_thread_limit = 0;                /* OpenMP 5.1 */
	ort->hostdevs             = NULL;
	ort->module_host.unified_medaddr = 1;
	ort->embedmode            = embedmode;
	ort->argc                 = argc;
	ort->argv                 = argv;

#if ORT_DEBUG & DBG_ORT
	t1 = omp_get_wtime();
#endif

#ifdef OMPI_REMOTE_OFFLOADING
	if (!embedmode)
		roff_man_prepare_node(argc, argv);
#endif
	
	if (__ort_mods_required)
	{
		va_start(ap, nmodules);

		/* Worker nodes obtain the module names (and their count) from the
		 * remote offloading layer; everybody else from the arguments.
		 */
#ifdef OMPI_REMOTE_OFFLOADING
		if (node_role == ROLE_WORKER)
			modnames = roff_worker_get_modules(&nmodules);
		else
#endif
		modnames = valist_to_array(nmodules, ap);

		/* Initialize modules and device structures */
		ort_discover_modules(nmodules, modnames); /* Host is always added as dev 0 */
		va_end(ap);

		/* With remote offloading, only the primary node frees the array */
#ifdef OMPI_REMOTE_OFFLOADING
		if (node_role == ROLE_PRIMARY)
#endif
		ort_free(modnames);
	}
	
#if ORT_DEBUG & DBG_ORT
	modules_time = omp_get_wtime() - t1;
#endif
	
	/* Initializations that come after module discovery */
	ort_init_after_modules();
	if (ort->icvs.targetoffload == OFFLOAD_DISABLED)
		ort->num_devices = 0;     /* ditch all devices */
	
	if (!embedmode)            
		env_display_vars(-1);                   /* If asked, show environment */

	/* Process 0 was started on the private stack; switch back */
#if defined(EE_CLUSTERIZED)
  if (!__MYPID)
	  swapcontext(&private_stack_uc, &shared_stack_uc);
#endif

#if ORT_DEBUG & DBG_ORT
	ortinit_time = omp_get_wtime() - t0;
	printf("[ ort_init @ %.4lfsec\n"
	       "         modules @ %.4lfsec\n"
	       "     environment @ %.4lfsec\n"
	       "       threading @ %.4lfsec ]\n", 
	       ortinit_time, modules_time, env_time, threading_time);
	ort_debug_thread("<this is the master thread>");
#endif
	
#ifdef OMPI_REMOTE_OFFLOADING
	if (!embedmode && node_role == ROLE_WORKER)
		roff_worker_loop();
#endif
	
	return (ort_initd = 1);
}


/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
 *                                                                   *
 * SHARED MEMORY FOR THE PROCESS MODEL                               *
 *                                                                   *
 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */


#if defined(EE_TYPE_PROCESS)

static ort_sglvar_t *ort_sglvar_list;   /* The list of requests; filled by
                                           _ort_sglvar_allocate(), consumed
                                           by share_globals() */
static void         *ort_sglvar_area;   /* The start of the alloted memory */
static int          shared_data_id;     /* Id handed back by ort_shmalloc() */


/* Moves ORT's state into one shared memory segment and serves all pending
 * shared-global requests. After it returns, "ort", the caller's eecb and
 * its current task all live inside the segment, every requested variable
 * pointer points to its slot there, and the request list has been released.
 * Note: the arithmetic on the void * area pointer relies on a compiler
 * extension (void * treated as a byte pointer).
 */
static void share_globals(void)
{
	ort_sglvar_t *req;
	char         *mem;
	int          rsize;
	/* Each request stores a cumulative size, so the head gives the total */
	int          list_size = ((ort_sglvar_list!=NULL) ? ort_sglvar_list->size:0);

	/* Memory layout :
	 * a) shared globals (sgl), b) ort_globals, c) master_eecb, d) master_task
	 */
	ort_shmalloc(&ort_sglvar_area, list_size +
	               sizeof(ort_globals)+sizeof(ort_eecb_t)+sizeof(ort_task_node_t),
	             &shared_data_id);
	/* FIXME: why only CLUSTERIZED? Normal processes should do the same? */
#ifdef EE_CLUSTERIZED
	if (!ee_pid())  /* only one node needs to do the actual copying */
#endif
	{
		memcpy(ort_sglvar_area + list_size, ort, sizeof(ort_globals));
		memcpy(ort_sglvar_area + list_size + sizeof(ort_globals), __MYCB,
		       sizeof(ort_eecb_t));
		memcpy(ort_sglvar_area + list_size + sizeof(ort_globals) + sizeof(ort_eecb_t),
		       __CURRTASK(__MYCB), sizeof(ort_task_node_t));
	}
	/* Switch over to the shared copies */
	ort = (ort_vars_t *)(((char *) ort_sglvar_area) + list_size);
	ort->initial_eecb = __SETMYCB(ort_sglvar_area + list_size + sizeof(ort_globals));
	__SETCURRTASK(__MYCB, ort_sglvar_area + list_size + sizeof(
	                ort_globals) + sizeof(ort_eecb_t));
	__MYCB->sdn = ort_sglvar_area + list_size + sizeof(ort_globals);

#ifdef EE_CLUSTERIZED
	#if ORT_DEBUG & DBG_ORT
		printf("[clompi]: sharing gloabls; process %d -> eecb=%p\n",__MYPID,__MYCB);
	#endif
#endif

	/* Hand out consecutive slots, starting with the most recent request */
	for (mem = ort_sglvar_area, req = ort_sglvar_list; req != NULL;)
	{
		/* Sizes are cumulative: this request's own size is the difference
		 * from the next (older) one.
		 */
		rsize = (req->next != NULL) ? req->size - req->next->size
		        : req->size;
		*(req->varptr) = (void *) mem;
		/* FIXME: why only CLUSTERIZED? Normal processes should do the same? */
#ifdef EE_CLUSTERIZED
		if (!ee_pid())  /* only one node needs to do the actual copying */
#endif
		{
			if (req->initvalue)
				memcpy(mem, req->initvalue, rsize);
			else
				memset(mem, 0, rsize);
		}

		mem += rsize;

		/* Advance and free the request we just served */
		req = (ort_sglvar_list = req)->next;
		ort_free(ort_sglvar_list);
	}
}


/* Files a request for `size` bytes of shared-global space; it is guaranteed
 * by the parser that this is called *before* main() starts, so nothing is
 * allocated here. The actual allocation happens later, during _ort_init(),
 * at which point *varptr is made to point to the allotted space (initialized
 * from `initer`, or zeroed if `initer` is NULL).
 * Each request records the running total of all sizes requested so far.
 */
void _ort_sglvar_allocate(void **varptr, int size, void *initer)
{
	ort_sglvar_t *entry = (ort_sglvar_t *) ort_calloc(sizeof(ort_sglvar_t));
	int          cumulative = size;

	if (ort_sglvar_list)
		cumulative += ort_sglvar_list->size;

	entry->next      = ort_sglvar_list;
	entry->initvalue = initer;
	entry->size      = cumulative;
	entry->varptr    = varptr;
	ort_sglvar_list  = entry;
}


/* Thread/task function tables (ee=proc only): one for the registered thread
 * functions and one for the registered task functions.
 */
static ttkfunctable_t allthrfuncs, alltaskfuncs;


/* Called by compiler-generated code; adds func to the thread function
 * table.
 */
void _ort_register_thrfunc(ttkfunc_t func)
{
	ttkfunctable_add(&allthrfuncs, func, NULL);
}

/* Looks func up (by pointer) in the thread function table and returns what
 * ttkfunctable_findbyptr() reports -- presumably the function's id.
 */
int thrfunc_get_id(ttkfunc_t func)
{
	return ttkfunctable_findbyptr(&allthrfuncs, func);
}

/* Returns the thread function registered under id fid, as given by
 * ttkfunctable_getptr().
 */
ttkfunc_t thrfunc_get_ptr(int fid)
{
	return ttkfunctable_getptr(&allthrfuncs, fid);
}

/* Called by compiler-generated code; adds func to the task function table.
 */
void _ort_register_taskfunc(ttkfunc_t func)
{
	ttkfunctable_add(&alltaskfuncs, func, NULL);
}

/* Looks func up (by pointer) in the task function table and returns what
 * ttkfunctable_findbyptr() reports -- presumably the function's id.
 */
int taskfunc_get_id(ttkfunc_t func)
{
	return ttkfunctable_findbyptr(&alltaskfuncs, func);
}

/* Returns the task function registered under id fid, as given by
 * ttkfunctable_getptr().
 */
ttkfunc_t taskfunc_get_ptr(int fid)
{
	return ttkfunctable_getptr(&alltaskfuncs, fid);
}

/* Releases the storage of both function tables (called from
 * _ort_finalize()); free() ignores tables that were never allocated.
 */
void thrtask_funcs_cleanup()
{
	free(allthrfuncs.table);
	free(alltaskfuncs.table);
}

#endif


/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
 *                                                                   *
 * ATOMIC                                                            *
 *                                                                   *
 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */


/* The binding thread set of an atomic region is the set of all threads
 * in the program, not just the threads of the current team (OpenMP V2.5).
 * Thus ort->atomic_lock is global.
 *
 * Enters an atomic region by acquiring the global atomic lock.
 */
void _ort_atomic_begin()
{
	ee_set_lock((ee_lock_t *) &ort->atomic_lock);    /* ##### */
}


/* Leaves an atomic region by releasing the global atomic lock. */
void _ort_atomic_end()
{
	ee_unset_lock((ee_lock_t *) &ort->atomic_lock);    /* ##### */
}


/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
 *                                                                   *
 * EECBs AND INTEROPERABILITY                                        *
 * (an external thread may become an OpenMP master)                  *
 *                                                                   *
 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */


 /* Sets up the implicit task of an initial thread: the task node `t` (a new,
  * zeroed one is allocated if `t` is NULL) gets rtid -1, a copy of the global
  * ICVs and no taskgroup/taskscope. Returns the task node.
  */
 static 
 ort_task_node_t *prepare_initial_implicit_task(ort_task_node_t *t)
 {
	 ort_task_node_t *node = t;
 
	 if (!node)
		 node = (ort_task_node_t *) ort_calloc(sizeof(ort_task_node_t));
 
	 /* Bookkeeping */
	 node->rtid      = -1;
	 node->taskgroup = NULL;
	 node->taskscope = NULL;
 
	 /* ICVs are inherited from the global ones */
	 node->icvs.dynamic      = ort->icvs.dynamic;
	 node->icvs.nested       = ort->icvs.nested;
	 node->icvs.rtschedule   = ort->icvs.rtschedule;
	 node->icvs.rtchunk      = ort->icvs.rtchunk;
	 node->icvs.nthreads     = ort->icvs.nthreads;
	 /* OpenMP 4.0 */
	 node->icvs.def_ompdevid = ort->icvs.def_ompdevid;
	 node->icvs.threadlimit  = ort->icvs.threadlimit;
	 node->icvs.proc_bind    = ort->icvs.proc_bind;
	 node->icvs.cur_de       = NULL;
	 /* OMPi extensions */
	 node->icvs.xsched       = ort->icvs.xsched;
 
	 return (node);
 }
 
 
 /** 
	* Allocates (if needed) and initializes an eecb so that its owner becomes
	* an initial thread. The eecb is reset through initialize_eecb(); its
	* mf/me_master fields are preserved if cb already had a non-NULL mf,
	* otherwise they are allocated here. A new implicit task is always
	* allocated and installed as the current (implicit) task.
	* @param cb  An already existing eecb to be re-initialized or NULL to 
	*            allocate a new eecb. 
	* @return    cb if it was non-NULL, or the new eecb otherwise
  */
 ort_eecb_t *eecb_initial_prepare(ort_eecb_t *cb)
 {
	 ort_mcbf_t *mf = NULL;  /* To backup cb->mf if needed */
	 ort_eecb_t *mm = NULL;  /* To backup cb->me_master if needed */
	 ort_task_node_t *implicit_task;
 
	 if (cb == NULL)
		 cb = (ort_eecb_t *) ort_calloc_aligned(sizeof(ort_eecb_t), NULL);
	 else
		 if (cb->mf != NULL)  /* We should not re-allocate those 2 fields */
		 {
			 mf = cb->mf;       
			 mm = cb->me_master;
		 }
 
	 initialize_eecb(cb);    /* This NULLifies mf and me_master */
	 /* Prepare for possibly becoming a team master */
 #ifdef EE_TYPE_PROCESS
	 if (mf)
	 {
		 cb->mf = mf;
		 cb->me_master = mm;
	 }
	 else
	 {
		 int tpk_memid;
 
		 /* mf and its tpkeys table come from the ee library's shared memory */
		 ee_shmalloc((void *)&(cb->mf), sizeof(ort_mcbf_t), &tpk_memid);
		 cb->me_master = (ort_eecb_t *) ort_calloc_aligned(sizeof(ort_eecb_t), NULL);
		 cb->mf->tpksize = 128;   /* Enough tpkeys to minimize chances of reallocs */
		 ee_shmalloc((void *)&(cb->mf->tpkeys),
			 cb->mf->tpksize * sizeof(ort_tptable_t), &tpk_memid);
	 }
	 if (__MYPID == 0)       /* Only process 0 initializes the mf contents */
	 {
		 cb->mf->barrier = NULL;
		 ee_init_lock(&cb->mf->copyprivate.lock, ORT_LOCK_SPIN);
		 cb->mf->workshare.blocking.inited = 0;
	 }
 #else
	 if (mf)
	 {
		 cb->mf = mf;
		 cb->me_master = mm;
	 }
	 else
	 {
		 /* mf is always NULL in this branch, so mcbf_alloc() is what runs */
		 cb->mf = mf ? mf : (ort_mcbf_t *) mcbf_alloc();
		 cb->me_master = (ort_eecb_t *) ort_calloc_aligned(sizeof(ort_eecb_t), NULL);
		 /* At least 1 row is needed for initial thread's threadprivate vars */
		 cb->mf->tpkeys = ort_calloc(ort->icvs.ncpus*sizeof(ort_tptable_t));
		 cb->mf->tpksize = ort->icvs.ncpus;
	 }
 #endif
 
	 /* The initial thread's implicit task */
	 implicit_task = prepare_initial_implicit_task(NULL);
	 __SETCURRTASK(cb, implicit_task);
	 __SETCURRIMPLTASK(cb, implicit_task);
	 return (cb);
 }
 
 
 /* Releases everything eecb_initial_prepare() attached to cb: the tpkeys
  * table and mf (through the ee library's shared memory if ee=proc), the
  * implicit task, me_master and finally cb itself.
  */
 void free_initial_eecb(ort_eecb_t *cb)
 {
 #ifdef EE_TYPE_PROCESS
	 ee_shmfree(cb->mf->tpkeys);
	 ee_shmfree(cb->mf);
 #else
	 ort_free(cb->mf->tpkeys);
	 ort_free(cb->mf);    
 #endif
	 ort_free(__CURRIMPLTASK(cb));
	 ort_free(cb->me_master); 
	 ort_free(cb); 
 }
 
 
 /*
  * eecb_alloc_temp()/eecb_free_temp() deal with temporary EECBs that use
  * a recycle bin and are only called from parallel.c to implement the
  * threads-to-tasks optimization we do on nested workshares.
  */
void *eecb_alloc_temp(void)
{
	ort_eecb_t *eecb;

	ee_set_lock((ee_lock_t *) &ort->eecb_rec_lock);

	if (ort->eecb_recycler == NULL) /* If recycle bin is empty */
		eecb = ort_calloc_aligned(sizeof(ort_eecb_t), NULL); /* Allocate new eecb */
	else
		ort->eecb_recycler = (eecb = ort->eecb_recycler)->parent;

	ee_unset_lock((ee_lock_t *) &ort->eecb_rec_lock);
	return eecb;
}


/* Returns a temporary eecb to the recycle bin (nothing is actually freed);
 * the eecb is pushed at the head of the bin, which is chained through the
 * "parent" field and protected by ort->eecb_rec_lock.
 */
void eecb_free_temp(ort_eecb_t *eecb)
{
#if 0
	/* If you want to free eecb->mf stuff, do not free eecb->mf itself
	 * since it does not hold the actual allocation address (alloc_aligned)
	 */
	if (eecb->mf != NULL)
	{
		ee_barrier_destroy(&eecb->mf->barrier);
		reds_finalize(eecb);
	}
#endif
	
	ee_set_lock((ee_lock_t *) &ort->eecb_rec_lock);

	eecb->parent = ort->eecb_recycler;
	ort->eecb_recycler = eecb;

	ee_unset_lock((ee_lock_t *) &ort->eecb_rec_lock);
}


/* Prepares the eecb of team member `thrid` whose parent is `parent_info`
 * (an ort_eecb_t *); a zeroed eecb is allocated if `eecb` is NULL.
 * Returns the initialized eecb (psthreads/task optimization).
 */
void *alloc_init_eecb(ort_eecb_t *eecb, int thrid, void *parent_info)
{
	ort_eecb_t *creator = (ort_eecb_t *) parent_info;
	int        deeper;

	if (!eecb)
		eecb = ort_calloc_aligned(sizeof(ort_eecb_t), NULL);

	/* Position within the team */
	eecb->parent       = creator;
	eecb->sdn          = creator;
	eecb->num_siblings = creator->mf->num_children;
	eecb->thread_num   = thrid;                /* Thread id within the team */

	/* Nesting: always 1 level deeper, but a team of 1 is NOT an active
	 * parallel region (OpenMP 3).
	 */
	deeper = (eecb->num_siblings > 1) ? 1 : 0;
	eecb->level        = creator->level + 1;
	eecb->activelevel  = creator->activelevel + deeper;

	eecb->shared_data    = NULL;
	eecb->mynextNWregion = 0;
	eecb->ee_info        = NULL;     /* not needed actually due to calloc */

	return (void *) eecb;
}


/* Allocates a zeroed ort_mcbf_t, initializes its copyprivate (spin) lock
 * and returns it.
 */
void *mcbf_alloc(void)
{
	ort_mcbf_t *mf = (ort_mcbf_t *) ort_calloc_aligned(sizeof(ort_mcbf_t), NULL);
	ee_init_lock(&mf->copyprivate.lock, ORT_LOCK_SPIN);
	mf->workshare.blocking.inited = 0; // not required due to calloc
	return (void *) mf;
}


/* Turns the calling user thread (one that is not an OpenMP thread) into an
 * independent OpenMP master, i.e. an "initial" thread.
 * Repeated calls just reset the eecb (but not mf/me_master/tpkeys!)
 * Returns 0 on success, or -1 if the caller is an OpenMP thread at a
 * nesting level other than 0.
 */
int ompi_makeme_openmp_master()
{
	ort_eecb_t *self = __MYCB;

	if (self != NULL && self->level != 0)
		return (-1);    /* Error; called from a non-master openmp thread */

	self = eecb_initial_prepare(self);   /* Allocates one if we had none */
	__SETMYCB(self);
	return (0);
}


/* This function is called in case of target if(false).
 * A new OpenMP thread is set up (new eecb and task node acquired).
 * ICVs are initialized from the initial ones.
 * Finally the new "thread" executes the kernel func code.
 *
 * func         : the kernel function; called with `shared` as its argument
 * num_teams    : if > 0, temporarily replaces the global nteams ICV
 * thread_limit : if > 0, temporarily replaces the global teams_thread_limit
 *                ICV and sets the thread limit of the new implicit task
 *
 * The global ICVs are restored after the kernel finishes. Threads other than
 * the initial one serialize on ort->host_kernel_lock; the backups of the
 * global ICVs are taken *after* the lock is acquired, since otherwise a
 * waiting thread could snapshot the temporary values of the current lock
 * holder and later "restore" them.
 * NOTE(review): the initial thread does not take the lock at all, so it can
 * still interleave with other threads here.
 * 
 * TODO: Do we need this stuff? Can't we just call the kernel func?
 */
void execute_kernel_on_host(void *(*func)(void *), void *shared, int num_teams,
                            int thread_limit)
{
	ort_eecb_t *curr_eecb, *temp_eecb = eecb_initial_prepare(NULL);
	int thrlim_bak, nteams_bak;
	
	curr_eecb = __MYCB;    /* Backup current eecb */
	__SETMYCB(temp_eecb);  /* Get new eecb */
	
	/* Locking (if not initial thread) needed to Change device ICVs */
	if (curr_eecb != ort->initial_eecb)
		ee_set_lock((ee_lock_t *) &ort->host_kernel_lock);
	
	/* Snapshot the global ICVs only now that nobody else is changing them */
	thrlim_bak = ort->icvs.teams_thread_limit;
	nteams_bak = ort->icvs.nteams;
	
	if (thread_limit > 0)
	{
		__CURRIMPLTASK(temp_eecb)->icvs.threadlimit = thread_limit;
		ort->icvs.teams_thread_limit = thread_limit;
	}
	if (num_teams > 0)
		ort->icvs.nteams = num_teams;
	
	(*func)(shared);       /* Execute kernel func. */
	
	if (thread_limit > 0)
		ort->icvs.teams_thread_limit = thrlim_bak;
	if (num_teams > 0)
		ort->icvs.nteams = nteams_bak;
	
	if (curr_eecb != ort->initial_eecb)
		ee_unset_lock((ee_lock_t *) &ort->host_kernel_lock);
	
	__SETMYCB(curr_eecb);  /* Restore previous eecb */
	free_initial_eecb(temp_eecb);
}


/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
 *                                                                   *
 * UTILITY FUNCTIONS                                                 *
 *                                                                   *
 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */


/* Reports an executed error directive: `usermsg` (or a default text if it
 * is NULL) is printed through ort_error() with exit code 1 if `fatal` is
 * non-zero, otherwise through ort_warning().
 */
void _ort_handle_error(int fatal, char *usermsg)
{
	char *text = usermsg;

	if (text == NULL)
		text = "an #error directive was executed.";

	if (!fatal)
	{
		ort_warning("%s\n", text);
		return;
	}
	ort_error(1, "%s\n", text);
}


/* Prints "[ORT error]: " followed by the printf-style formatted message to
 * stderr and then terminates the program through exit(exitcode).
 */
void ort_error(int exitcode, char *format, ...)
{
	va_list args;

	fputs("[ORT error]: ", stderr);
	va_start(args, format);
	vfprintf(stderr, format, args);
	va_end(args);

	exit(exitcode);
}


/* Prints "[ORT warning]: " followed by the printf-style formatted message
 * to stderr; execution continues normally.
 */
void ort_warning(char *fmt, ...)
{
	va_list args;

	fputs("[ORT warning]: ", stderr);
	va_start(args, fmt);
	vfprintf(stderr, fmt, args);
	va_end(args);
}


/* malloc() wrapper: returns `size` bytes of uninitialized memory, or
 * reports the failure through ort_error() (exit code 1).
 */
void *ort_alloc(int size)
{
	void *block = malloc(size);

	if (block != NULL)
		return (block);

	ort_error(1, "[ort_alloc]: memory allocation failed for %d bytes\n", size);
	return (block);
}


/* This should allocate space globally (e.g. in a shared memory region); it
 * is basically useless:
 * a) only used by ort_prepare_omp_lock()
 * b) only used (shadowed) by the proc module
 * c) only works for GCC
 * It is only defined so that it can be shadowed by proc module's version.
 * This default version simply forwards to ort_alloc(); under GCC it is
 * declared weak so that another definition can take its place.
 */
#ifdef __GNUC__
__attribute__ ((weak)) void *ort_alloc_global(int size);
#endif
void *ort_alloc_global(int size)
{
	return ( ort_alloc(size) );
}


/* calloc() wrapper: returns `size` zeroed bytes, or reports the failure
 * through ort_error() (exit code 1).
 */
void *ort_calloc(int size)
{
	void *block = calloc(1, size);

	if (block != NULL)
		return (block);

	ort_error(1, "[ort_calloc]: memory allocation failed\n");
	return (block);
}


/* realloc() wrapper: resizes `original` to `size` bytes and returns the
 * (possibly moved) block, or reports the failure through ort_error()
 * (exit code 1).
 */
void *ort_realloc(void *original, int size)
{
	void *resized = realloc(original, size);

	if (resized != NULL)
		return (resized);

	ort_error(1, "[ort_realloc]: memory reallocation failed\n");
	return (resized);
}


void *ort_alloc_aligned(int size, void **actual)
{
#ifdef HAVE_POSIX_MEMALIGN
	void *tmp;

	if (posix_memalign(&tmp, CACHE_LINE, size))
		ort_error(1, "[ort_alloc_aligned]: posix_memalign failed\n");
	if (actual)
		*actual = tmp;
	return (tmp);
#else
	if (actual == NULL)
	{
		void *tmp = ort_alloc(size + CACHE_LINE - 1);
		return ((void *)((((ptrint)(tmp)) + CACHE_LINE - 1) &
		                 ((ptrint)(-CACHE_LINE))));
	}
	*actual = ort_alloc(size + CACHE_LINE - 1);
	return ((void *)((((ptrint)(*actual)) + CACHE_LINE - 1) &
	                 ((ptrint)(-CACHE_LINE))));
#endif
}


void *ort_calloc_aligned(int size, void **actual)
{
#ifdef HAVE_POSIX_MEMALIGN
	void *tmp;
	
	if (posix_memalign(&tmp, CACHE_LINE, size))
		ort_error(1, "[ort_calloc_aligned]: posix_memalign failed\n");
	memset(tmp, 0, size);
	if (actual)
		*actual = tmp;
	return (tmp);
#else
	if (actual == NULL)
	{
		void *tmp = ort_calloc(size + CACHE_LINE - 1);
		return ((void *)((((ptrint)(tmp)) + CACHE_LINE - 1) &
		                 ((ptrint)(-CACHE_LINE))));
	}
	*actual = ort_calloc(size + CACHE_LINE - 1);
	return ((void *)((((ptrint)(*actual)) + CACHE_LINE - 1) &
	                 ((ptrint)(-CACHE_LINE))));
#endif
}


void *ort_realloc_aligned(int oldsize, int newsize, void **actual)
{
#ifdef HAVE_POSIX_MEMALIGN
	void *tmp;
	
	if (posix_memalign(&tmp, CACHE_LINE, newsize))
		ort_error(1, "[ort_realloc_aligned]: posix_memalign failed\n");
	if (oldsize > 0)
		memcpy(tmp, *actual, (newsize > oldsize) ? oldsize : newsize);
	*actual = tmp;
	return (tmp);
#else
	if ((*actual = realloc(*actual, newsize + CACHE_LINE - 1)) == NULL)
		ort_error(1, "memory reallocation failed\n");
	return ((void *)((((ptrint)(*actual)) + CACHE_LINE - 1) &
	                 ((ptrint)(-CACHE_LINE))));
#endif
}


/* Releases memory obtained from the ort_*alloc() family.
 * Without posix_memalign(), an aligned area is returned to the caller at an
 * address that may differ from the actual allocation address, so such
 * pointers cannot be passed safely to free(); see below.
 */
void ort_free(void *ptr)
{
#ifndef HAVE_POSIX_MEMALIGN
	/* If the memory area is aligned, then do not free it to be on the safe side.
	 * Maybe one day we provide an ort_free() call that accepts the real 
	 * address, not the aligned one.
	 * (The pointer must be converted to an integer first; the % operator
	 * cannot be applied to a pointer.)
	 */
	if ((((size_t) ptr) % CACHE_LINE) == 0) /* Possible mem leak but better safe than sorry */
		return;
#endif
	free(ptr);
}


#if defined(EE_TYPE_PROCESS)
/* Shared memory allocation through the ee library (ee=proc only); the
 * result is stored in *p. Reports a failure (NULL result) through
 * ort_error() with exit code 1.
 */
void ort_shmalloc(void **p, int size, int *upd)
{
	ee_shmalloc(p, size, upd);

	if (!(*p))
		ort_error(1, "ort_shmalloc failed\n");
}

/* Releases shared memory through the ee library (ee=proc only) */
void ort_shmfree(void *p)
{
	ee_shmfree(p);
}
#endif


/* The next two functions are for userland calls */

/* Allocates size bytes through ort_alloc() */
void *_ort_memalloc(int size)
{
	return ort_alloc(size);
}


/* Releases memory through ort_free() */
void _ort_memfree(void *ptr)
{
	ort_free(ptr);
}


/* This is only called from parser-generated code; it just expands the
 * FENCE macro.
 */
void _ort_fence(void)
{
	FENCE;
}


/* User-program (omp) locks are all defined as void *, including
 * the parser-generated locks for critical and reduction directives.
 * Upon initialization of such a lock, an actual othr lock is
 * allocated and initialized, through the following function.
 */


/* Allocate & initialize a user lock safely: if *lock is NULL, a new ee lock
 * of the given type is created and stored in *lock; otherwise nothing is
 * done. The whole check-and-create sequence runs under
 * ort->preparation_lock.
 */
void ort_prepare_omp_lock(omp_lock_t *lock, int type)
{
	ee_set_lock((ee_lock_t *) &ort->preparation_lock);

	if (*lock == NULL)
	{
		/* The problem we have here is with non-global user locks.
		 * Those are not initialized necessarily to NULL, thus
		 * we cannot know if a lock is already initialized or
		 * not. We must assume it is not initialized. Otherwise,
		 * if many threads try to initialize it, we may end up
		 * with dangling malloc()s. Of course, the programmer
		 * who lets each thread initialize the same user lock,
		 * is a bad programmer.
		 */
#if defined(EE_TYPE_PROCESS)
		/* NOTE(review): the (ee_lock_t *) -1 argument presumably asks the
		 * ee library to allocate the lock itself -- confirm in the eelib.
		 */
		*lock = ee_init_lock((ee_lock_t *) - 1, type);
		SFENCE;
#else
		void *new = ort_alloc_global(sizeof(ee_lock_t));
		ee_init_lock((ee_lock_t *) new, type);
		SFENCE; /* 100% initialized, before been assigned to "lock" */
		*lock = new;
#endif
	}
	ee_unset_lock((ee_lock_t *) &ort->preparation_lock);
}


/* For internal tests only.
 * Dumps the calling thread's eecb (address, id, level, team size, ee_info,
 * parent) to stderr, indented by its nesting level, followed by the
 * optional printf-style message (fmt may be NULL). If the caller has no
 * eecb, only the message is printed. Output is serialized through a
 * private lock.
 * NOTE(review): the lock is created lazily on the first call, without any
 * synchronization and without checking malloc()'s result.
 */
void ort_debug_thread(char *fmt, ...)
{
	va_list ap;
	static ee_lock_t *l;
	ort_eecb_t *t = __MYCB;

	/* Emits 3 spaces per nesting level of the calling thread */
#define indent() { int i; for (i = 0; i < t->level; i++) fputs("   ", stderr); }
	if (l == NULL)
	{ l = malloc(sizeof(ee_lock_t)); ee_init_lock(l, ORT_LOCK_NORMAL); }

	ee_set_lock(l);

	if (t == NULL)       /* Not an OpenMP thread (yet) */
	{
		va_start(ap, fmt);
		if (fmt)
		{
			fprintf(stderr, "(  *** uninitialized thread ***\n");
			fprintf(stderr, "  MESSAGE:\n");
			fprintf(stderr, "    "); vfprintf(stderr, fmt, ap);
			fprintf(stderr, "\n)\n");
		}
		va_end(ap);
		ee_unset_lock(l);
		return;
	}
	indent(); fprintf(stderr, "( ::%ld::\n", (long int) t);
	indent(); fprintf(stderr, "   |       id = %d\n", t->thread_num);
	indent(); fprintf(stderr, "   |    level = %d\n", t->level);
	indent(); fprintf(stderr, "   | teamsize = %d\n", t->num_siblings);
	indent(); fprintf(stderr, "   |  ee_info = %ld\n", (long int) t->ee_info);
	indent(); fprintf(stderr, "   |   parent = %ld\n", (long int) t->parent);

	va_start(ap, fmt);
	if (fmt)
	{
		fprintf(stderr, "\n");
		indent(); fprintf(stderr, "  MESSAGE:\n");
		indent(); fprintf(stderr, "    "); vfprintf(stderr, fmt, ap);
		indent(); fprintf(stderr, "\n");
	}
	va_end(ap);

	indent(); fprintf(stderr, ")\n");
	ee_unset_lock(l);
#undef indent
}

/* 
 * Marks the OMPi runtime as "required", i.e. 
 * to be fully initialized.
 * Without this mark, _ort_init() (in non-embedded mode) and _ort_finalize()
 * return immediately.
 */
void _ort_required(void)
{
	__ort_required = 1;
}

/* 
 * Marks the OMPi module functionality as "required", i.e. 
 * to be fully initialized.
 * Without this mark, _ort_init() skips module discovery and
 * ort_init_after_modules() skips the declare-target initialization.
 */
void _ort_modules_required(void)
{
	__ort_mods_required = 1;
}

#ifdef OMPI_REMOTE_OFFLOADING

/* Records the remote devices snapshot and its size in the corresponding
 * globals; the pointer is stored as is (the snapshot is not copied).
 */
void _ort_set_ompi_remote_devices(char *ompi_remote_devices_snapshot,
                                unsigned long ompi_remote_devices_snapshot_size)
{
	ompi_remote_devices = ompi_remote_devices_snapshot;
	ompi_remote_devices_size = ompi_remote_devices_snapshot_size;
}

#endif
