/*
  OMPi OpenMP Compiler
  == Copyright since 2001 the OMPi Team
  == Dept. of Computer Science & Engineering, University of Ioannina

  This file is part of OMPi.

  OMPi is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or
  (at your option) any later version.

  OMPi is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with OMPi; if not, write to the Free Software
  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/

/* ee_process/oprc.c
 * OpenMP threading based on processes and SysV shared memory
 */

/* TODO LIST:
 * 1) Fix copyin(); threadprivate variables are not handled properly by 
 *    the compiler (they cannot be copied from the master since they are not
 *    placed in shared memory like other globals; and they cannot be placed
 *    in shared memory since they are private). The best thing would be only
 *    the master's var to be in shared memory but this would require the 
 *    compiler to produce different codes for the master and the workers.
 *    Actually, one idea would be to have one extra copy in shared memory
 *    and the master to upload the var's value, while the others download
 *    it. Hmmmmm...
 * 2) Fix the default barrier (??)
 * 3) Handle copyprivate() in single constructs
 * 4) ordered is broken because the r->forloop.curriter array is 
 *    allocated dynamically after the team is created.
 */

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/shm.h>
#include <sys/wait.h>
#include "../ort.h"
#include "ee.h"

/* Size in bytes (8MB) of the stack handed to the runner thread.
 * Parenthesized so that the macro expands safely inside larger expressions.
 */
#define RUNNER_STACKSIZE (8*1024*1024)
#define MAXSHMLOCKS 100                /* Max number of (preallocated) locks */

/* Pool of locks; both segments are obtained through oprc_shmalloc() in
 * oprc_initialize(). oprc_init_lock() hands out entries from this pool.
 */
oprc_lock_t *slock = NULL;             /* Array of user-defined locks */
int slock_id = -1;                     /* Memory ID of the shared lock array */
int *lock_counter = NULL;              /* Index of the next pool lock to hand out */
int lock_counter_id = -1;              /* Memory ID of the lock counter */

static int eelib_inited = 0;           /* Flag to avoid re-initializations */

/* Global stuff for the team */
int  process_num;                      /* Team size; set in oprc_create(), read by main()/oprc_waitall() */
static void *team_argument;            /* The argument each thread must init */

pid_t *p;                              /* Base of the runner thread's stack (see main()) */
volatile int shutdown = 0;             /* TRUE notifies that we are done */

/* Handshake between the runner thread (oprc_create) and the initial
 * thread (main): the former requests a team, the latter forks it.
 */
pthread_mutex_t waitforparallel_lock, paralleldone_lock;
pthread_cond_t waitforparallel_cv, paralleldone_cv;

int   __myrank;     /* Sequential id within the team */
pid_t __mypid;      /* My process id */


/* 
 * SHMEM MANAGEMENT: need to remember so as to free up at the end.
 */

/* Capacity of the bookkeeping table below */
#define MAXSHMREQ 1024
/* One entry per shared memory segment we know about */
static struct _memreqs {
	int memid;       /* Segment ID (as given by shmget()) */
	void *mem;       /* Attached address; NULL while the segment is unattached */
} memreqs[MAXSHMREQ];  /* Remember allocations so as to free them up */
static int nmreq;      /* Number of table entries currently in use */

void add_alloted(int memid, void *mem)
{
	if (nmreq == MAXSHMREQ)
		fprintf(stderr, "[ee_process]: reached max shmem requests; use ipcrm...\n");
	else
	{
		memreqs[nmreq].memid = memid;
		memreqs[nmreq++].mem = mem;
	}
}

/* Look up a segment ID in the bookkeeping table.
 * Returns the index of the first matching entry, or -1 if there is none.
 */
int get_alloted_from_memid(int memid)
{
	int pos = 0;

	while (pos < nmreq)
	{
		if (memreqs[pos].memid == memid)
			return pos;
		pos++;
	}
	return -1;                  /* Not in the table */
}

/* Look up an attached address in the bookkeeping table.
 * Returns the index of the first matching entry, or -1 if there is none.
 */
int get_alloted_from_ptr(void *mem)
{
	int found = -1, k;

	for (k = 0; k < nmreq && found < 0; k++)
		if (memreqs[k].mem == mem)
			found = k;
	return found;
}

/* Remove entry #idx from the bookkeeping table, keeping the remaining
 * entries contiguous. An index that does not refer to a live entry
 * (negative, or >= the number of entries in use) is ignored, so that the
 * entry counter can never be decremented by mistake.
 */
void del_alloted(int idx)
{
	if (idx < 0 || idx >= nmreq)    /* Nothing to remove */
		return;
	if (idx < nmreq-1)              /* Not the last one; shift the tail down */
		memmove(memreqs+idx, memreqs+idx+1, (nmreq-1-idx)*sizeof(struct _memreqs));
	nmreq--;
}

/* Mark every tracked segment for removal (most recent first) and empty
 * the bookkeeping table. A warning is issued for each segment whose
 * removal fails.
 */
void freeall_alloted()
{
	int k = nmreq;

	while (--k >= 0)
	{
		if (shmctl(memreqs[k].memid, IPC_RMID, 0) >= 0)
			continue;
		ort_warning("shmctl() failed; use ipcs/ipcrm by hand...\n");
	}
	nmreq = 0;
}


void oprc_shmfree(void *p)
{
	int i = get_alloted_from_ptr(p);
	if (shmctl(i, IPC_RMID, 0) == -1)
		ort_error(0, "shmem free failed\n");
	del_alloted(i);
}


/* Allocate a shared memory segment of the given size and attach it.
 *
 * On success *memid gets the segment ID, *p the attached address, and the
 * pair is recorded in the bookkeeping table. On failure an error is
 * reported through ort_error(); should that call return, *p is NULL and
 * nothing has been recorded.
 */
void oprc_shmalloc(void **p , size_t size, int *memid)
{
	void *addr;

	*memid = shmget(IPC_PRIVATE, size, 0600 | IPC_CREAT);
	if (*memid == -1)
	{
		*p = NULL;
		ort_error(0, "shmem allocation failed\n");
		return;
	}

	addr = shmat(*memid, 0, 0);
	if (addr == (void *) -1)        /* shmat() signals failure with (void *) -1 */
	{
		*p = NULL;
		shmctl(*memid, IPC_RMID, 0);  /* Do not leave the segment behind */
		ort_error(0, "shmem attach failed\n");
		return;
	}

	*p = addr;
	add_alloted(*memid, *p);
}


/**
 * @brief Allocate a shared memory segment
 * 
 * This allocates shared memory without attaching it to our memory space;
 * the segment is recorded in the allocation requests table with no
 * address. The actual attachment occurs through oprc_shm_ptr().
 * 
 * @param size the segment size in bytes
 * @return the shared memory segment ID
 */
int oprc_shm_alloc(size_t size)
{
	int segid;

	segid = shmget(IPC_PRIVATE, size, IPC_CREAT | 0600);
	if (segid == -1)
		ort_error(0, "shmem allocation failed\n");

	add_alloted(segid, NULL);       /* Tracked, but not attached yet */
	return segid;
}


/**
 * @brief Get an actual pointer to a shared memory segment.
 *        
 * Assuming a valid segment ID, the first time this is called it attaches 
 * the segment to our memory space and marks it in the allocation requests 
 * table. The next time it gets called, it just retrieves the pointer.
 * This is quite handy for dynamic allocations which occur *after* 
 * processes have been spawned. For allocations which can be done before 
 * forking, oprc_shmalloc() should be used because children will inherit 
 * all the relevant pointers.
 * 
 * @param memid the shared memory segment ID
 * @return a valid pointer to the shared memory segment
 */
void *oprc_shm_ptr(int memid)
{
	int idx = get_alloted_from_memid(memid);
	
	if (idx == -1) /* Not found; add it to the table */
	{
		idx = nmreq++;
		memreqs[idx].memid = memid;
		memreqs[idx].mem   = NULL;
	}
	if (memreqs[idx].mem == NULL) /* Unattached; attach it. */
		if ((memreqs[idx].mem = shmat(memid, 0, 0))  == (void **) - 1)
			ort_error(0, "[oprc_shm_ptr]: shmat failed for memid %d\n", memid);
	return ( memreqs[idx].mem );
}


/* Initialize the library and report its capabilities.
 *
 * The shared lock pool and its counter are allocated only the first time
 * this is called; later calls leave them untouched, so that locks which
 * were already handed out remain valid. The capabilities are filled in on
 * every call.
 *
 * argc, argv: not used here
 * icv:        its nthreads and ncpus fields determine the default team size
 * caps:       filled in with what this library supports
 * Returns 0 always.
 */
int oprc_initialize(int *argc, char ***argv, ort_icvs_t *icv, ort_caps_t *caps)
{
	int nthr;

	if (!eelib_inited)
	{
		/* Allocate space for a maximum allowable number of locks */
		oprc_shmalloc((void **) &lock_counter, (size_t) sizeof(int),
		              (int *) &lock_counter_id);
		oprc_shmalloc((void **) &slock, (size_t) MAXSHMLOCKS * sizeof(oprc_lock_t),
		              (int *) &slock_id);
		*lock_counter = 0;
		eelib_inited = 1;
	}

	nthr = (icv->nthreads > 0) ?  /* Explicitly requested population */
	         icv->nthreads :
	         icv->ncpus - 1;      /* Otherwise derive it from the number of cpus */
	caps->supports_nested            = 0;
	caps->supports_dynamic           = 1;
	caps->supports_nested_nondynamic = 0;
	caps->max_levels_supported       = 1;
	caps->default_numthreads         = nthr;
	caps->max_threads_supported      = 1 << 30;     /* No limit */
	caps->supports_proc_binding      = 0;

	return 0;
}


/* This is called at exit(); consequently, it will be called by the
 * thread that met the exit(). Thus it will be called by the runner 
 * thread.
 *
 * It raises the shutdown flag, signals the condition the initial thread
 * waits on in main(), and then marks all tracked shared memory segments
 * for removal. The exitvalue argument is not used.
 */
void oprc_finalize(int exitvalue)
{
	/* The following is rather useless since the initial thread won't have
	 * a chance of doing anything...
	 */
	shutdown = 1;
	FENCE;
	/* Signal while holding the lock the initial thread waits with */
	pthread_mutex_lock(&waitforparallel_lock);
	pthread_cond_signal(&waitforparallel_cv);
	pthread_mutex_unlock(&waitforparallel_lock);

	/* Not useless: cleanup shared memory */
	freeall_alloted();
}


/* Ask for "numthr" threads in order to execute a parallel region at
 * nesting level "level". Only level 1 is served: the request is granted
 * in full there, while any other level gets 0 threads.
 * The "oversubscribe" argument is ignored.
 */
int oprc_request(int numthr, int level, int oversubscribe)
{
	if (level != 1)
		return 0;
	return numthr;
}


/* Have a team of "numthr" processes created, each of which will work on
 * "arg". The request is published through process_num/team_argument and
 * the initial thread (see main()) is woken up to do the actual forking;
 * the caller blocks until the initial thread signals back.
 * Nothing happens if numthr <= 0 or level > 1. The last argument is
 * ignored.
 */
void oprc_create(int numthr, int level, void *arg, void **ignore)
{
	if (numthr <= 0 || level > 1) return;

	/* Publish the request before waking up the initial thread */
	process_num = numthr;
	team_argument = arg;
	FENCE;

	/* Wake up the initial thread to generate the team */
	pthread_mutex_lock(&waitforparallel_lock);
	/* Grab this one before signalling: main() has to acquire it before it
	 * signals paralleldone_cv, and it is only released by the wait below.
	 */
	pthread_mutex_lock(&paralleldone_lock);
	pthread_cond_signal(&waitforparallel_cv);
	pthread_mutex_unlock(&waitforparallel_lock);

	/* Block till the team has been generated */
	/* NOTE(review): the wait is not re-checked against a predicate, so a
	 * spurious wakeup would end it early -- confirm this is acceptable.
	 */
	pthread_cond_wait(&paralleldone_cv, &paralleldone_lock);
	pthread_mutex_unlock(&paralleldone_lock);
}


/* To be called by the master (runner) thread only.
 * Blocks until wait() has been called once for every process of the
 * team (process_num times in total). The argument is ignored.
 */
void oprc_waitall(void **ignore)
{
	int reaped = 0;

	while (reaped < process_num)
	{
		wait(NULL);
		reaped++;
	}
}


/* Binding to places is not implemented yet (TODO); both arguments are
 * ignored and -1 is always returned.
 */
int oprc_bindme(int **places, int pindex)
{
	(void) places;
	(void) pindex;
	return -1;
}


/* Return the stored process id of the caller (__mypid) as an int */
int oprc_getselfid(void)
{
	return (int) __mypid;
}


/* Return a pointer to the stored process id of the caller (__mypid).
 * If "size" is not NULL, the size of the pointed object is stored there.
 */
void *oprc_getself(unsigned int *size)
{
	if (size != NULL)
		*size = sizeof(pid_t);
	return (void *) &__mypid;
}


/* Return the caller's sequential id within the team (__myrank) */
int oprc_pid()
{
	int rank = __myrank;

	return rank;
}


/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
 *                                                                 *
 *  LOCKS                                                          *
 *                                                                 *
 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */


/* Initialize a lock of the given type and return a pointer to it.
 *
 * If "lock" is (oprc_lock_t *) -1, the next entry of the preallocated
 * lock pool is used; the pool has MAXSHMLOCKS entries and is handed out
 * in a circular manner. Otherwise the lock pointed to by "lock" is
 * initialized in place.
 *
 * NOTE(review): the pool counter is advanced with a plain read-modify-write;
 * confirm that locks are never initialized concurrently by different
 * processes.
 */
void *oprc_init_lock(oprc_lock_t *lock, int type)
{
	if (lock == (oprc_lock_t *) - 1)
	{
		lock = (slock + (*lock_counter));
		/* Wrap around at the actual pool size */
		*lock_counter = ((*lock_counter) + 1) % MAXSHMLOCKS;
		FENCE;
	}

	lock->lock.type = type;
	lock->lock.val = 0;               /* Unlocked */
	if (type == ORT_LOCK_NEST)
		lock->lock.count = 0;           /* No nesting yet */
	return (lock);
}


/* Nothing needs to be done in order to destroy a lock; returns 0 always */
int oprc_destroy_lock(oprc_lock_t *lock)
{
	(void) lock;
	return 0;
}


/* Acquire a lock; returns 0 always.
 *
 * A nestable lock (ORT_LOCK_NEST) records its owner's process id and
 * counts how many times the owner has acquired it. Any other type is a
 * plain spin lock on the "val" field.
 * _cas() is treated as returning non-zero when it managed to change the
 * value from 0 to 1 (presumably a compare-and-swap; it is defined
 * elsewhere). When HAVE_ATOMIC_CAS is not defined the spinning loops are
 * compiled out and no waiting takes place.
 */
int oprc_set_lock(oprc_lock_t *lock)
{
	switch (lock->lock.type)
	{
		case ORT_LOCK_NEST:
		{
			if (_cas(&(lock->lock.val), 0, 1))
			{
				/* The lock was free and now it is ours */
				lock->lock.owner = getpid(); /* Get ownership */
				lock->lock.count++;
			}
			else
			{
				if (lock->lock.owner == getpid())  /* Did i do it? */
					lock->lock.count++;              /* Yes; just nest deeper */
				else                                    /* Locked by someone else */
				{
#if defined(HAVE_ATOMIC_CAS)
					/* Spin till it gets released and we grab it */
					while (! _cas(&(lock->lock.val), 0, 1)) {}
#endif
					lock->lock.owner = getpid();
					lock->lock.count++;
				}
			}
			return (0);
		}

		default: /* ORT_LOCK_NORMAL && ORT_LOCK_SPIN*/
		{
#if defined(HAVE_ATOMIC_CAS)
			/* Spin till we manage to turn val from 0 to 1 */
			while (! _cas(&(lock->lock.val), 0, 1)) {}
#endif
			return 0;
		}
	}
}


/* Release a lock; returns 0 always.
 *
 * For a nestable lock only the owner (as identified by its process id)
 * has any effect: the nesting count is decremented and the lock becomes
 * free when the count reaches 0. Any other lock type is released
 * unconditionally.
 */
int oprc_unset_lock(oprc_lock_t *lock)
{
	switch (lock->lock.type)
	{
		case ORT_LOCK_NEST:
		{
			if (lock->lock.owner == getpid() && lock->lock.count > 0)
				lock->lock.count--;

			/* Last level released; free the lock.
			 * NOTE(review): unlike the default case below, no FENCE follows
			 * this store -- confirm whether that is intentional.
			 */
			if (lock->lock.owner == getpid() && lock->lock.count == 0)
				lock->lock.val = 0;
			return 0;
		}

		default: /* ORT_LOCK_NORMAL && ORT_LOCK_SPIN*/
		{
			lock->lock.val = 0;
			FENCE;
			return 0;
		}
	}
}


/* Try to acquire a lock without blocking.
 *
 * For a nestable lock (ORT_LOCK_NEST) the result is the new nesting count
 * if the lock was free or is already owned by the calling process, and 0
 * if it is held by another process. For any other lock type the result
 * is 1 if the lock was acquired and 0 otherwise.
 * This follows the semantics of omp_test_lock()/omp_test_nest_lock();
 * the caller is never made to wait.
 * _cas() is treated as returning non-zero when it managed to change the
 * value from 0 to 1 (it is defined elsewhere).
 */
int oprc_test_lock(oprc_lock_t *lock)
{
	switch (lock->lock.type)
	{
		case ORT_LOCK_NEST:
		{
			if (_cas(&(lock->lock.val), 0, 1))
				lock->lock.owner = getpid();        /* Was free; get ownership */
			else
				if (lock->lock.owner != getpid())   /* Locked by someone else */
					return (0);
			lock->lock.count++;                   /* One more nesting level */
			return (lock->lock.count);
		}

		default: /* ORT_LOCK_NORMAL && ORT_LOCK_SPIN*/
		{
#if defined(HAVE_ATOMIC_CAS)
			return (_cas(&(lock->lock.val), 0, 1) ? 1 : 0);
#else
			return 1;     /* No waiting takes place without atomic CAS */
#endif
		}
	}
}


/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
 *                                                                 *
 *  TASKS                                                          *
 *                                                                 *
 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */


/* The task node currently in effect for this process; updated by
 * oprc_set_currtask() and, temporarily, by oprc_new_task().
 */
static ort_task_node_t *currtasknode;


void oprc_start_implicit_task(void **eeinfo, void *parent_eeinfo)
{
	ort_task_node_t *myt, *part = (ort_task_node_t *) parent_eeinfo;
	
	if ((myt = ort_calloc(sizeof(ort_task_node_t))) == NULL)
		return;
	myt->icvs = part->icvs;
	oprc_set_currtask(eeinfo, myt);
}


/* Trivial task implementation: the task is executed on the spot.
 * A temporary task node, which inherits the ICVs of the current task (if
 * there is one), is made current for as long as "func" runs; afterwards
 * the previous node is restored and the temporary one is freed.
 * The "final" and "untied" arguments are ignored. If the node allocation
 * yields NULL the function returns without calling "func".
 */
void oprc_new_task(int final, int untied, void *(*func)(void *arg), void *arg)
{
	ort_task_node_t *saved = currtasknode;
	ort_task_node_t *node;

	node = ort_calloc(sizeof(ort_task_node_t));
	if (node == NULL)
		return;

	if (saved != NULL)
		node->icvs = saved->icvs;       /* Inherit ICVs */
	node->parent = saved;

	currtasknode = node;
	func(arg);                        /* Execute */
	currtasknode = saved;             /* Back to the previous task node */

	free(node);
}


/* Nothing to wait for; all arguments are ignored */
void oprc_taskwait(int how, void *info, int thread_num)
{
	(void) how;
	(void) info;
	(void) thread_num;
}


/* Make "tw" the current task node and store it in *eeinfo as well */
void oprc_set_currtask(void **eeinfo, ort_task_node_t *tw)
{
	currtasknode = tw;
	*eeinfo = (void *) tw;
}


/* Return the task node stored in *eeinfo; "thread_num" is ignored */
ort_task_node_t *oprc_get_currtask(void **eeinfo, int thread_num)
{
	ort_task_node_t *node = (ort_task_node_t *) *eeinfo;

	return node;
}


/* Get "size" bytes for a task environment through ort_alloc();
 * "task_func" is ignored.
 */
void *oprc_taskenv_alloc(int size, void *task_func)
{
	void *env = ort_alloc(size);

	return env;
}


/* Give back a task environment; it is simply passed to free() */
void oprc_taskenv_free(void *arg)
{
	void *env = arg;

	free(env);
}


/* dummy functions */
/* Dummy: no task node is created for an immediately executed task.
 * Returns NULL always (a non-void function must return a value); the
 * argument is ignored.
 */
void *oprc_task_immediate_start(int final)
{
	(void) final;
	return NULL;
}

/* Dummy: there is nothing to clean up; the argument is ignored */
void oprc_task_immediate_end(void *new_node)
{
	(void) new_node;
}


/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
 *                                                                 *
 *  THE BASIC MECHANICS: MAIN & RUNNER THREADS                     *
 *                                                                 *
 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */


/* The command-line arguments as received by main(); they are handed over
 * to the user's main by the runner thread.
 */
int  prog_argc;
char **prog_argv;


/* Body of the runner thread: it records its identity (rank 0 and its
 * process id) and then runs the user's main() with the original
 * command-line arguments. The argument is ignored; NULL is returned.
 */
void *oprc__ompi_main(void *ignore)
{
	extern int __ompi_main(int argc, char **argv);  /* generated by ompicc */

	__mypid = getpid();
	__myrank = 0;                                /* The master of any team */

	(void) __ompi_main(prog_argc, prog_argv);    /* The user's main() */
	return NULL;
}


/* Create N processes to execute a parallel region.
 *
 * Each child gets a rank (from N down to 1), records its process id,
 * executes the requested code through ort_ee_dowork() and terminates
 * with _exit(), i.e. without running any atexit handlers.
 *
 * Returns 0 to the parent if all N processes were created. If fork()
 * fails, a warning is issued, no more processes are created and -1 is
 * returned.
 */
static int pfork(int N)
{
	pid_t child;

	for (; N > 0; N--)
	{
		child = fork();
		if (child < 0)        /* Could not create it */
		{
			ort_warning("[ee_process]: fork() failed; the team is incomplete\n");
			return (-1);
		}
		if (child == 0)       /* Child */
		{
			__myrank = N;                      /* Remember my identification */
			__mypid = getpid();
			ort_ee_dowork(N, team_argument);   /* Execute the requested code */
			_exit(0);                          /* That's all; no atexit handlers */
		}
	}

	return (0);             /* Parent */
}


/**
 * This is the initial thread. It serves two purposes:
 * a) create the "runner" thread that will execute the user's main() (which
 *    will also initialize ORT)
 * b) wait for parallel regions and fork processes to execute the regions
 *
 * The runner thread is given a stack that lives in a shared memory
 * segment. Requests arrive through waitforparallel_cv (see oprc_create())
 * and are acknowledged through paralleldone_cv.
 */
int main(int argc, char **argv)
{
	pthread_attr_t tattr;
	pthread_mutexattr_t mattr;
	pthread_t tid;
	int ret = 0, memid;

	/* Keep the arguments for the user's main() */
	prog_argc = argc;
	prog_argv = argv;
	
	/* The two mutexes are created with the process-shared attribute; the
	 * condition variables get default attributes.
	 */
	pthread_mutexattr_init(&mattr);
	pthread_mutexattr_setpshared(&mattr, PTHREAD_PROCESS_SHARED);
	pthread_mutex_init(&waitforparallel_lock, &mattr);
	pthread_mutex_init(&paralleldone_lock, &mattr);
	pthread_cond_init(&waitforparallel_cv, NULL);
	pthread_cond_init(&paralleldone_cv, NULL);

	/* Allocate space for the runner thread stack (in shared memory) */
	oprc_shmalloc((void **) &p, (size_t) RUNNER_STACKSIZE, (int *) &memid);
	ret = pthread_attr_init(&tattr);
	if (ret)
		ort_error(0, "pthread_attr_init error\n");
	ret = pthread_attr_setstack(&tattr, p, RUNNER_STACKSIZE);
	/* NOTE(review): the message below names pthread_attr_setstackaddr but
	 * the call that failed is pthread_attr_setstack.
	 */
	if (ret)
		ort_error(0, "pthread_attr_setstackaddr error\n");

	/* Create the runner thread to execute the original main */
	/* The lock is taken before the runner exists, so that the runner cannot
	 * signal a request before we are waiting for it.
	 */
	pthread_mutex_lock(&waitforparallel_lock);
	ret = pthread_create(&tid, &tattr, oprc__ompi_main, NULL);
	if (ret != 0)
		ort_error(0, "pthread_create error %d\n", ret);

	/* Loop listening for parallel region requests */
	while (!shutdown)
	{
		/* Block till the next parallel region request (from the runner thread) */
		/* NOTE(review): the wait is not re-checked against a predicate, so a
		 * spurious wakeup would fork with whatever process_num holds.
		 */
		pthread_cond_wait(&waitforparallel_cv, &waitforparallel_lock);
		pthread_mutex_unlock(&waitforparallel_lock);

		if (!shutdown)
			pfork(process_num);

		/* paralleldone_lock becomes available only when the runner waits on
		 * paralleldone_cv; waitforparallel_lock is re-acquired before the
		 * runner is released, ready for the next wait.
		 */
		pthread_mutex_lock(&paralleldone_lock);
		pthread_mutex_lock(&waitforparallel_lock);
		pthread_cond_signal(&paralleldone_cv);     /* wakeup the runner thread */
		pthread_mutex_unlock(&paralleldone_lock);
	}

	/* Cleanup (will never reach here since the runner will exit us...) */
	freeall_alloted();

	pthread_attr_destroy(&tattr);
	pthread_mutex_destroy(&waitforparallel_lock);
	pthread_mutex_destroy(&paralleldone_lock);

	pthread_cond_destroy(&waitforparallel_cv);
	pthread_cond_destroy(&paralleldone_cv);

	return 0;
}
