/* ---------------------------------------------------------------------
 * Program: nldcalc.C  
 *  Author: Steven M. Boker
 *    Date: Thu Mar 28 13:03:43 EST 1996
 *
 * Creates a nonlinear dependency matrix by calculating Mutual Information
 * within and between multivariate time series read from a data file.
 * ---------------------------------------------------------------------
 * Revision History --
 *   Steven M. Boker -- Sat Feb 17 21:31:56 EST 1996
 *      Create file, write .dat file I/O, banners and command line options.
 *   Steven M. Boker -- Thu Mar 28 13:04:16 EST 1996
 *      Create nldcalc.c from nldmatrix.c.
 *   Tom Breeden     -- Aug 28 1998
 *      For Borland C v4.5 32 bit compiler
 *       -bug in calcBins() corrected.
 *
 *      originally was "tEpsilon = (tMax - tMin) / (totalBins -1)
 *      changed to "tEpsilon := (theDataMax[theVar] - theDataMin[theVar]) / LFLOAT(totalBins)"
 *	before, it was more or less dividing the data into totalBins-1 bins, not totalbins bins,
 *	except that data values at the maximum might go into the last bin (ie, the one with index
 *	totalbins-1).
 *      now, it does it correctly. A check was added for data values exactly at the maximum, and these
 *	are put into the last bin.
 *
 *	-bug in mutualInfo() corrected.
 *      originally was "for (i = 0; i < theDataLen; ++i) {"
 *	changed to process tDataLen (ie, VAL(INTEGER, theDataLen) - theLag) points rather than
 *	theDataLen points.
 *	before, it was referencing theBinIndex[var2][i+theLag], which was unassigned for values
 *	of the second index greater that theDataLen. (probably was 0).
 *
 *   Steven M. Boker -- Tue Sep  1 10:05:06 EST 1998
 *      Version 1.1
 *      Incorporated bug fixes from Tom Breeden.
 *      Allowed output of negative lags without having to double calculation time.
 * ---------------------------------------------------------------------
 */

#include <stdio.h>
#include <sys/file.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>


#define NLDM_VARIABLESMAX	0x10		/* maximum number of variables that can be analyzed */
#define NLDM_DATAMAX		0x2000		/* maximum number of data points that can be analyzed */
#define INPUT_STRING_MAX	0x1000		/* maximum length of an input line in a data file */
#define BINS_MAX			0x20		/* maximum number of probability bins */
#define LAGS_MAX			0x200		/* maximum number of lags > 0 */

extern	int	tempopen(void);			/* open files */
extern	int	tempclose(void);		/* close files */
extern	double mutualInfo(int var1, int var2);		/* calculate mutual information for two series */
extern	int calcBins(void);		/* calculate probability bins for all series */


static	FILE	*inputFile;		/* file identifier for the input file */
static	FILE	*outputFile;	/* file identifier for the output file */
static	char	infilename[0x400];	/* ASCII data input file name */
static	char	outfilename[0x400];	/* output file name */
static double theDataMatrix[NLDM_VARIABLESMAX][NLDM_DATAMAX];	/* the data matrix read in */
static unsigned int theBinIndex[NLDM_VARIABLESMAX][NLDM_DATAMAX];	/* the data matrix read in */
static double theNLDMatrix[NLDM_VARIABLESMAX][NLDM_VARIABLESMAX];	/* the NLD matrix */
static double theNLDOutput[LAGS_MAX*2][NLDM_VARIABLESMAX*NLDM_VARIABLESMAX];	/* the NLD matrix */
static double theDataMax[NLDM_VARIABLESMAX];	/* maximum value for each variable */
static double theDataMin[NLDM_VARIABLESMAX];	/* maximum value for each variable */
static double theDataCuts[NLDM_VARIABLESMAX][BINS_MAX];	/* the cuts for the probability bins */
static double theBins[NLDM_VARIABLESMAX][BINS_MAX];	/* the cuts for the probability bins */
static double theJointBins[BINS_MAX][BINS_MAX];	/* the cuts for the joint probability bins */
static long totalVars = 1;		/* the total number of variables read from the file */
static long totalD = 1;		/* the total number of dimensions to use for embedding */
static long theLag = 0;		/* the lag to use for embedding and mutual information calculation */
static long theLagMax = 100;		/* maximum lag to use for embedding and mutual information calculation */
static int totalBins = 10;	/* number of bins for probability estimates */
static double **tDataPtr;
static double **tNLDMPtr;
static int debugflag = 0;	/* debugflag > 0 turns on the debugging messages */
static long theDataLen = 0;	/* actual length of the data */

int main(argc, argv) /* Calculate a nonlinear dependency matrix */
int     argc;
char   *argv[];
{
	int i, j;
	char *a;
	long lineCount, maxLines;
	int fieldCount, maxFields;
	int whitespaceflag;
	int negativeLagFlag = 1;
	unsigned c;
	char inputString[INPUT_STRING_MAX];

	infilename[0] = outfilename[0] = '\0';

	printf("\nnldcalc 1.2 -- (c) Copyright 1996-2001 Steven M. Boker --\n");
	printf("\n");
	argv++;
	while (argc > 2) {
		if (strcmp(*argv, "-o") == 0) {	/* Output file name */
			--argc;
			++argv;
			strcpy(outfilename, *argv);
		}
		else if (strcmp(*argv, "-tau") == 0) {		/* Time delay to use */
			--argc;
			++argv;
			theLagMax = theLag = atoi(*argv);
		}
		else if (strcmp(*argv, "-taumax") == 0) {		/* Maximum time delay to use */
			--argc;
			++argv;
			theLagMax = atoi(*argv);
		}
		else if (strcmp(*argv, "-d") == 0) {		/* Maximum number of embedding dimensions */
			--argc;
			++argv;
			totalD = atoi(*argv);
		}
		else if (strcmp(*argv, "-bins") == 0) {		/* Number of bins for probability estimates */
			--argc;
			++argv;
			totalBins = atoi(*argv);
		}
		else if (strcmp(*argv, "-debug") == 0) {	/* debugging-- print debugging information */
			--argc;
			++argv;
			debugflag = atoi(*argv);
		}
		else
			break;
		--argc;
		++argv;
	}
	if (theLagMax >= LAGS_MAX)
		theLagMax = LAGS_MAX - 1;
	if (theLag < 0)
		theLag = 0;
	if (theLag > 0)
		negativeLagFlag = 0;
	if (argc < 2) {
		printf("    1. Reads a multi-column ASCII data file of multivariate time series\n");
		printf("    2. Calculates average mutual information within and between each series.\n");
		printf("    3. Writes nonlinear dependencies to stdout or output file.\n");
		printf("\n");
		printf("Usage:  nldcalc [-o outfile] [-taumax | -tau n] [-d n] [-bins n] [-debug n] infile\n\n");
		printf("   infile       = Name of ascii data file\n");
		printf("   [-o outfile] = File to output NLD matrix         (default=none)\n");
		printf("   [-taumax n]  = Maximum time delay (if range)     (default=100, max=512)\n");
		printf("   [-tau n]     = Time delay (if single delay only) (default=0)\n");
		printf("                  If tau <= 0 then negative and positive lags are output\n");
		printf("   [-d n]       = Number of embedding dimensions    (default=1)\n");
		printf("   [-bins n]    = Number of bins for probabilities  (default=10)\n");
		printf("   [-debug n]   = Print debugging information       (default=0)\n");
		printf("                  0=off, 1=some, ... 5=nauseating detail\n\n");
		printf("   The output has a column for each element in the nonlinear dependency\n");
		printf("   matrix.  Each row in the output is one nonlinear dependency matrix\n");
		printf("   read out row-wise\n");
		printf("\n");
		exit(-1);
	}
	strcpy(infilename, *argv);	/* input file name from command line */
	if (tempopen() < 0) {
		printf("\n");
		tempclose();
		exit(-1);
	}
	i = j = 0;
	tDataPtr = theDataMatrix;
	tNLDMPtr = theNLDMatrix;
	whitespaceflag = 0;
	fieldCount = lineCount = maxFields = 0;
	while ((c = fgetc(inputFile)) != EOF && lineCount < NLDM_DATAMAX && fieldCount < NLDM_VARIABLESMAX) {
		if (c > ' ' && c <= '~' && whitespaceflag == 0) {
			++fieldCount;
			++whitespaceflag;
		}
		else if (c == '\n') {	
			if (maxFields < fieldCount)
				maxFields = fieldCount;
			fieldCount = 0;
			++lineCount;
		}
		else if (c <= ' ') {
			whitespaceflag = 0;
		}
	}
	if (fieldCount > 0) {
		if (maxFields < fieldCount)
			maxFields = fieldCount;
		fieldCount = 0;
		++lineCount;
	}
	maxLines = lineCount;
	totalVars = maxFields;
	theDataLen = maxLines;
	
	if (debugflag > 0) {
		printf("\nmaxLines=%ld  maxFields=%d\n\n", maxLines, maxFields);
	}

	rewind(inputFile);
	i = j = 0;
	while (fgets(inputString, INPUT_STRING_MAX, inputFile) != NULL) {
		a = inputString;
		if (debugflag > 4) {
			printf("%s", inputString);
		}
		for (j = 0; j < maxFields && *a != '\0'; ++j) {
			sscanf(a, "%lf", &(theDataMatrix[j][i]));
			if (debugflag > 4) {
				printf("%s", a);
				printf("i=%d  j=%d  data=%10f\n", i, j, theDataMatrix[j][i]);
			}
			while (*a <= ' ' && *a != '\0') {
				++a;
			}
			while (*a > ' ') {
				++a;
			}
		}
		++i;
	}
	
	if (debugflag > 3) {
		printf("\n");
		for (i = 0; i < maxLines; ++i) {
			for (j = 0; j < maxFields; ++j) {
				printf("%10lf ", theDataMatrix[j][i]);
			}
			printf("\n");
		}
	}
	
	if (debugflag > 0) {
		printf("\ntheDataLen=%d  totalBins=%d\n", theDataLen, totalBins);
	}

	calcBins();
	if (negativeLagFlag) {
		for (theLag = 0; theLag <= theLagMax; ++theLag) {
			for (i = 0; i < totalVars; ++i) {
				for (j = 0; j < totalVars; ++j) {
					theNLDOutput[theLag][(i*totalVars)+j] = mutualInfo(i, j);
				}
			}
		}
		for (theLag = theLagMax; theLag > 0; --theLag) {
			if (outputFile) {
				for (j = 0; j < totalVars; ++j) {
					for (i = 0; i < totalVars; ++i) {
						fprintf(outputFile, "%8f ", theNLDOutput[theLag][(i*totalVars)+j]);
					}
				}
				fprintf(outputFile, "\n");
			}
			else {
				for (j = 0; j < totalVars; ++j) {
					for (i = 0; i < totalVars; ++i) {
						printf("%8f ", theNLDOutput[theLag][(i*totalVars)+j]);
					}
				}
				printf("\n");
			}
		}
		for (theLag = 0; theLag <= theLagMax; ++theLag) {
			if (outputFile) {
				for (i = 0; i < totalVars; ++i) {
					for (j = 0; j < totalVars; ++j) {
						fprintf(outputFile, "%8f ", theNLDOutput[theLag][(i*totalVars)+j]);
					}
				}
				fprintf(outputFile, "\n");
			}
			else {
				for (i = 0; i < totalVars; ++i) {
					for (j = 0; j < totalVars; ++j) {
						printf("%8f ", theNLDOutput[theLag][(i*totalVars)+j]);
					}
				}
				printf("\n");
			}
		}
	}
	else {
		for ( ; theLag <= theLagMax; ++theLag) {
			for (i = 0; i < totalVars; ++i) {
				for (j = 0; j < totalVars; ++j) {
					theNLDMatrix[i][j] = mutualInfo(i, j);
				}
			}
			if (outputFile) {
				for (i = 0; i < totalVars; ++i) {
					for (j = 0; j < totalVars; ++j) {
						fprintf(outputFile, "%8f ", theNLDMatrix[i][j]);
					}
				}
				fprintf(outputFile, "\n");
			}
			else {
				for (i = 0; i < totalVars; ++i) {
					for (j = 0; j < totalVars; ++j) {
						printf("%8f ", theNLDMatrix[i][j]);
					}
				}
				printf("\n");
			}
		}
	}
	tempclose();
	printf("\n");
	return(0);
}

/*-------------------------------------------------------------------------*/

int tempopen(void)

{
	int     i;
	FILE   *fopen();

	if (infilename[0] != '\0') {
		if ((inputFile = fopen(infilename, "r")) == NULL) {
			printf("ERROR: Input file named [%s] could not be opened. \n", infilename);
			return (-1);
		}
	}
	printf("     Input file = %s \n", infilename);

	if (outfilename[0] != '\0') {
		if ((outputFile = fopen(outfilename, "w")) == NULL) {
			printf("ERROR: Output file named [%s] could not be created.\n", outfilename);
			return (-1);
		}
	}
	printf("    Output file = %s \n", outfilename);

	return (0);
}

/*-------------------------------------------------------------------------*/

int tempclose(void)
{				/* close files */
	int     err = -1;

	if (inputFile)
		err = (fclose(inputFile) == EOF) ? -1 : 0;
	if (outputFile)
		err = (fclose(outputFile) == EOF || err) ? -1 : 0;
	printf("\n");
	return (err);
}

/*-------------------------------------------------------------------------*/

int calcBins(void)
{				/* calculate the univariate probability bins for all time series */
	int err = -1;
	int theVar;
	double *theVarPtr, *tPtr;
	double tMax, tMin, tEpsilon, tBin;
	int i, j;
	
	for (theVar = 0; theVar < totalVars; ++theVar) {
		theVarPtr = &(theDataMatrix[theVar][0]);
		tPtr = theVarPtr;
		tMax = tMin = *tPtr;
		for (i = 0; i < theDataLen; ++i, ++tPtr) {
			if (tMax < *tPtr)
				tMax = *tPtr;
			if (tMin > *tPtr)
				tMin = *tPtr;
		}
		theDataMax[theVar] = tMax;	
		theDataMin[theVar] = tMin;
		tEpsilon = (tMax - tMin) / (float)totalBins;
		tBin = tMin;
		for (i = 0; i <= totalBins; ++i) {
			theDataCuts[theVar][i] = tBin;
			theBins[theVar][i] = 0.0;
			tBin += tEpsilon;
		}
		tPtr = theVarPtr;
		for (i = 0; i < theDataLen; ++i, ++tPtr) {
			j = (((int)((*tPtr - tMin) / tEpsilon)) < (totalBins-1)) ?
			     ((int)((*tPtr - tMin) / tEpsilon)) : (totalBins-1);
			theBinIndex[theVar][i] = j;
			theBins[theVar][j] += 1.0;
		}
		for (i = 0; i < totalBins; ++i) {
			theBins[theVar][i] /= theDataLen;
		}
	}
	if (debugflag > 0) {
		printf("\n\ntheDataMax ---------------------------------\n");
		for (j = 0; j < totalVars; ++j) {
			printf("%12f ", theDataMax[j]);
		}
		printf("\n");
		printf("\n\ntheDataMin ---------------------------------\n");
		for (j = 0; j < totalVars; ++j) {
			printf("%12f ", theDataMin[j]);
		}
		printf("\n");
	}

	if (debugflag > 0) {
		printf("\n\ntheBins ---------------------------------\n");
		for (i = 0; i < totalBins; ++i) {
			for (j = 0; j < totalVars; ++j) {
				printf("%12f ", theBins[j][i]);
			}
			printf("\n");
		}
	}

	if (debugflag > 0) {
		printf("\n\ntheDataCuts ---------------------------------\n");
		for (i = 0; i < totalBins; ++i) {
			for (j = 0; j < totalVars; ++j) {
				printf("%12f ", theDataCuts[j][i]);
			}
			printf("\n");
		}
	}
	err = 0;
	return (err);
}

/*-------------------------------------------------------------------------*/

int calcJointBins(int var1, int var2)
{				/* calculate the bivariate probability bins for two time series */
	int     err = -1;
	double tBin;
	int tDataLen;
	int i, j, k;
	double tSum;

	tDataLen = theDataLen - theLag;
	for (i = 0; i < totalBins; ++i) {
		for (j = 0; j < totalBins; ++j) {
			tBin = 0.0;
			for (k = 0; k < tDataLen; ++k) {
				if (theBinIndex[var1][k] == i && theBinIndex[var2][k+theLag] == j) {
					tBin += 1.0;
				}
			}
			theJointBins[i][j] = tBin;
		}
	}
	for (i = 0; i < totalBins; ++i) {
		for (j = 0; j < totalBins; ++j) {
			theJointBins[i][j] /= tDataLen;
		}
	}
	if (debugflag > 0) {
		tSum = 0.0;
		printf("\n\ntheJointBins ---------------------------------\n");
		for (i = 0; i < totalBins; ++i) {
			for (j = 0; j < totalBins; ++j) {
				printf("%12f ", theJointBins[i][j]);
				tSum += theJointBins[i][j];
			}
			printf("\n");
		}
		printf("tSum=%12f\n", tSum);
	}
	err = 0;
	return (err);
}

/*-------------------------------------------------------------------------*/

double mutualInfo(int var1, int var2)
{				/* calculate the bivariate probability bins for two time series */
	int     err = -1;
	int tDataLen;
	int i, j, k;
	double tJoint, tInfo, tProduct;

	tDataLen = theDataLen - theLag;
	calcJointBins(var1, var2);
	tInfo = 0.0;
	for (i = 0; i < tDataLen; ++i) {
		j = theBinIndex[var1][i];
		k = theBinIndex[var2][i+theLag];
		tJoint = theJointBins[j][k];
		tProduct = theBins[var1][j] * theBins[var2][k];
		if (tJoint > 0.0 && tProduct > 0.00)
			tInfo += tJoint * log(tJoint / tProduct) * 1.442695;
	}
	tInfo /= tDataLen;

	return (tInfo);
}

