/* --------------------------------------------------------------------- * Program: nldcalc.C * Author: Steven M. Boker * Date: Thu Mar 28 13:03:43 EST 1996 * * Creates a nonlinear dependency matrix by calculating Mutual Information * within and between multivariate time series read from a data file. * --------------------------------------------------------------------- * Revision History -- * Steven M. Boker -- Sat Feb 17 21:31:56 EST 1996 * Create file, write .dat file I/O, banners and command line options. * Steven M. Boker -- Thu Mar 28 13:04:16 EST 1996 * Create nldcalc.c from nldmatrix.c. * Tom Breeden -- Aug 28 1998 * For Borland C v4.5 32 bit compiler * -bug in calcBins() corrected. * * originally was "tEpsilon = (tMax - tMin) / (totalBins -1) * changed to "tEpsilon := (theDataMax[theVar] - theDataMin[theVar]) / LFLOAT(totalBins)" * before, it was more or less dividing the data into totalBins-1 bins, not totalbins bins, * except that data values at the maximum might go into the last bin (ie, the one with index * totalbins-1). * now, it does it correctly. A check was added for data values exactly at the maximum, and these * are put into the last bin. * * -bug in mutualInfo() corrected. * originally was "for (i = 0; i < theDataLen; ++i) {" * changed to process tDataLen (ie, VAL(INTEGER, theDataLen) - theLag) points rather than * theDataLen points. * before, it was referencing theBinIndex[var2][i+theLag], which was unassigned for values * of the second index greater that theDataLen. (probably was 0). * * Steven M. Boker -- Tue Sep 1 10:05:06 EST 1998 * Version 1.1 * Incorporated bug fixes from Tom Breeden. * Allowed output of negative lags without having to double calculation time. * --------------------------------------------------------------------- */ #include #include #include #include #include #define NLDM_VARIABLESMAX 0x10 /* maximum number of variables that can be analyzed */ #define NLDM_DATAMAX 0x2000 /* maximum number of data points that can be analyzed */ #define INPUT_STRING_MAX 0x1000 /* maximum length of an input line in a data file */ #define BINS_MAX 0x20 /* maximum number of probability bins */ #define LAGS_MAX 0x200 /* maximum number of lags > 0 */ extern int tempopen(void); /* open files */ extern int tempclose(void); /* close files */ extern double mutualInfo(int var1, int var2); /* calculate mutual information for two series */ extern int calcBins(void); /* calculate probability bins for all series */ static FILE *inputFile; /* file identifier for the input file */ static FILE *outputFile; /* file identifier for the output file */ static char infilename[0x400]; /* ASCII data input file name */ static char outfilename[0x400]; /* output file name */ static double theDataMatrix[NLDM_VARIABLESMAX][NLDM_DATAMAX]; /* the data matrix read in */ static unsigned int theBinIndex[NLDM_VARIABLESMAX][NLDM_DATAMAX]; /* the data matrix read in */ static double theNLDMatrix[NLDM_VARIABLESMAX][NLDM_VARIABLESMAX]; /* the NLD matrix */ static double theNLDOutput[LAGS_MAX*2][NLDM_VARIABLESMAX*NLDM_VARIABLESMAX]; /* the NLD matrix */ static double theDataMax[NLDM_VARIABLESMAX]; /* maximum value for each variable */ static double theDataMin[NLDM_VARIABLESMAX]; /* maximum value for each variable */ static double theDataCuts[NLDM_VARIABLESMAX][BINS_MAX]; /* the cuts for the probability bins */ static double theBins[NLDM_VARIABLESMAX][BINS_MAX]; /* the cuts for the probability bins */ static double theJointBins[BINS_MAX][BINS_MAX]; /* the cuts for the joint probability bins */ static long totalVars = 1; /* the total number of variables read from the file */ static long totalD = 1; /* the total number of dimensions to use for embedding */ static long theLag = 0; /* the lag to use for embedding and mutual information calculation */ static long theLagMax = 100; /* maximum lag to use for embedding and mutual information calculation */ static int totalBins = 10; /* number of bins for probability estimates */ static double **tDataPtr; static double **tNLDMPtr; static int debugflag = 0; /* debugflag > 0 turns on the debugging messages */ static long theDataLen = 0; /* actual length of the data */ int main(argc, argv) /* Calculate a nonlinear dependency matrix */ int argc; char *argv[]; { int i, j; char *a; long lineCount, maxLines; int fieldCount, maxFields; int whitespaceflag; int negativeLagFlag = 1; unsigned c; char inputString[INPUT_STRING_MAX]; infilename[0] = outfilename[0] = '\0'; printf("\nnldcalc 1.2 -- (c) Copyright 1996-2001 Steven M. Boker --\n"); printf("\n"); argv++; while (argc > 2) { if (strcmp(*argv, "-o") == 0) { /* Output file name */ --argc; ++argv; strcpy(outfilename, *argv); } else if (strcmp(*argv, "-tau") == 0) { /* Time delay to use */ --argc; ++argv; theLagMax = theLag = atoi(*argv); } else if (strcmp(*argv, "-taumax") == 0) { /* Maximum time delay to use */ --argc; ++argv; theLagMax = atoi(*argv); } else if (strcmp(*argv, "-d") == 0) { /* Maximum number of embedding dimensions */ --argc; ++argv; totalD = atoi(*argv); } else if (strcmp(*argv, "-bins") == 0) { /* Number of bins for probability estimates */ --argc; ++argv; totalBins = atoi(*argv); } else if (strcmp(*argv, "-debug") == 0) { /* debugging-- print debugging information */ --argc; ++argv; debugflag = atoi(*argv); } else break; --argc; ++argv; } if (theLagMax >= LAGS_MAX) theLagMax = LAGS_MAX - 1; if (theLag < 0) theLag = 0; if (theLag > 0) negativeLagFlag = 0; if (argc < 2) { printf(" 1. Reads a multi-column ASCII data file of multivariate time series\n"); printf(" 2. Calculates average mutual information within and between each series.\n"); printf(" 3. Writes nonlinear dependencies to stdout or output file.\n"); printf("\n"); printf("Usage: nldcalc [-o outfile] [-taumax | -tau n] [-d n] [-bins n] [-debug n] infile\n\n"); printf(" infile = Name of ascii data file\n"); printf(" [-o outfile] = File to output NLD matrix (default=none)\n"); printf(" [-taumax n] = Maximum time delay (if range) (default=100, max=512)\n"); printf(" [-tau n] = Time delay (if single delay only) (default=0)\n"); printf(" If tau <= 0 then negative and positive lags are output\n"); printf(" [-d n] = Number of embedding dimensions (default=1)\n"); printf(" [-bins n] = Number of bins for probabilities (default=10)\n"); printf(" [-debug n] = Print debugging information (default=0)\n"); printf(" 0=off, 1=some, ... 5=nauseating detail\n\n"); printf(" The output has a column for each element in the nonlinear dependency\n"); printf(" matrix. Each row in the output is one nonlinear dependency matrix\n"); printf(" read out row-wise\n"); printf("\n"); exit(-1); } strcpy(infilename, *argv); /* input file name from command line */ if (tempopen() < 0) { printf("\n"); tempclose(); exit(-1); } i = j = 0; tDataPtr = theDataMatrix; tNLDMPtr = theNLDMatrix; whitespaceflag = 0; fieldCount = lineCount = maxFields = 0; while ((c = fgetc(inputFile)) != EOF && lineCount < NLDM_DATAMAX && fieldCount < NLDM_VARIABLESMAX) { if (c > ' ' && c <= '~' && whitespaceflag == 0) { ++fieldCount; ++whitespaceflag; } else if (c == '\n') { if (maxFields < fieldCount) maxFields = fieldCount; fieldCount = 0; ++lineCount; } else if (c <= ' ') { whitespaceflag = 0; } } if (fieldCount > 0) { if (maxFields < fieldCount) maxFields = fieldCount; fieldCount = 0; ++lineCount; } maxLines = lineCount; totalVars = maxFields; theDataLen = maxLines; if (debugflag > 0) { printf("\nmaxLines=%ld maxFields=%d\n\n", maxLines, maxFields); } rewind(inputFile); i = j = 0; while (fgets(inputString, INPUT_STRING_MAX, inputFile) != NULL) { a = inputString; if (debugflag > 4) { printf("%s", inputString); } for (j = 0; j < maxFields && *a != '\0'; ++j) { sscanf(a, "%lf", &(theDataMatrix[j][i])); if (debugflag > 4) { printf("%s", a); printf("i=%d j=%d data=%10f\n", i, j, theDataMatrix[j][i]); } while (*a <= ' ' && *a != '\0') { ++a; } while (*a > ' ') { ++a; } } ++i; } if (debugflag > 3) { printf("\n"); for (i = 0; i < maxLines; ++i) { for (j = 0; j < maxFields; ++j) { printf("%10lf ", theDataMatrix[j][i]); } printf("\n"); } } if (debugflag > 0) { printf("\ntheDataLen=%d totalBins=%d\n", theDataLen, totalBins); } calcBins(); if (negativeLagFlag) { for (theLag = 0; theLag <= theLagMax; ++theLag) { for (i = 0; i < totalVars; ++i) { for (j = 0; j < totalVars; ++j) { theNLDOutput[theLag][(i*totalVars)+j] = mutualInfo(i, j); } } } for (theLag = theLagMax; theLag > 0; --theLag) { if (outputFile) { for (j = 0; j < totalVars; ++j) { for (i = 0; i < totalVars; ++i) { fprintf(outputFile, "%8f ", theNLDOutput[theLag][(i*totalVars)+j]); } } fprintf(outputFile, "\n"); } else { for (j = 0; j < totalVars; ++j) { for (i = 0; i < totalVars; ++i) { printf("%8f ", theNLDOutput[theLag][(i*totalVars)+j]); } } printf("\n"); } } for (theLag = 0; theLag <= theLagMax; ++theLag) { if (outputFile) { for (i = 0; i < totalVars; ++i) { for (j = 0; j < totalVars; ++j) { fprintf(outputFile, "%8f ", theNLDOutput[theLag][(i*totalVars)+j]); } } fprintf(outputFile, "\n"); } else { for (i = 0; i < totalVars; ++i) { for (j = 0; j < totalVars; ++j) { printf("%8f ", theNLDOutput[theLag][(i*totalVars)+j]); } } printf("\n"); } } } else { for ( ; theLag <= theLagMax; ++theLag) { for (i = 0; i < totalVars; ++i) { for (j = 0; j < totalVars; ++j) { theNLDMatrix[i][j] = mutualInfo(i, j); } } if (outputFile) { for (i = 0; i < totalVars; ++i) { for (j = 0; j < totalVars; ++j) { fprintf(outputFile, "%8f ", theNLDMatrix[i][j]); } } fprintf(outputFile, "\n"); } else { for (i = 0; i < totalVars; ++i) { for (j = 0; j < totalVars; ++j) { printf("%8f ", theNLDMatrix[i][j]); } } printf("\n"); } } } tempclose(); printf("\n"); return(0); } /*-------------------------------------------------------------------------*/ int tempopen(void) { int i; FILE *fopen(); if (infilename[0] != '\0') { if ((inputFile = fopen(infilename, "r")) == NULL) { printf("ERROR: Input file named [%s] could not be opened. \n", infilename); return (-1); } } printf(" Input file = %s \n", infilename); if (outfilename[0] != '\0') { if ((outputFile = fopen(outfilename, "w")) == NULL) { printf("ERROR: Output file named [%s] could not be created.\n", outfilename); return (-1); } } printf(" Output file = %s \n", outfilename); return (0); } /*-------------------------------------------------------------------------*/ int tempclose(void) { /* close files */ int err = -1; if (inputFile) err = (fclose(inputFile) == EOF) ? -1 : 0; if (outputFile) err = (fclose(outputFile) == EOF || err) ? -1 : 0; printf("\n"); return (err); } /*-------------------------------------------------------------------------*/ int calcBins(void) { /* calculate the univariate probability bins for all time series */ int err = -1; int theVar; double *theVarPtr, *tPtr; double tMax, tMin, tEpsilon, tBin; int i, j; for (theVar = 0; theVar < totalVars; ++theVar) { theVarPtr = &(theDataMatrix[theVar][0]); tPtr = theVarPtr; tMax = tMin = *tPtr; for (i = 0; i < theDataLen; ++i, ++tPtr) { if (tMax < *tPtr) tMax = *tPtr; if (tMin > *tPtr) tMin = *tPtr; } theDataMax[theVar] = tMax; theDataMin[theVar] = tMin; tEpsilon = (tMax - tMin) / (float)totalBins; tBin = tMin; for (i = 0; i <= totalBins; ++i) { theDataCuts[theVar][i] = tBin; theBins[theVar][i] = 0.0; tBin += tEpsilon; } tPtr = theVarPtr; for (i = 0; i < theDataLen; ++i, ++tPtr) { j = (((int)((*tPtr - tMin) / tEpsilon)) < (totalBins-1)) ? ((int)((*tPtr - tMin) / tEpsilon)) : (totalBins-1); theBinIndex[theVar][i] = j; theBins[theVar][j] += 1.0; } for (i = 0; i < totalBins; ++i) { theBins[theVar][i] /= theDataLen; } } if (debugflag > 0) { printf("\n\ntheDataMax ---------------------------------\n"); for (j = 0; j < totalVars; ++j) { printf("%12f ", theDataMax[j]); } printf("\n"); printf("\n\ntheDataMin ---------------------------------\n"); for (j = 0; j < totalVars; ++j) { printf("%12f ", theDataMin[j]); } printf("\n"); } if (debugflag > 0) { printf("\n\ntheBins ---------------------------------\n"); for (i = 0; i < totalBins; ++i) { for (j = 0; j < totalVars; ++j) { printf("%12f ", theBins[j][i]); } printf("\n"); } } if (debugflag > 0) { printf("\n\ntheDataCuts ---------------------------------\n"); for (i = 0; i < totalBins; ++i) { for (j = 0; j < totalVars; ++j) { printf("%12f ", theDataCuts[j][i]); } printf("\n"); } } err = 0; return (err); } /*-------------------------------------------------------------------------*/ int calcJointBins(int var1, int var2) { /* calculate the bivariate probability bins for two time series */ int err = -1; double tBin; int tDataLen; int i, j, k; double tSum; tDataLen = theDataLen - theLag; for (i = 0; i < totalBins; ++i) { for (j = 0; j < totalBins; ++j) { tBin = 0.0; for (k = 0; k < tDataLen; ++k) { if (theBinIndex[var1][k] == i && theBinIndex[var2][k+theLag] == j) { tBin += 1.0; } } theJointBins[i][j] = tBin; } } for (i = 0; i < totalBins; ++i) { for (j = 0; j < totalBins; ++j) { theJointBins[i][j] /= tDataLen; } } if (debugflag > 0) { tSum = 0.0; printf("\n\ntheJointBins ---------------------------------\n"); for (i = 0; i < totalBins; ++i) { for (j = 0; j < totalBins; ++j) { printf("%12f ", theJointBins[i][j]); tSum += theJointBins[i][j]; } printf("\n"); } printf("tSum=%12f\n", tSum); } err = 0; return (err); } /*-------------------------------------------------------------------------*/ double mutualInfo(int var1, int var2) { /* calculate the bivariate probability bins for two time series */ int err = -1; int tDataLen; int i, j, k; double tJoint, tInfo, tProduct; tDataLen = theDataLen - theLag; calcJointBins(var1, var2); tInfo = 0.0; for (i = 0; i < tDataLen; ++i) { j = theBinIndex[var1][i]; k = theBinIndex[var2][i+theLag]; tJoint = theJointBins[j][k]; tProduct = theBins[var1][j] * theBins[var2][k]; if (tJoint > 0.0 && tProduct > 0.00) tInfo += tJoint * log(tJoint / tProduct) * 1.442695; } tInfo /= tDataLen; return (tInfo); }