/*
 * nbc.c: Faster Naive Bayesian Classifier.
 *
 * Holds the file-scope classifier state, builds per-(feature, class) sorted
 * value/count tables from the training data, then classifies the test data
 * and writes accuracy and timing to "<domain>.res".
 *
 * NOTE(review): this copy of the file appears to have lost text in several
 * places (header names after #include, and parts of initialize(), p() and
 * nbc()); presumably everything between a '<' and a following '>' was
 * stripped. The damaged spans are kept verbatim and flagged below; restore
 * them from a pristine copy before building.
 */
/* NOTE(review): the three system header names are missing here. */
#include #include #include #include "nbc.h"

/* Problem sizes. */
int No_Features, No_Classes, No_Instances;
int No_Trainers, No_Testers, Train_correct, Test_correct;
/* linear[f] is tested in p(): zero selects the branch marked "nominal". */
int linear[MAX_FEATURES];
/* Logging level; 0 disables logging to logf. */
int verbosity=0;
char domainName[MAX_NAME], resName[MAX_NAME];
int className[MAX_CLASSES], classIndex[MAX_CLASSES];
INSTANCE trainData[MAX_DATA], testData[MAX_DATA];
int classCount[MAX_CLASSES];
/* g[c]: per-class score computed for the current test instance in nbc(). */
double g[MAX_CLASSES];
char trainDataFile[MAX_NAME], testDataFile[MAX_NAME];
/* distribution[f][c][]: (value, count) pairs kept in increasing value order. */
VALCOUNT distribution[MAX_FEATURES][MAX_CLASSES][MAX_DATA];
/* noValues[f][c]: number of entries in use in distribution[f][c]. */
int noValues[MAX_FEATURES][MAX_CLASSES];
FILE *logf;
/* Timestamps; allocated in main(). */
struct timeval *start_train, *end_train, *end_test;
struct timezone *tz;
/* Elapsed times as computed by time_diff(). */
int train_time, test_time;

/* ------------------------------------------------------------------------ */
/*
 * Returns the difference finish - start in milliseconds, computed as
 * (seconds difference * 1000) + (microseconds difference / 1000).
 *
 * If the seconds difference is negative, 60 is added to it; the original
 * author's note says the difference is assumed never to exceed a minute.
 * NOTE(review): that correction only makes sense if tv_sec wraps per minute;
 * confirm it is still wanted.
 */
int time_diff(struct timeval *finish, struct timeval *start)
{
  int time_in_sec, time_in_usec;

  time_in_sec = finish->tv_sec - start->tv_sec;
  if (time_in_sec < 0) time_in_sec += 60;
  /* May be negative; it is then subtracted from the seconds term below. */
  time_in_usec = finish->tv_usec - start->tv_usec;
  return((time_in_sec * 1000) + (time_in_usec / 1000));
} /* time_diff */

/* ------------------------------------------------------------------------ */
/*
 * Returns the arithmetic mean of the first `size` elements of `array`.
 * NOTE(review): there is no guard for size == 0 (division by zero).
 */
float average(float *array, int size)
{
  float sum=0.0;
  int i;

  for (i=0; i < size; i++)
    sum += array[i];
  return sum / size;
} /* average */

/* ------------------------------------------------------------------------ */
/*
 * Reads the command line: argv[1] is copied into domainName; if argv[2] is
 * "-v", verbosity is set to 1 when no further argument is given, otherwise
 * parsed as a decimal integer from argv[3], and open_log_file() is called.
 *
 * NOTE(review): no return type is declared and no value is returned; the
 * strcpy is unbounded, so argv[1] is assumed to fit in domainName.
 */
parse_options(int argc, char *argv[])
{
  strcpy(domainName, argv[1]);
  if (argc >= 3 && (strcmp(argv[2], "-v")==0)) {
    if (argc == 3)
      verbosity = 1; /* lowest logging level when "-v" has no argument */
    else
      sscanf(argv[3], "%d", &verbosity);
    open_log_file();
  }
} /* parse_options */

/* ------------------------------------------------------------------------ */
/*
 * Records x.value[f] in distribution[f][x.class], keeping that array in
 * increasing value order: an existing equal value has its count incremented;
 * otherwise a new entry with count 1 is inserted and noValues[f][x.class]
 * grows by one.
 *
 * NOTE(review): the local `c` is never used, and nothing here checks that
 * noValues[f][x.class] stays below the array capacity before growing.
 */
void insert(INSTANCE x, FEATURE f)
{
  int i, j=0;
  int done = FALSE;
  CLASS c;

  /* Scan for the first stored value >= x.value[f]. Because j++ still runs
     after done is set, j ends one past the matching index. */
  for (j=0;( (j < noValues[f][x.class]) && (!done) ); j++) {
    if ( distribution[f][x.class][j].value >= x.value[f] )
      done = TRUE;
  }
  /* When done is set, j-1 indexes that larger-or-equal value. */
  if ( done ) {
    if ( x.value[f] == distribution[f][x.class][j-1].value )
      distribution[f][x.class][j-1].count++; /* value already present */
    else { /* a strictly larger value sits at j-1: open a slot there */
      for (i=noValues[f][x.class]; i>=j; i--) { /* shift entries right by one */
        distribution[f][x.class][i].value =distribution[f][x.class][i-1].value;
        distribution[f][x.class][i].count =distribution[f][x.class][i-1].count;
      }
      distribution[f][x.class][j-1].value = x.value[f];
      distribution[f][x.class][j-1].count = 1;
      noValues[f][x.class]++;
    }
  }
  else { /* no stored value is >= x.value[f]; j == noValues, so append */
    distribution[f][x.class][j].value = x.value[f];
    distribution[f][x.class][j].count = 1;
    noValues[f][x.class]++;
  }
} /* insert */

/* ------------------------------------------------------------------------ */
/*
 * Fills distribution[][][] and noValues[][] from the first No_Trainers
 * entries of trainData. Only features whose status is KNOWN are recorded.
 */
void train ()
{
  FEATURE f;
  CLASS nowc;
  int index;

  for (index=0; index < No_Trainers; index++) {
    for (f=0; f < No_Features; f++) {
      if (trainData[index].status[f] == KNOWN) {
        nowc = trainData[index].class;
        if (noValues[f][nowc] == 0) { /* first value for this (f, class) */
          distribution[f][nowc][0].value = trainData[index].value[f];
          distribution[f][nowc][0].count = 1;
          noValues[f][nowc] = 1;
        }
        else /* table already has entries: ordered insert */
          insert(trainData[index], f);
      }
    }
  }
} /* train */

/* ------------------------------------------------------------------------ */
/*
 * Calls read_info(), sets No_Trainers from read_train_data(), optionally
 * logs the sizes and data to logf, then calls train().
 */
void initialize()
{
  int i, f, c;

  read_info();
  if (verbosity > 0) {
    fprintf(logf,"No_Features: %d\n", No_Features);
    fprintf(logf,"No_Classes: %d\n", No_Classes);
    fflush(logf);
  }
  No_Trainers = read_train_data(); /* original note: initializes trainData from domain.train */
  /* NOTE(review): text is missing inside the next statement. What remains
     is the start of a for loop fused with the tail of a verbosity check;
     whatever ran in between (No_Testers is not assigned anywhere in the
     visible code) cannot be recovered from this copy. */
  for (i=0; i 3) {
    fprintf(logf, "train data:\n");
    log_data(trainData, No_Trainers);
    fprintf(logf, "test data:\n");
    log_data(testData, No_Testers);
  }
  if (verbosity > 0) {
    fprintf(logf,"No_Trainers: %d\n", No_Trainers);
    fprintf(logf,"No_Testers: %d\n", No_Testers);
    /* NOTE(review): runs c = 0..No_Classes inclusive, while the dump loop in
       p() starts at 1; confirm which class indices are valid. */
    for (c=0; c <= No_Classes; c++)
      fprintf(logf, "classCount[%d]:%d ", className[c], classCount[c]);
    fprintf(logf, "\n");
    fflush(logf);
  }
  train (); /* builds distribution[][][] and noValues[][] */
} /* initialize */

/*
 * p(x, f, c): returns 0.0 when classCount[c] or noValues[f][c] is zero.
 * NOTE(review): the rest of the computation is missing from this copy (see
 * the flagged statements below), so how `density` is derived and what is
 * returned otherwise cannot be documented from here.
 */
float p(VALUE x, FEATURE f, CLASS c)
{
  int i, f1, c1; /* index variables; f1 and c1 drive the dump loops below */
  float density=0.0;
  int done=FALSE;

  if ( (classCount[c] == 0) || (noValues[f][c] == 0) ) return(0.0);
  if ( !linear[f] ) { /* nominal */
    /* NOTE(review): text is missing inside the next statement; the loop body
       and everything up to a verbosity check are gone. */
    for (i=0; i2)
      fprintf(logf, "c: %d, f: %d density: %g\n", className[c], f, density);
    if (verbosity>3) {
      for (f1=0; f1 < No_Features; f1++) {
        for (c1=1; c1<=No_Classes; c1++) {
          /* NOTE(review): text is missing inside the next statement. The
             remainder of p() and the header and opening of nbc() are
             presumably in the gap; the code after it uses locals
             (prediction, correct_count, test_acc, i, f, c) whose
             declarations are not visible. */
          for (i=0; i 0) fprintf(logf,"\n");

    /* ---- from here on: the visible tail of nbc() ----
       For the current test instance i, each class c gets
       g[c] = classCount[c] / No_Trainers multiplied by p(value, f, c) over
       every feature whose status is KNOWN; the class with the largest g[c]
       becomes the prediction. */
    if (verbosity > 2) {
      fprintf (logf, "Test instance no: %d\n", i);
      for (f=0; f< No_Features; f++)
        fprintf(logf, "%g ", testData[i].value[f]);
      fprintf(logf, "\n");
      fflush(logf);
    }
    /* NOTE(review): c runs 0..No_Classes inclusive; confirm the range. */
    for (c=0; c <= No_Classes; c++) {
      g[c] = (float)classCount[c] / No_Trainers;
      for (f=0; f< No_Features; f++)
        if (testData[i].status[f] == KNOWN)
          g[c] = g[c] * p(testData[i].value[f], f, c);
      if (g[c] > g[prediction]) prediction = c; /* track the best-scoring class so far */
    }
    if (prediction == testData[i].class) correct_count++;
    if (verbosity > 1) {
      for (c=0; c <= No_Classes; c++)
        fprintf(logf, "g[%d]:%g ", className[c], g[c]);
      fprintf(logf, "\n");
    }
    if (verbosity > 0) {
      fprintf(logf, "%d: prediction: %d actual: %d\n", i, className[prediction], className[testData[i].class]);
      fflush(logf);
    }
  }
  /* Fraction of test instances predicted correctly.
     NOTE(review): no guard for No_Testers == 0. */
  test_acc = (float)correct_count/No_Testers;
  if (verbosity>0) fprintf (logf, "Test accuracy: %g\n", test_acc);
  return (test_acc);
} /* nbc */

/* ------------------------------------------------------------------------ */
/*
 * Program entry point (old-style parameter declarations).
 *
 * With no arguments, prints a usage line and exits with status 1. Otherwise
 * parses the options, times initialize() and nbc() with gettimeofday(), and
 * writes accuracy plus both elapsed times to "<domainName>.res" (and to logf
 * when verbosity > 0).
 *
 * NOTE(review): the malloc and fopen results are used without a NULL check,
 * the sprintf into resName is unbounded, the locals c, i and f are unused,
 * and the allocated timeval/timezone objects are never freed.
 */
main(argc, argv)
int argc;
char *argv[];
{
  FILE *fopen(), *resultf;
  float accuracy;
  int c, i, f;

  if (argc == 1) {
    printf("nbc domain -v verbosity\n");
    exit(1);
  }
  else parse_options(argc,argv);

  start_train = (struct timeval *) malloc(sizeof(struct timeval));
  end_train = (struct timeval *) malloc(sizeof(struct timeval));
  end_test = (struct timeval *) malloc(sizeof(struct timeval));
  tz = (struct timezone *) malloc(sizeof(struct timezone));

  gettimeofday(start_train, tz);
  initialize();
  gettimeofday(end_train, tz);
  accuracy = nbc();
  gettimeofday(end_test, tz);

  /* "Train" covers initialize(); "Test" covers nbc(). */
  train_time = time_diff (end_train, start_train);
  test_time = time_diff (end_test, end_train);

  sprintf(resName,"%s.res", domainName);
  resultf = fopen(resName, "w");
  fprintf (resultf, "Accuracy: %5.3f Train: %d Test: %d msec.\n", accuracy, train_time, test_time);
  if (verbosity>0)
    fprintf (logf,"Accuracy: %5.3f Train: %d Test: %d msec.\n", accuracy, train_time, test_time);
  fclose(resultf);
  if (verbosity>0) fclose(logf);
  return 0;
} /* main */