/****************************************************************/ /* Copyright 1993 : Johns Hopkins University */ /* Department of Computer Science */ /****************************************************************/ /* Contact : murthy@cs.jhu.edu */ /****************************************************************/ /* File Name : mktree.c */ /* Author : Sreerama K. Murthy */ /* Last modified : October 1993 */ /* Contains modules : main */ /* initialize */ /* deallocate */ /* read_data */ /* build_dt */ /* axis_parallel_split */ /* oblique_split */ /* cross_validate */ /* print_classification_result */ /* print_log */ /* mktree_help */ /* Uses modules in : oc1.h */ /* util.c */ /* load_data.c */ /* classify.c */ /* train_util.c */ /* classify_util.c */ /* compute_impurity.c */ /* perturb.c */ /* prune.c */ /* Is used by modules in : None. */ /* Remarks : This file has the OC1 modules, that */ /* build the decision trees recursively. */ /****************************************************************/ #include "oc1.h" char *pname; char dt_file[LINESIZE],train_data[LINESIZE]; char test_data[LINESIZE],failed_data[LINESIZE]; char log_file[LINESIZE]; int no_of_dimensions=0,no_of_coeffs,no_of_categories=0; int no_of_iterations=20,no_of_folds=0; int unlabeled = FALSE,verbose=FALSE; int order_of_perturbation = SEQUENTIAL; int oblique = TRUE; int coeff_modified = FALSE; int cycle_count=50; int *left_count=NULL, *right_count=NULL; int max_no_of_random_perturbations = 5; int no_of_stagnant_perturbations; int no_of_train_points=0,no_of_test_points=0,no_of_ptest_points=0; float compute_impurity(); float *coeff_array,*best_coeff_array; float prune_portion=0.1; double *temp_val; void srand48(); struct unidim *inequalities; struct test_outcome classify_and_estimate_accuracy(); struct tree_node *prune(); FILE *logfile; POINT **train_points=NULL,**test_points=NULL,**ptest_points=NULL; /* ptest_points stores the part of the training set that is used in pruning. 
*/ int no_of_calls=0,no_of_costly_calls=0; /************************************************************************/ /* Module name : main */ /* Functionality : Accepts user's options as input, sets control */ /* variables accordingly, and invokes the appro- */ /* priate data-reading, tree-building and */ /* classifying routines. */ /* Parameters : argc,argv : See any standard C textbook for details. */ /* Returns : nothing. */ /* Calls modules : mktree_help */ /* print_log */ /* read_data (load_data.c) */ /* initialize */ /* build_dt */ /* write_tree (train_util.c) */ /* cross_validate */ /* print_classification_result */ /* deallocate */ /* read_tree (classify_util.c) */ /* classify_and_estimate_accuracy (classify.c) */ /* classify (classify.c) */ /* prune (prune.c) */ /* Is called by modules : None. */ /************************************************************************/ main (argc, argv) int argc; char *argv[]; { extern char *optarg; extern int optind; int c1,leaf_count(),tree_depth(); int i,no_of_correctly_classified_test_points; struct tree_node *root = NULL,*build_dt(),*read_tree(); struct test_outcome result; float accuracy; strcpy(train_data,"\0"); strcpy(test_data,"\0"); strcpy(dt_file,"\0"); strcpy(failed_data,"\0"); strcpy(log_file,"oc1.log"); pname = argv[0]; if (argc==1) mktree_help(); while ((c1 = getopt (argc, argv, "t:n:T:uV:d:c:abr:vAi:s:m:p:D:M:l:")) != EOF) switch (c1) { case 't': /*Data for training. */ strcpy(train_data,optarg); break; case 'n': no_of_train_points = atoi (optarg); if (no_of_train_points < 0) mktree_help(); break; case 'T': /*Data for testing. */ strcpy(test_data,optarg); break; case 'u': /*Test data is unlabeled. Classify it. */ unlabeled = TRUE; break; case 'V': no_of_folds = atoi (optarg); /* If this is nonzero, V-fold cross-validation is used for estimating the accuracy. Overrides the -T option. This number can be -1, denoting "leave-one-out" cross-validation. 
*/ break; case 'd': no_of_dimensions = atoi (optarg); if (no_of_dimensions <= 0) mktree_help(); break; case 'c': no_of_categories = atoi (optarg); if (no_of_categories <= 0) mktree_help(); break; case 'a': /*Axis parallel splits only */ oblique = FALSE; break; case 'b': if (oblique == FALSE) mktree_help(); order_of_perturbation = BEST_FIRST; break; case 'r': if (oblique == FALSE) mktree_help(); order_of_perturbation = RANDOM; cycle_count = atoi (optarg); break; case 'v': verbose = TRUE; break; case 'i': /*No of trials (in terms of initial hyperplanes) at each node of the decision tree. */ if (oblique == FALSE) mktree_help(); no_of_iterations = atoi (optarg); if (no_of_iterations < 0) mktree_help(); break; case 's': /*Seed for the random number generator */ srand48(atol(optarg)); break; case 'm': /*Maximum number of random perturbations tried when stuck in a local minimum. */ if (oblique == FALSE) mktree_help(); max_no_of_random_perturbations = atoi(optarg); if (max_no_of_random_perturbations < 0) mktree_help(); break; case 'p': prune_portion = atof(optarg); if (prune_portion < 0 || prune_portion >= 1) mktree_help(); break; case 'D': /*The decision tree file. If OC1 is used in the training mode, the decision tree is output to this file. In testing mode, the decision tree is read from this file. */ strcpy(dt_file,optarg); break; case 'M': /*File into which the test data on which the classifier fails is output. */ strcpy(failed_data,optarg); break; case 'l': /*File into which a log of the running of OC1 is written. */ strcpy(log_file,optarg); break; default : mktree_help(); } logfile = fopen(log_file,"w"); if (strlen(train_data)) { if (strlen(test_data) || no_of_folds != 0) read_data(train_data,0); else { i = no_of_train_points; read_data(train_data,i); /*this call loads training, test and prune_test point sets. 
see the module header for read_data (load_data.c) */ if (unlabeled == TRUE && no_of_test_points) mktree_help(); } if (verbose) { printf("%d training examples loaded from %s.\n", no_of_train_points+no_of_ptest_points,train_data); if (no_of_folds == 0) { printf("%d used for training, %d for pruning.\n", no_of_train_points, no_of_ptest_points); if (no_of_test_points != 0) printf("%d testing examples loaded from %s.\n", no_of_test_points,train_data); } printf("Dimensions = %d, Categories = %d\n", no_of_dimensions,no_of_categories); } initialize(no_of_train_points); if (no_of_folds == 0) /* No cross validation. */ { root = build_dt("\0",train_points,no_of_train_points); if (root != NULL) { root->parent = NULL; if (prune_portion > 0) root = prune(root); if (!strlen(dt_file)) sprintf(dt_file,"%s.dt",train_data); write_tree(root,dt_file); if (prune_portion > 0) printf("Pruned decision tree written to %s.\n",dt_file); else printf("Unpruned decision tree written to %s.\n",dt_file); printf("Leaf Count = %d, Tree Depth = %d\n", leaf_count(root),tree_depth(root)); } else { fprintf(stderr,"No decision tree could be built with the current "); fprintf(stderr,"settings of parameters for this dataset.\n"); print_log(logfile); fclose(logfile); exit(0); } } else { if (no_of_folds == -1) /* leave-one-out cross validation. 
*/ no_of_folds = no_of_train_points; if ( no_of_folds <= 1 || no_of_folds > no_of_train_points) mktree_help(); cross_validate(train_points,no_of_train_points); } deallocate(no_of_train_points); if (no_of_folds != 0) { print_log(logfile); fclose(logfile); exit(1); } } if (strlen(test_data)) { read_data(test_data,-1); if (verbose) printf("%d testing examples loaded from %s.\n", no_of_test_points,test_data); } if (no_of_test_points != 0) { if (root == NULL) { if ((root = read_tree(dt_file)) != NULL) { if (verbose) { print_header(); printf("Decision tree read from %s\n.",dt_file);} } else error("MKTREE: Decision tree couldn't be read."); } if (unlabeled == TRUE) { char out_file[LINESIZE]; FILE *outfile; sprintf(out_file,"%s.classified",test_data); classify(test_points,no_of_test_points,root,out_file); } else { result = classify_and_estimate_accuracy (test_points,no_of_test_points,root); print_classification_result(result); } } print_log(logfile); fclose(logfile); } /************************************************************************/ /* Module name : initialize */ /* Functionality : Allocates space for some global data structures.*/ /* Parameters : no_of_points : size of the training dataset. */ /* Returns : nothing. 
*/ /* Calls modules : vector (util.c) */ /* ivector (util.c) */ /* dvector (util.c) */ /* Is called by modules : main */ /************************************************************************/ initialize(no_of_points) int no_of_points; { no_of_coeffs = no_of_dimensions+1; coeff_array = vector(1,no_of_coeffs); best_coeff_array = vector(1,no_of_coeffs); left_count = ivector(1,no_of_categories); right_count = ivector(1,no_of_categories); inequalities = (struct unidim *)malloc((unsigned)no_of_points * sizeof(struct unidim)); inequalities -= 1; temp_val = dvector(1,no_of_points); } /************************************************************************/ /* Module name : deallocate */ /* Functionality : Frees space allocated to some global data structures.*/ /* Parameters : no_of_points : size of the training dataset. */ /* Returns : nothing. */ /* Calls modules : free_vector (util.c) */ /* free_ivector (util.c) */ /* free_dvector (util.c) */ /* Is called by modules : main */ /************************************************************************/ deallocate(no_of_points) int no_of_points; { free_vector(coeff_array,1,no_of_coeffs); free_ivector(left_count,1,no_of_categories); free_ivector(right_count,1,no_of_categories); free_vector(best_coeff_array,1,no_of_coeffs); if (!free((char *)(inequalities+1))) fprintf(stderr,"Deallocate : Memory deallocation failure. Harmless.\n"); free_dvector(temp_val,1,no_of_points); } /************************************************************************/ /* Module name : build_dt */ /* Functionality : Recursively builds a decision tree. i.e., finds */ /* the best (heuristic) hyperplane separating the */ /* given set of points, and recurses on both sides */ /* of the hyperplane. The best axis-parallel split */ /* is considered before computing oblique splits. */ /* Parameters : node_str : Label to be assigned to the decision tree */ /* node to be created. (see Important Variables used in */ /* this module header). 
*/ /* cur_points : array of pointers to the points under */ /* consideration. */ /* cur_no_of_points : Number of points under consideration */ /* Returns : pointer to the decision tree node created. */ /* NULL, if a node couldn't be created. */ /* Calls modules : set_counts (compute_impurity.c) */ /* compute_impurity (compute_impurity.c) */ /* axis_parallel_split */ /* vector (util.c) */ /* oblique_split */ /* free_vector (util.c) */ /* error (util.c) */ /* find_values (perturb.c) */ /* largest_element (compute_impurity.c) */ /* build_dt */ /* Is called by modules : main */ /* build_dt */ /* cross_validate */ /* Important Variables used : initial_impurity: "inherent" impurity in*/ /* the point set under consideration. ie., */ /* impurity when the separating hyperplane */ /* lies on one side of the point set. */ /* If any amount of perturbations (bounded */ /* by the parametric settings) can not */ /* result in a hyperplane that has a lesser*/ /* impurity than this value, no new tree */ /* node is created. 
*/
/* Remarks : Child labels are the parent label plus "l" or "r" and are  */
/*           held in MAX_DT_DEPTH-byte buffers. A node whose children's */
/*           labels would not fit is kept but not split any further.    */
/************************************************************************/
struct tree_node *build_dt(node_str,cur_points,cur_no_of_points)
     char *node_str;
     POINT **cur_points;
     int cur_no_of_points;
{
  struct tree_node *cur_node;
  POINT **lpoints = NULL,**rpoints = NULL;
  int i,lindex,rindex,lpt,rpt;
  int largest_element();
  float oblique_split(),axis_parallel_split();
  float oblique_impurity,axis_parallel_impurity,cur_impurity;
  float initial_impurity;
  float *ap_coeff_array;
  char lnode_str[MAX_DT_DEPTH],rnode_str[MAX_DT_DEPTH];

  /* Nothing to separate. */
  if (cur_no_of_points <= 1) return(NULL);

  /* Impurity with every point on one side of the hyperplane. */
  set_counts(cur_points,cur_no_of_points,0);
  initial_impurity = compute_impurity(cur_no_of_points);
  if (initial_impurity == 0) return(NULL);

  axis_parallel_impurity = axis_parallel_split(cur_points,cur_no_of_points);
  if (axis_parallel_impurity && oblique)
    {
      /* Remember the axis-parallel hyperplane: oblique_split overwrites */
      /* coeff_array and is only kept if it is strictly better.          */
      ap_coeff_array = vector(1,no_of_coeffs);
      for (i=1;i<=no_of_coeffs;i++) ap_coeff_array[i] = coeff_array[i];

      oblique_impurity = oblique_split(cur_points,cur_no_of_points);

      if (oblique_impurity >= axis_parallel_impurity)
	{
	  for (i=1;i<=no_of_coeffs;i++) coeff_array[i] = ap_coeff_array[i];
	  coeff_modified = TRUE;
	  cur_impurity = axis_parallel_impurity;
	}
      else cur_impurity = oblique_impurity;

      free_vector(ap_coeff_array,1,no_of_coeffs);
    }
  else cur_impurity = axis_parallel_impurity;

  if (cur_impurity >= initial_impurity)
    /*Can not split this node given current parameter settings. */
    return(NULL);

  if (verbose)
    {
      if (strlen(node_str)) printf(" \"%s\" hyperplane found.\n",node_str);
      else printf(" Root hyperplane found.\n");
    }

  cur_node = (struct tree_node *)malloc(sizeof(struct tree_node));
  if (cur_node == NULL)
    error("BUILD_DT : Memory allocation failure.");

  cur_node->coefficients = vector(1,no_of_coeffs);
  cur_node->left_count = ivector(1,no_of_categories);
  cur_node->right_count = ivector(1,no_of_categories);

  for (i=1;i<=no_of_coeffs;i++)
    cur_node->coefficients[i] = coeff_array[i];
  cur_node->left = cur_node->right = NULL;
  cur_node->no_of_points = cur_no_of_points;
  /* NOTE(review): assumes tree_node.label holds at least as many bytes */
  /* as the MAX_DT_DEPTH label buffers used here - confirm in oc1.h.    */
  strcpy(cur_node->label,node_str);

  /* Recompute the side of each point if the hyperplane in coeff_array  */
  /* was changed after the values were last computed.                   */
  if (coeff_modified == TRUE)
    find_values(cur_points,cur_no_of_points);

  set_counts(cur_points,cur_no_of_points,1);
  for (i=1;i<=no_of_categories;i++)
    {
      cur_node->left_count[i] = left_count[i];
      cur_node->right_count[i] = right_count[i];
    }

  cur_node->left_cat = largest_element(left_count,no_of_categories);
  cur_node->right_cat = largest_element(right_count,no_of_categories);

  /* A child label needs strlen(node_str) + 1 characters plus the       */
  /* terminator. If that does not fit, stop here instead of writing     */
  /* past the end of lnode_str/rnode_str.                               */
  if (cur_impurity != 0 && strlen(node_str) + 2 > sizeof(lnode_str))
    {
      fprintf(stderr,
	      "Build_Dt: Maximum tree depth reached at \"%s\". Not splitting further.\n",
	      node_str);
      return(cur_node);
    }

  if (cur_impurity != 0)
    {
      lpt = rpt = 0;
      for (i=1;i<=no_of_categories;i++)
	{
	  lpt += left_count[i];
	  rpt += right_count[i];
	}

      lpoints = rpoints = NULL;

      if (left_count[cur_node->left_cat] != lpt)
	/* There IS impurity in the left partition */
	{
	  if ((lpoints = (POINT **) malloc ((unsigned)lpt *
					    sizeof(POINT *))) == NULL)
	    error("BUILD_DT : Memory allocation failure.");
	  lpoints--;            /* 1-based indexing */
	}

      if (right_count[cur_node->right_cat] != rpt)
	/* There IS impurity in the right partition */
	{
	  if ((rpoints = (POINT **) malloc ((unsigned)rpt *
					    sizeof(POINT *))) == NULL)
	    error("BUILD_DT : Memory Allocation Failure.");
	  rpoints--;            /* 1-based indexing */
	}

      /* Distribute the points: negative val goes left, the rest right. */
      /* Points are only collected for a side that will be split.       */
      lindex=0; rindex=0;
      for (i=1;i<=cur_no_of_points;i++)
	{
	  if (cur_points[i]->val < 0 )
	    { if (lpoints != NULL) lpoints[++lindex] = cur_points[i];}
	  else
	    { if (rpoints != NULL) rpoints[++rindex] = cur_points[i];}
	}

      if (lpoints != NULL)
	{
	  if (lindex != lpt)
	    printf("Build_Dt: Something wrong 1.\n");
	  strcpy(lnode_str,node_str);
	  strcat(lnode_str,"l");
	  cur_node->left = build_dt(lnode_str,lpoints,lpt);
	  if (cur_node->left != NULL)
	    (cur_node->left)->parent = cur_node;
	  /* free() has no return value in ISO C; nothing to test. */
	  free((char *)(lpoints+1));
	}

      if (rpoints != NULL)
	{
	  if (rindex != rpt)
	    printf("Build_Dt: Something wrong 2.\n");
	  strcpy(rnode_str,node_str);
	  strcat(rnode_str,"r");
	  cur_node->right = build_dt(rnode_str,rpoints,rpt);
	  if (cur_node->right != NULL)
	    (cur_node->right)->parent = cur_node;
	  free((char *)(rpoints+1));
	}
    }

  return(cur_node);
}

/************************************************************************/
/* Module name : oblique_split                                          */
/* Functionality : Attempts to find the hyperplane, at an unrestri-*/
/*                 cted orientation, that best separates                */
/*                 "cur_points" (minimizing the current impurity        */
/*                 measure), given the current settings of              */
/*                 parameters like no_of_iterations,max_no_of_          */
/*                 random_perturbations,order_of_perturbation etc.      */
/* Parameters : cur_points : array of pointers to the points (samples)  */
/*              under consideration.                                    */
/*              cur_no_of_points : number of points under consideration.*/
/* Returns : the impurity measure of the best hyperplane found.         */
/*           The hyperplane itself is returned through the global       */
/*           array "coeff_array".
*/
/* Calls modules : generate_random_hyperplane                           */
/*                 find_values (perturb.c)                              */
/*                 set_counts (compute_impurity.c)                      */
/*                 compute_impurity (compute_impurity.c)                */
/*                 myrandom (util.c)                                    */
/*                 suggest_perturbation (perturb.c)                     */
/*                 perturb_randomly (perturb.c)                         */
/* Is called by modules : build_dt                                      */
/* Remarks : The search starts from whatever hyperplane is in           */
/*           coeff_array on entry and is restarted from a random        */
/*           hyperplane up to no_of_iterations times.                   */
/*           Side effects on globals: coeff_array, best_coeff_array,    */
/*           coeff_modified, no_of_stagnant_perturbations, the val      */
/*           field of every point, and cycle_count (which is reused as  */
/*           a pass counter when the order is not RANDOM).              */
/************************************************************************/
float oblique_split(cur_points,cur_no_of_points)
     POINT **cur_points;
     int cur_no_of_points;
{
  int i,j,old_nsp;
  int iteration_count = 1;
  int cur_coeff,improved_in_this_cycle,best_coeff_to_improve;
  int perturb_randomly();
  float cur_error,old_cur_error,best_cur_error,least_error;
  float x;
  float suggest_perturbation();

  /*Starts with the best axis parallel hyperplane. */
  find_values(cur_points,cur_no_of_points);
  set_counts(cur_points,cur_no_of_points,1);
  least_error = cur_error = compute_impurity(cur_no_of_points);

  /* The starting hyperplane is the best one known so far. Without this */
  /* seed, best_coeff_array would hold stale values whenever no restart */
  /* below records a result, and those would be copied into coeff_array */
  /* at the end.                                                        */
  for (i=1;i<=no_of_coeffs;i++) best_coeff_array[i] = coeff_array[i];

  while (least_error != 0.0 && iteration_count <= no_of_iterations)
    {
      no_of_stagnant_perturbations = 0;

      if (order_of_perturbation == RANDOM)
	{
	  if (cycle_count <= 0) cycle_count = 10*no_of_dimensions;
	  for (i=1;i<=cycle_count;i++)
	    {
	      if (cur_error == 0.0) break;

	      /* Pick a coefficient at random; 0 is not a valid index. */
	      cur_coeff = 0;
	      while (!cur_coeff)
		cur_coeff = (int)myrandom(1,no_of_coeffs+1);

	      x = suggest_perturbation(cur_points,cur_no_of_points,cur_coeff,&cur_error);
	      if (x != HUGE)
		{
		  /* Accept: install the coefficient and the point      */
		  /* values left in temp_val.                           */
		  coeff_array[cur_coeff] = x;
		  for (j=1;j<= cur_no_of_points;j++)
		    cur_points[j]->val = temp_val[j];
		}
	      else
		/*Try improving in a random direction*/
		{
		  improved_in_this_cycle = FALSE;
		  j = 0;
		  while (cur_error != 0 &&
			 !improved_in_this_cycle &&
			 ++j<=max_no_of_random_perturbations)
		    improved_in_this_cycle =
		      perturb_randomly(cur_points,cur_no_of_points,&cur_error);
		}
	    }
	}
      else
	{
	  /* SEQUENTIAL or BEST_FIRST: sweep the coefficients until a   */
	  /* whole pass brings no improvement.                          */
	  improved_in_this_cycle = TRUE;
	  cycle_count = 0;

	  while (improved_in_this_cycle)
	    {
	      if (cur_error == 0.0) break;
	      cycle_count++;
	      improved_in_this_cycle = FALSE;

	      if (order_of_perturbation == BEST_FIRST)
		{
		  /* Saved so each trial below can be undone. */
		  old_cur_error = cur_error;
		  best_cur_error = HUGE;
		  best_coeff_to_improve = 1;
		  old_nsp = no_of_stagnant_perturbations;
		}

	      for (cur_coeff = 1; cur_coeff <= no_of_coeffs;cur_coeff++)
		{
		  x = suggest_perturbation(cur_points,cur_no_of_points,
					   cur_coeff,&cur_error);
		  if (order_of_perturbation == BEST_FIRST)
		    {
		      /* Only note the most promising coefficient, then */
		      /* restore the state changed by the trial.        */
		      if (cur_error < best_cur_error)
			{
			  best_cur_error = cur_error;
			  best_coeff_to_improve = cur_coeff;
			}
		      cur_error = old_cur_error;
		      no_of_stagnant_perturbations = old_nsp;
		      if (best_cur_error == 0) break;
		    }
		  else
		    /* Sequential order of perturbation */
		    if (x != HUGE)
		      {
			coeff_array[cur_coeff] = x;
			for (j=1;j<= cur_no_of_points;j++)
			  cur_points[j]->val = temp_val[j];
			improved_in_this_cycle = TRUE;
			if (cur_error == 0) break;
		      }
		}

	      if (order_of_perturbation == BEST_FIRST &&
		  best_cur_error <= cur_error)
		{
		  /* Redo the perturbation of the chosen coefficient,   */
		  /* this time keeping its result.                      */
		  cur_coeff = best_coeff_to_improve;
		  x = suggest_perturbation(cur_points,cur_no_of_points,
					   cur_coeff,&cur_error);
		  if (x != HUGE)
		    {
		      coeff_array[cur_coeff] = x;
		      for (j=1;j<= cur_no_of_points;j++)
			cur_points[j]->val = temp_val[j];
		      improved_in_this_cycle = TRUE;
		    }
		}

	      if (cur_error != 0 && !improved_in_this_cycle)
		/*Try improving in a random direction*/
		{
		  i = 0;
		  while (cur_error != 0 &&
			 !improved_in_this_cycle &&
			 ++i<=max_no_of_random_perturbations)
		    improved_in_this_cycle =
		      perturb_randomly(cur_points,cur_no_of_points,&cur_error);
		}
	    }
	}

      /* Keep this restart's result if it is better; on a tie the       */
      /* choice depends on the value drawn from myrandom.               */
      if (cur_error < least_error ||
	  (cur_error == least_error && myrandom(0,1) > 0.5))
	{
	  least_error = cur_error;
	  for (i=1;i<=no_of_coeffs;i++)
	    best_coeff_array[i] = coeff_array[i];
	}

      /* Set up the next restart from a random hyperplane. */
      iteration_count++;
      generate_random_hyperplane(coeff_array);
      coeff_modified = TRUE;
      find_values(cur_points,cur_no_of_points);
      set_counts(cur_points,cur_no_of_points,1);
      cur_error = compute_impurity(cur_no_of_points);
    }

  /* Hand back the best hyperplane and bring the point values in line   */
  /* with it.                                                           */
  for (i=1;i<=no_of_coeffs;i++) coeff_array[i] = best_coeff_array[i];
  coeff_modified = TRUE;
  find_values(cur_points,cur_no_of_points);

  return(least_error);
}

/************************************************************************/
/* Module name : axis_parallel_split                                    */
/* Functionality : Attempts to find the hyperplane, at an axis-         */
/*                 parallel
orientation, that best separates */ /* "cur_points" (minimizing the current impurity */ /* measure). */ /* Parameters : cur_points : array of pointers to the points (samples) */ /* under consideration. */ /* cur_no_of_points : number of points under consideration.*/ /* Returns : the impurity measure of the best hyperplane found. */ /* The hyperplane itself is returned through the global */ /* array "coeff_array". */ /* Calls modules : linear_split (perturb.c) */ /* find_values (perturb.c) */ /* set_counts (compute_impurity.c) */ /* compute_impurity (compute_impurity.c) */ /* Is called by modules : build_dt */ /************************************************************************/ float axis_parallel_split(cur_points,cur_no_of_points) POINT **cur_points; int cur_no_of_points; { int i,j,cur_coeff,best_coeff; float cur_error,best_error,best_coeff_split_at; float linear_split(); for (cur_coeff=1;cur_coeff<=no_of_dimensions;cur_coeff++) { for (i=1;i<=no_of_coeffs;i++) coeff_array[i] = 0; coeff_array[cur_coeff] = 1; for (j=1;j<=cur_no_of_points;j++) { inequalities[j].value = cur_points[j]->dimension[cur_coeff]; inequalities[j].cat = cur_points[j]->category; } coeff_array[no_of_coeffs] = -1.0 * (float)linear_split(cur_no_of_points,&cur_error); if (cur_coeff == 1 || cur_error < best_error) { best_coeff = cur_coeff; best_coeff_split_at = coeff_array[no_of_coeffs]; best_error = cur_error; } if (best_error == 0) break; } for (i=1;ival = 0; if (i >= fold_begin && i <= fold_end) test_points[++no_of_test_points] = points[i]; else { if (no_of_ptest_points < prune_count) ptest_points[++no_of_ptest_points] = points[i]; else train_points[++no_of_train_points] = points[i]; } } if (verbose) printf("Test on points %d - %d; Train & prune on the rest: \n", fold_begin,fold_end); root = build_dt("\0",train_points,no_of_train_points); if (root != NULL) { root->parent = NULL; if (prune_portion > 0) root = prune(root); if (fold_begin == 1) { if (!strlen(dt_file)) 
sprintf(dt_file,"%s.dt",train_data); write_tree(root,dt_file); if (prune_portion > 0) printf("Pruned decision tree 1 written to %s.\n",dt_file); else printf("Unpruned decision tree 1 written to %s.\n",dt_file); } } i = 1 + (fold_begin -1)/fold_size; results[i] = classify_and_estimate_accuracy(test_points,no_of_test_points,root); if (fold_end == no_of_points) break; else fold_begin = fold_end+1; } if (!free((char *)(test_points+1))) fprintf(stderr,"Cross_Validate: Memory deallocation failure. Harmless.\n"); if (!free((char *)(train_points+1))) fprintf(stderr,"Cross_Validate: Memory deallocation failure. Harmless. \n"); resultsum.leaf_count = resultsum.tree_depth = resultsum.accuracy = 0; resultsum.class = ivector(1,2*no_of_categories); for (i=1;i<=2 * no_of_categories;i++) resultsum.class[i] = 0; for (i=1;i<=no_of_folds;i++) { printf("Fold %d: LC = %.0f TD = %.0f, Acc = %.2f\n", i,results[i].leaf_count,results[i].tree_depth,results[i].accuracy); resultsum.leaf_count += results[i].leaf_count; resultsum.tree_depth += results[i].tree_depth; for (j=1;j<= 2 * no_of_categories;j++) resultsum.class[j] += results[i].class[j]; } resultsum.leaf_count /= no_of_folds; resultsum.tree_depth /= no_of_folds; for (i=1;i<= no_of_categories;i++) no_of_correctly_classified_points += resultsum.class[2*i-1]; resultsum.accuracy = 100.0 * no_of_correctly_classified_points / no_of_points; printf("\nUsing %d-fold cross validation:\n",no_of_folds); print_classification_result(resultsum); mean = resultsum.leaf_count; variance = 0; for (i=1;i<=no_of_folds;i++) variance += (results[i].leaf_count - mean) * (results[i].leaf_count - mean); printf("Standard deviation of leaf counts = %.3f\n", (float)sqrt((double)(variance))); mean = resultsum.accuracy; variance = 0; for (i=1;i<=no_of_folds;i++) variance += (results[i].accuracy - mean) * (results[i].accuracy - mean); printf("Standard deviation of accuracy = %.3f\n\n", (float)sqrt((double)(variance))); } 
/************************************************************************/ /* Module name : print_classification_result */ /* Functionality : prints the result of a classification ! */ /* Parameters : result : a structure giving details of the decision */ /* tree used for classification, and the accuracies */ /* obtained. */ /* Returns : nothing. */ /* Calls modules : none. */ /* Is called by modules : main */ /************************************************************************/ print_classification_result(result) struct test_outcome result; { int i; float j; printf("\nClassification accuracy = %.4f\n",result.accuracy); printf("Leaf count = %.1f Tree Depth = %.1f ", result.leaf_count,result.tree_depth); if (prune_portion > 0) printf ("(With pruning)\n"); else printf ("(Without pruning)\n"); for (i=1;i<=no_of_categories;i++) if (result.class[2*i] != 0) { j = 100.0 * result.class[2*i-1]/result.class[2*i]; printf("Class %d : Accuracy = %.3f (%d/%d)\n", i,j,result.class[2*i-1],result.class[2*i]); } printf("\n"); } /************************************************************************/ /* Module name : print_log */ /* Functionality : prints the log of a run of OC1 into the user- */ /* specified "log_file" (default : oc1.log). Log */ /* mainly consists of the parameter settings. */ /* Parameters : logfile : output File pointer. */ /* Returns : Nothing. */ /* Calls modules : none. 
*/
/* Is called by modules : main                                          */
/* Remarks : Does nothing when logfile is NULL, so a log file that      */
/*           could not be opened does not crash the run.                */
/************************************************************************/
print_log(logfile)
     FILE *logfile;
{
  /* Writing through a NULL stream is undefined; skip logging instead. */
  if (logfile == NULL) return;

  if (strlen(train_data))
    {
      fprintf(logfile,"Training data : %s\n",train_data);
    }

  if (no_of_folds)
    fprintf(logfile,"%d-fold Cross Validation used to estimate accuracy.\n",
	    no_of_folds);
  else if (strlen(test_data))
    fprintf(logfile,"Testing data : %s\n",test_data);

  fprintf(logfile,"Data is %d-dimensional, having %d classes.\n",
	  no_of_dimensions,no_of_categories);

  if (oblique == FALSE)
    fprintf(logfile,"Only axis-parallel splits considered at each node.\n");
  else
    {
      fprintf(logfile,"Parameters for finding oblique splits at each node :\n");
      fprintf(logfile,"\tNumber of iterations = %d\n",no_of_iterations);
      if (order_of_perturbation == BEST_FIRST)
	fprintf(logfile,"\tOrder of coefficient perturbation = Best First\n");
      else if (order_of_perturbation == RANDOM)
	fprintf(logfile,"\tOrder of coefficient perturbation = Random %d\n",cycle_count);
      else
	fprintf(logfile,"\tOrder of coefficient perturbation = Sequential\n");
      fprintf(logfile,"\tMaximum number of random perturbations tried at each ");
      fprintf(logfile,"local minimum = %d\n",max_no_of_random_perturbations);
    }

  /* The tree was written if we trained without cross validation, and   */
  /* read if only test data was given.                                  */
  if (strlen(train_data))
    {
      if (no_of_folds == 0)
	fprintf(logfile,"Decision tree written to %s.\n",dt_file);
    }
  else if (strlen(test_data))
    fprintf(logfile,"Decision tree read from %s.\n",dt_file);

  /* NOTE(review): this line is only logged when cross validation is    */
  /* in use (no_of_folds != 0) - confirm that this is intended.         */
  if (strlen(failed_data) && no_of_folds != 0)
    fprintf(logfile,"Misclassified points written to %s.\n",failed_data);

  fprintf(logfile,"\n");
}

/************************************************************************/
/* Module name : read_data                                              */
/* Functionality : Acts as a front-end to load_points, which is         */
/*                 the module that actually loads points.               */
/*                 Sets the global variables no_of_train_points,        */
/*                 no_of_test_points and no_of_ptest_points.            */
/* Parameters : input_file : File name from which points are loaded.    */
/*              no_of_points: number of points to be loaded.
*/
/*                0 : all points are in the training set.               */
/*                -1: all points are in the testing set.                */
/*                n(>0): n randomly-chosen points                       */
/*                       comprise the training set and                  */
/*                       the rest the testing set.                      */
/* Returns : Nothing.                                                   */
/* Calls modules : error (util.c)                                       */
/*                 load_points (load_data.c)                            */
/*                 shuffle_points                                       */
/*                 allocate_point_array (load_data.c)                   */
/*                 free_vector (util.c)                                 */
/* Is called by modules : main                                          */
/* Remarks : Unless cross validation is in use, a prune_portion         */
/*           fraction of the training share is set aside in             */
/*           ptest_points. The loaded points are copied into the        */
/*           global arrays train_points, ptest_points and test_points   */
/*           (in that order) and the temporary array is then freed.     */
/************************************************************************/
read_data(input_file,no_of_points)
     char *input_file;
     int no_of_points;
{
  FILE *infile;
  int i,j,k,count,load_points();
  POINT **points,**allocate_point_array();

  if (strlen(input_file) == 0 )
    error("READ_DATA : No data filename specified.");
  if (no_of_points < -1)
    error("READ_DATA : Invalid number of points to be loaded.");

  if ((infile = fopen(input_file,"r")) == NULL)
    error("READ_DATA : Data file can not be opened.");

  count = load_points(infile,&points);
  /* Pure test data (-1) keeps the order in which it was read. */
  if (no_of_points != -1) shuffle_points (points,count);
  fclose(infile);

  if (no_of_points > count)
    error("READ_DATA : Insufficient data in input file.");

  /* Decide how many points go into each of the three sets. */
  if (no_of_points == 0 || no_of_points == count)
    {
      no_of_test_points = 0;
      /* Cross validation carves out its own pruning sets. */
      if (no_of_folds != 0) no_of_ptest_points = 0;
      else no_of_ptest_points = (int)(count * prune_portion);
      no_of_train_points = count - no_of_ptest_points;
    }
  else if (no_of_points == -1)
    {
      no_of_test_points = count;
      no_of_ptest_points = 0;
      no_of_train_points = 0;
    }
  else
    {
      no_of_test_points = count - no_of_points;
      no_of_ptest_points = (int)(no_of_points * prune_portion);
      no_of_train_points = no_of_points - no_of_ptest_points;
    }

  /* points[1 .. no_of_train_points] -> train_points */
  if (no_of_train_points)
    {
      train_points = allocate_point_array(train_points,no_of_train_points,0);
      for (i=1;i<=no_of_train_points;i++)
	{
	  for (j=1;j<=no_of_dimensions;j++)
	    train_points[i]->dimension[j] = points[i]->dimension[j];
	  train_points[i]->category = points[i]->category;
	  train_points[i]->val = points[i]->val;
	}
    }

  /* the next no_of_ptest_points points -> ptest_points */
  if (no_of_ptest_points)
    {
      ptest_points = allocate_point_array(ptest_points,no_of_ptest_points,0);
      for (i=no_of_train_points+1;i<=no_of_train_points + no_of_ptest_points;i++)
	{
	  k = i - no_of_train_points;
	  for (j=1;j<=no_of_dimensions;j++)
	    ptest_points[k]->dimension[j] = points[i]->dimension[j];
	  ptest_points[k]->category = points[i]->category;
	  ptest_points[k]->val = points[i]->val;
	}
    }

  /* everything that is left -> test_points */
  if (no_of_test_points)
    {
      test_points = allocate_point_array(test_points,no_of_test_points,0);
      for (i=no_of_train_points+no_of_ptest_points+1; i<=count;i++)
	{
	  k = i - no_of_train_points - no_of_ptest_points;
	  for (j=1;j<=no_of_dimensions;j++)
	    test_points[k]->dimension[j] = points[i]->dimension[j];
	  test_points[k]->category = points[i]->category;
	  test_points[k]->val = points[i]->val;
	}
    }

  /* Release the temporary copy. free() returns nothing in ISO C, so    */
  /* there is no status to report.                                      */
  for (i=1;i<=count;i++)
    {
      free_vector(points[i]->dimension,1,no_of_dimensions);
      free((char *)points[i]);
    }
  free((char *)(points+1));     /* undo the 1-based offset */
}

/************************************************************************/
/* Module name : mktree_help                                            */
/* Functionality : Displays a help screen when the user makes a         */
/*                 mistake in mktree options.                           */
/* Parameters : None.                                                   */
/* Returns : Nothing.                                                   */
/* Calls modules : None.
*/
/* Is called by modules : main                                          */
/* Remarks : Does not return: the process exits with status 0 after     */
/*           the text is printed. The defaults quoted for -i, -m, -p    */
/*           and -l are the initial values of the corresponding         */
/*           globals in this file.                                      */
/************************************************************************/
mktree_help()
{
  printf ("\n\nUsage : mktree -t:n:T:uV:d:c:abr:vi:s:m:p:D:M:l:");
  printf ("\nOptions :");
  printf ("\n -t<file of training data>");
  printf ("\n -n<#points used for training, the rest are used for testing>");
  printf ("\n   (Default=All)");
  printf ("\n -T<file of testing data>");
  printf ("\n -u : Testing data is unlabeled. Classify it.");
  printf ("\n -V<#partitions for cross validation> (Default=0)");
  printf ("\n   (-1 : leave-one-out, 0 = no CV)");
  printf ("\n -d<#dimensions> ");
  printf ("\n   (Default: As given by the data or decision tree file)");
  printf ("\n -c<#categories>");
  printf ("\n   (Default: As given by the data or decision tree file)");
  printf ("\n -a : Only axis parallel splits (Default=Off)");
  printf ("\n -b : Order of coeff perturbation= Best First");
  printf ("\n   (Default=Sequential)");
  printf ("\n -r<#coeff perturbations if order = Random> ");
  printf ("\n   (Default=10 * no_of_dimensions)");
  printf ("\n -v : Verbose");
  printf ("\n -i<#restarts for the perturbation alg. at each node>");
  printf ("\n   (Default=20)");
  printf ("\n -s<seed for the random number generator>");
  printf ("\n -m<max #random perturbations tried at a local minimum>");
  printf ("\n   (Default = 5)");
  printf ("\n -p<portion of the training set used for pruning>");
  printf ("\n   (Default=0.10)");
  printf ("\n -D<decision tree file>");
  printf ("\n   (If DT is to be output, Default=<training data>.dt)");
  printf ("\n -M<file for the misclassified testing points>");
  printf ("\n   (Default = No listing)");
  printf ("\n -l<log file> (Default=oc1.log)");
  printf ("\n\n");
  exit(0);
}

/************************************************************************/
/************************************************************************/