/****************************************************************/ /* Copyright 1993 : Johns Hopkins University */ /* Department of Computer Science */ /****************************************************************/ /* Contact : murthy@cs.jhu.edu */ /****************************************************************/ /* File Name : impurity_measures.c */ /* Author : Sreerama K. Murthy */ /* Last modified : October 1993 */ /* Contains modules : maxminority */ /* summinority */ /* sum_of_impurities */ /* info_gain */ /* gini_index */ /* twoing */ /* Uses modules in : compute_impurity.c */ /* oc1.h */ /* Is used by modules in : compute_impurity.c */ /* Remarks : If the user wants to "plug-in" a new */ /* impurity measure, that function should */ /* be written on the lines of the functions*/ /* in this file. It is also advisable that */ /* such new functions be concatenated to */ /* this file, so that the user needn't */ /* bother with the global declarations and */ /* changing the makefile. */ /****************************************************************/ #include "oc1.h" extern int no_of_dimensions; extern int *left_count,*right_count; extern int no_of_categories; int largest_element(); /************************************************************************/ /* Module name : maxminority */ /* Functionality : Computes the impurity of the current hypeplane */ /* location. */ /* Parameters : none. */ /* Returns : larger of the minority values on the left and right. */ /* (minority on a side = sum of the counts of all classes */ /* except the class with the highest count.) */ /* Calls modules : largest_element (compute_impurity.c) */ /* Is called by modules : compute_impurity (compute_impurity.c) */ /* Important Variables used : left_count,right_count : arrays holding */ /* the number of points of each category on*/ /* the left and right of the hyperplane */ /* under consideration, respectively. */ /* These implicitly define the location of */ /* the hyperplane. */ /* Remarks : It is assumed that "left_count" and "right_count" are */ /* set properly before this module is invoked. */ /************************************************************************/ float maxminority() { int i,j,lminor=0,rminor=0; i = largest_element(left_count,no_of_categories); if (i <= no_of_categories) { for (j=1;j<=no_of_categories ;j++) if (i != j) lminor += left_count[j]; } i = largest_element(right_count,no_of_categories); if (i <= no_of_categories) { for (j=1;j<=no_of_categories ;j++) if (i != j) rminor += right_count[j]; } if (lminor > rminor) return((float)lminor); else return((float)rminor); } /************************************************************************/ /* Module name : summinority */ /* Functionality : Computes the impurity of the current hypeplane */ /* location. */ /* Parameters : none. */ /* Returns : sum of the minority values on the left and right. */ /* (minority on a side = sum of the counts of all classes */ /* except the class with the highest count.) */ /* Calls modules : largest_element (compute_impurity.c) */ /* Is called by modules : compute_impurity (compute_impurity.c) */ /* Important Variables used : left_count,right_count : arrays holding */ /* the number of points of each category on*/ /* the left and right of the hyperplane */ /* under consideration, respectively. */ /* These implicitly define the location of */ /* the hyperplane. */ /* Remarks : It is assumed that "left_count" and "right_count" are */ /* set properly before this module is invoked. */ /************************************************************************/ float summinority() { int i,j,lminor=0,rminor=0; i = largest_element(left_count,no_of_categories); if (i <= no_of_categories) { for (j=1;j<=no_of_categories ;j++) if (i != j) lminor += left_count[j]; } i = largest_element(right_count,no_of_categories); if (i <= no_of_categories) { for (j=1;j<=no_of_categories ;j++) if (i != j) rminor += right_count[j]; } return((float)(lminor+rminor)); } /************************************************************************/ /* Module name : sum_of_impurities */ /* Functionality : Computes the impurity of the current hypeplane */ /* location. */ /* Parameters : none. */ /* Returns : sum of the impurity values on the left and right. */ /* Impurity on a side = sigma ((xi - avg)*(xi - avg)) */ /* i=1..k */ /* where x1,..xk are the classes (categories) of the */ /* points on that side of the hyperplane, and */ /* avg = (x1+x2+..+xk)/k. */ /* Calls modules : none. */ /* Is called by modules : compute_impurity (compute_impurity.c) */ /* Important Variables used : left_count,right_count : arrays holding */ /* the number of points of each category on*/ /* the left and right of the hyperplane */ /* under consideration, respectively. */ /* These implicitly define the location of */ /* the hyperplane. */ /* Remarks : It is assumed that "left_count" and "right_count" are */ /* set properly before this module is invoked. */ /************************************************************************/ float sum_of_impurities() { float lavg=0,ravg=0,lerror = 0,rerror = 0; int i,lsum1=0,rsum1=0,lsum2=0,rsum2=0; int *temp1=NULL,*temp2=NULL; static int si_compare(); if (no_of_categories > 2) /* Renumber categories in descending order of their proportion of occurance. This removes the possibility for a biased impurity estimate. */ { temp1 = ivector(1,no_of_categories); temp2 = ivector(1,no_of_categories); for (i=1;i<=no_of_categories;i++) { temp1[i] = left_count[i]; temp2[i] = right_count[i]; } qsort((char *)(left_count+1),no_of_categories,sizeof(int),si_compare); qsort((char *)(right_count+1),no_of_categories,sizeof(int),si_compare); } for (i=1;i<=no_of_categories;i++) { lsum1 += left_count[i]; lsum2 += i * left_count[i]; rsum1 += right_count[i]; rsum2 += i * right_count[i]; } if (lsum1 != 0) lavg = (float)lsum2/lsum1; if (rsum1 != 0) ravg = (float)rsum2/rsum1; for (i=1;i<=no_of_categories;i++) { lerror += left_count[i] * (i - lavg) * (i - lavg); rerror += right_count[i] * (i - ravg) * (i - ravg); } if (no_of_categories > 2) /* Restore original left_count and right_count arrays. Remember, they are read_only. */ { for (i=1;i<=no_of_categories;i++) { left_count[i] = temp1[i]; right_count[i] = temp2[i]; } free_ivector(temp1,1,no_of_categories); free_ivector(temp2,1,no_of_categories); } return ((float)(lerror+rerror)); } /************************************************************************/ static int si_compare(p1,p2) int *p1,*p2; { int p; p = *p1-*p2; return(p); } /************************************************************************/ /* Module name : info_gain */ /* Functionality : Computes the (Quinlan's) information gain of */ /* the current split. As OC1 tries to minimize the */ /* "impurity" instead of maximizing the information*/ /* "gain", this module returns the reciprocal of */ /* the computed gain. */ /* Parameters : None. */ /* Returns : Reciprocal of the information gain. */ /* Calls modules : None. */ /* Is called by modules : compute_impurity (compute_impurity.c) */ /* Important Variables used : left_count,right_count : arrays holding */ /* the number of points of each category on*/ /* the left and right of the hyperplane */ /* under consideration, respectively. */ /* These implicitly define the location of */ /* the hyperplane. */ /* Remarks : It is assumed that "left_count" and "right_count" are */ /* set properly before this module is invoked. */ /************************************************************************/ float info_gain() { float presplit_info=0,postsplit_info=0,left_info=0,right_info=0; float ratio,infogain; int i,total_count=0,total_left_count=0,total_right_count=0; double log2(); for (i = 1;i<=no_of_categories;i++) { total_left_count += left_count[i]; total_right_count += right_count[i]; } total_count = total_left_count + total_right_count; if (total_count) for (i = 1;i<=no_of_categories;i++) { ratio = (float)(left_count[i]+right_count[i])/total_count; if (ratio) presplit_info += -1.0 * ratio * (float)log2((double)ratio); } if (total_left_count) { for (i = 1;i<=no_of_categories;i++) { ratio = (float)left_count[i]/total_left_count; if (ratio) left_info += -1.0 * ratio * (float)log2((double)ratio); } postsplit_info += total_left_count * left_info / total_count; } if (total_right_count) { for (i = 1;i<=no_of_categories;i++) { ratio = (float)right_count[i]/total_right_count; if (ratio) right_info += -1.0 * ratio * (float)log2((double)ratio); } postsplit_info += total_right_count * right_info / total_count; } infogain = presplit_info - postsplit_info; if (infogain == 0) /*No information gained due to this split. i.e., Either the region is homogenous or impurity is as large as it can be. */ { for (i=1;i<=no_of_categories;i++) if (left_count[i] + right_count[i] == total_count) return(0); return(HUGE); } else return(1.0/infogain); } /************************************************************************/ /* Module name : gini_index */ /* Functionality : Computes the gini_index of a node. This stati- */ /* stical measure was described in the context of */ /* decision trees by Leo Breiman et al in the CART */ /* book. */ /* Parameters : None. */ /* Returns : A floating point number, representing the impurity of */ /* a node. */ /* Calls modules : None. */ /* Is called by modules : compute_impurity (compute_impurity.c) */ /************************************************************************/ float gini_index() { float total_left_count=0,total_right_count=0; float total_left_count2,total_right_count2; float gini_left=0,gini_right=0; int i,j; for (i=1;i<=no_of_categories;i++) { total_left_count += left_count[i]; total_right_count += right_count[i]; } total_left_count2 = total_left_count * total_left_count; total_right_count2 = total_right_count * total_right_count; if (total_left_count2) for (i=1;i