/*************************************************************************/ /* */ /* Evaluation of the subsetting of a discrete attribute */ /* ---------------------------------------------------- */ /* */ /*************************************************************************/ #include "buildex.i" ItemCount *Slice1, /* Slice1[c] = saved values of Freq[x][c] in subset.c */ *Slice2; /* Slice2[c] = saved values of Freq[y][c] */ Set **Subset; /* Subset[a][s] = subset s for att a */ short *Subsets; /* Subsets[a] = no. subsets for att a */ /*************************************************************************/ /* */ /* Evaluate subsetting a discrete attribute and form the chosen */ /* subsets Subset[Att][], setting Subsets[Att] to the number of */ /* subsets, and the Info[] and Gain[] of a test on the attribute */ /* */ /*************************************************************************/ EvalSubset(Att, Fp, Lp, Items) /* ---------- */ Attribute Att; ItemNo Fp, Lp; ItemCount Items; { DiscrValue V1, V2, BestV1, BestV2, Barred; ItemCount KnownItems; ClassNo c; float BaseInfo, MinGain, ThisGain, ThisInfo, Val, BestVal, BestGain, BestInfo, PrevVal, PrevGain, PrevInfo, DiscrKnownBaseInfo(), Worth(), ComputeGain(), TotalInfo(); short Blocks=0, MissingValues=0, ReasonableSubsets, Bytes, b; Boolean MergedSubsets = false; int SaveMINOBJS; SaveMINOBJS = MINOBJS; MINOBJS = 1; /* First compute Freq[][], ValFreq[], base info, and the gain and total info of a split on discrete attribute Att */ ComputeFrequencies(Att, Fp, Lp); KnownItems = Items - ValFreq[0]; if ( KnownItems < Epsilon ) { Verbosity(2) printf("\tAtt %s: no known values\n", AttName[Att]); Gain[Att] = -Epsilon; Info[Att] = 0; return; } BaseInfo = DiscrKnownBaseInfo(KnownItems, MaxAttVal[Att]); PrevGain = ComputeGain(BaseInfo, UnknownRate[Att], MaxAttVal[Att],KnownItems); PrevInfo = TotalInfo(ValFreq, 0, MaxAttVal[Att]) / Items; PrevVal = Worth(PrevInfo, PrevGain, Epsilon); Verbosity(2) { printf("\tAtt %s", AttName[Att]); Verbosity(3) PrintDistribution(Att, MaxAttVal[Att], true); printf("\tinf %.3f, gain %.3f, val=%.3f\n", PrevInfo, PrevGain, PrevVal); } /* Eliminate unrepresented attribute values from Freq[] and ValFreq[] and form a separate subset for each represented attribute value */ Bytes = (MaxAttVal[Att]>>3) + 1; ClearBits(Bytes, Subset[Att][0]); ForEach(V1, 1, MaxAttVal[Att]) { if ( ValFreq[V1] > 0.5 ) { if ( ++Blocks < V1 ) { ValFreq[Blocks] = ValFreq[V1]; ForEach(c, 0, MaxClass) { Freq[Blocks][c] = Freq[V1][c]; } } ClearBits(Bytes, Subset[Att][Blocks]); SetBit(V1, Subset[Att][Blocks]); } else { SetBit(V1, Subset[Att][0]); MissingValues++; } } /* Merge any single-class subsets with others of the same class */ /* Note: have ValFreq[V] > 0 for all V */ ForEach(V1, 1, Blocks-1) { for ( c = 0 ; Freq[V1][c] < 0.1 ; c++ ) ; if ( Freq[V1][c] < ValFreq[V1] - 0.1 ) continue; /* Now have a single class -- look for others */ for ( V2 = V1+1 ; V2 <= Blocks ; ) { if ( Freq[V2][c] < ValFreq[V2] - 0.1 ) { V2++; } else { /* Merge these subsets */ Combine(V1, V2, Blocks); ForEach(b, 0, Bytes-1) { Subset[Att][V1][b] |= Subset[Att][V2][b]; Subset[Att][V2][b] = Subset[Att][Blocks][b]; } Blocks--; MergedSubsets = true; } } } if ( MergedSubsets ) { PrevGain = ComputeGain(BaseInfo, UnknownRate[Att], Blocks, KnownItems); PrevInfo = TotalInfo(ValFreq, 0, Blocks) / Items; PrevVal = Worth(PrevInfo, PrevGain, Epsilon); Verbosity(2) { printf("\tAfter merging single-class subsets:"); Verbosity(3) PrintDistribution(Att, Blocks, false); printf("\tinf %.3f, gain %.3f, val=%.3f\n", PrevInfo, PrevGain, PrevVal); } } /* Examine possible pair mergers and hill-climb */ MinGain = PrevGain / 2; while ( Blocks > 2 ) { BestVal = BestV1 = 0; BestGain = -Epsilon; /* Check reasonable subsets; if less than 3, bar mergers involving the largest block */ ReasonableSubsets = 0; Barred = 1; ForEach(V1, 1, Blocks) { if ( ValFreq[V1] >= SaveMINOBJS ) ReasonableSubsets++; if ( ValFreq[V1] > ValFreq[Barred] ) Barred = V1; } if ( ReasonableSubsets >= 3 ) Barred = 0; /* For each possible pair of values, calculate the gain and total info of a split in which they are treated as one. Keep track of the pair with the best gain. */ ForEach(V1, 1, Blocks-1) { ForEach(V2, V1+1, Blocks) { if ( V1 == Barred || V2 == Barred ) continue; Combine(V1, V2, Blocks); ThisGain = ComputeGain(BaseInfo, UnknownRate[Att], Blocks-1, KnownItems); ThisInfo = TotalInfo(ValFreq, 0, Blocks-1) / Items; Val = Worth(ThisInfo, ThisGain, Epsilon); Verbosity(4) { printf("\tcombine %d %d info %.3f gain %.3f val %.3f", V1, V2, ThisInfo, ThisGain, Val); PrintDistribution(Att, Blocks-1, false); } /* Force a split if less than two reasonable subsets, or using GAIN criterion Prefer this split to the previous one if gain >= MinGain (and previous < MinGain), or val >= previous best val */ if ( ThisGain >= MinGain && BestGain < MinGain || Val >= BestVal || ! BestV1 && ( ! GAINRATIO || ReasonableSubsets < 2 ) ) { BestVal = Val; BestGain = ThisGain; BestInfo = ThisInfo; BestV1 = V1; BestV2 = V2; } Uncombine(V1, V2); } } if ( GAINRATIO && ReasonableSubsets >= 2 && ( ! BestV1 || BestVal < PrevVal + 1E-5 || BestVal == PrevVal && BestGain < PrevGain ) ) break; PrevGain = BestGain; PrevInfo = BestInfo; PrevVal = BestVal; Combine(BestV1, BestV2, Blocks); ForEach(b, 0, Bytes-1) { Subset[Att][BestV1][b] |= Subset[Att][BestV2][b]; Subset[Att][BestV2][b] = Subset[Att][Blocks][b]; } Blocks--; Verbosity(2) { printf("\t\tform subset "); PrintSubset(Att, Subset[Att][BestV1]); printf(": %d subsets, inf %.3f, gain %.3f, val %.3f\n", Blocks, BestInfo, BestGain, BestVal); Verbosity(3) { printf("\t\tcombine %d, %d", BestV1, BestV2); PrintDistribution(Att, Blocks, false); } } } MINOBJS = SaveMINOBJS; if ( PrevVal <= 0 ) { Gain[Att] = -Epsilon; Info[Att] = 0; } else { Gain[Att] = ComputeGain(BaseInfo, UnknownRate[Att], Blocks, KnownItems); Info[Att] = PrevInfo; if ( MissingValues ) { Blocks++; CopyBits(Bytes, Subset[Att][0], Subset[Att][Blocks]); } Subsets[Att] = Blocks; Verbosity(2) printf("\tFinal subsets:"); Verbosity(3) PrintDistribution(Att, Blocks, false); Verbosity(2) printf("\tinf %.3f gain %.3f val %.3f\n", Info[Att], Gain[Att], Worth(Info[Att], Gain[Att], Epsilon)); } } /*************************************************************************/ /* */ /* Combine the distribution figures of discrete attribute values */ /* x and y, putting the combined figures in Freq[x][] and */ /* ValFreq[x][], and saving old values in Slice1 and Slice2 */ /* */ /*************************************************************************/ Combine(x, y, Last) /* ------- */ DiscrValue x, y, Last; { ClassNo c; ForEach(c, 0, MaxClass) { Slice1[c] = Freq[x][c]; Slice2[c] = Freq[y][c]; Freq[x][c] += Freq[y][c]; Freq[y][c] = Freq[Last][c]; } Slice1[MaxClass+1] = ValFreq[x]; Slice2[MaxClass+1] = ValFreq[y]; ValFreq[x] += ValFreq[y]; ValFreq[y] = ValFreq[Last]; } /*************************************************************************/ /* */ /* Restore old class distribution figures of discrete attribute */ /* values x and y from Slice1 and Slice2 */ /* */ /*************************************************************************/ Uncombine(x, y) /* --------- */ DiscrValue x, y; { ClassNo c; ForEach(c, 0, MaxClass) { Freq[x][c] = Slice1[c]; Freq[y][c] = Slice2[c]; } ValFreq[x] = Slice1[MaxClass+1]; ValFreq[y] = Slice2[MaxClass+1]; } /*************************************************************************/ /* */ /* Print the values of attribute Att which are in the subset Ss */ /* */ /*************************************************************************/ PrintSubset(Att, Ss) /* ----------- */ Attribute Att; Set Ss; { DiscrValue V1; Boolean First=true; ForEach(V1, 1, MaxAttVal[Att]) { if ( In(V1, Ss) ) { if ( First ) { First = false; } else { printf(", "); } printf("%s", AttValName[Att][V1]); } } } /*************************************************************************/ /* */ /* Construct and return a node for a test on a subset of values */ /* */ /*************************************************************************/ SubsetTest(Node, Att) /* ----------- */ Tree Node; Attribute Att; { ItemCount CountItems(); short S, Bytes; Sprout(Node, Subsets[Att]); Node->NodeType = BrSubset; Node->Tested = Att; Node->Errors = 0; Bytes = (MaxAttVal[Att]>>3) + 1; Node->Subset = (Set *) calloc(Subsets[Att] + 1, sizeof(Set)); ForEach(S, 1, Node->Forks) { Node->Subset[S] = (Set) malloc(Bytes); CopyBits(Bytes, Subset[Att][S], Node->Subset[S]); } }