/* Copyright (c) 1993 by The Johns Hopkins University */

/* PEBLS: Exemplar-Based Learning System

   For more information, contact:

       Steven Salzberg (salzberg@cs.jhu.edu)
       Dept. of Computer Science
       Johns Hopkins University
       Baltimore, MD 21210
*/

/* PEBLS.H:  DEFINED CONSTANTS AND TYPES */

/* ============================================================ */
/* DEFINED CONSTANTS (DO NOT MODIFY)                            */
/*                                                              */
/* Every constant below is a plain object-like macro.  The      */
/* numeric values are part of the program's interface and must  */
/* not be changed.                                              */

/* Basic Constants */

#define ON                      1
#define OFF                     0
#define TRUE                    1
#define FALSE                   0

/* <math.h> (C99 and later) also defines INFINITY.  Drop any earlier
   definition first so that PEBLS always sees its own finite sentinel
   and the compiler does not diagnose an incompatible redefinition
   when <math.h> has been included before this header. */
#undef  INFINITY
#define INFINITY                99999.9

/* Negative replacement lists are parenthesized so the macros behave
   as a single operand wherever they are expanded. */
#define UNKNOWN                 (-1)
#define CHOOSE_ANY              (-1)
#define NOOP                    (-1)

/* Output flags */

#define PCOUNT                  0
#define NCOUNT                  1
#define UCOUNT                  2
#define OCOUNT                  3

/* PEBLS Operating Modes */

#define TEST                    11
#define CLASSIFY                12

/* Data Formats */

#define STANDARD                13
#define SUBUNITS                14

/* Train & Test Modes */

#define SUBSET                  15
#define SPECIFIED_GROUP         16
#define LEAVE_ONE_OUT           17

/* Output modes */

#define AVERAGES_ONLY           18
#define DETAILED                19
#define COMPLETE                20

/* Nearest neighbor voting */

#define MAJORITY                60
#define WEIGHTED_DISTANCE       61
#define THRESHOLD               62

/* Exemplar weighting methods */

#define USED_CORRECT            70
#define ONE_PASS                71
#define INCREMENT               72
#define USER_EXEMPLAR_1         73
#define USER_EXEMPLAR_2         74
#define USER_EXEMPLAR_3         75

/* Feature Weight Standards */

#define TRIANGLE                80
#define GENETIC                 81
#define USER_DEFINED            82
#define USER_FEATURE_1          83
#define USER_FEATURE_2          84
#define USER_FEATURE_3          85

/* Post processing methods */

#define PROTEIN_STANDARD        90
#define PROTEIN_SMOOTH          91
#define PROTEIN_SMOOTH_ONLY     92
#define USER_POSTPROC_1         95
#define USER_POSTPROC_2         96
#define USER_POSTPROC_3         97

/* ERROR CODES */

#define OP_MODE_ERR             101
#define USAGE_ERR               102
#define TRAIN_SIZE_ERR          103
#define TRIALS_ERR              104
#define CLASSES_ERR             105
#define INSTANCES_ERR           106
#define NO_TRAIN_ERR            107
#define UNK_CONST_ERR           108
#define UNK_EWEIGHT_ERR         109
#define UNK_FWEIGHT_ERR         110
#define UNK_POSTPROC_ERR        111
#define K_NEIGHBOR_ERR          112
#define K_NEIGHBOR2_ERR         113
#define GENETIC_ERR             114
#define DATAFILE_ERR            115
#define UNDECLARED_VALUE_ERR 116 #define UNDECLARED_CLASS_ERR 117 #define UNK_PARAMETER_ERR 118 #define VOTING_ERR 119 #define FEATURES_ERR 120 #define VALUES_ERR 121 #define FEATURE_VALUE_ERR 122 #define FEATURE_WEIGHT_ERR 123 /* ============================================================ */ /* DEFINED TYPES */ typedef struct /* INSTANCE TYPE */ { char id[ID_LENGTH_MAX]; /* Instance ID */ int value[FEATURES_MAX]; /* Feature values */ int class_true; /* True class */ int class_nearest; /* Class of nearest neighbor */ int class_pp; /* Class after post processing */ float weight; /* Weighting parameters */ int correct; int used; int weighted; int classify_errors; /* # times instances misclassified over N trials */ int trained; /* Trained flag */ int offset; /* Offset of instance (within subunit type) */ } instance_type; typedef struct /* SUBUNIT TYPE */ { char id[ID_LENGTH_MAX]; int value[SUBUNIT_LENGTH_MAX]; int class[SUBUNIT_LENGTH_MAX]; } subunit_type; /* HASH TABLE NODES */ struct node_entry { char symbol[ID_LENGTH_MAX]; int value; struct node_entry *next; }; typedef struct node_entry node; typedef struct { int p,n,o,u; } output_type; /* CONFIGURATION INFORMATION TYPE */ typedef struct { int operating_mode; char data_file[50]; int data_format; int classes; int class_name[CLASSES_MAX][ID_LENGTH_MAX]; int features; float feature_weights[FEATURES_MAX]; int nvalues[FEATURES_MAX]; int value_spacing; int values; int common_values; int training_mode; int instances; int training_instances; int test_instances; float training_size; int post_processing; int smooth_window; int threshold[CLASSES_MAX]; int precedence[CLASSES_MAX]; int nearest_neighbor; int nearest_voting; int exemplar_weighting; int feature_weighting; float K; float R; int genetic_count; float genetic_adj; int trials; int output_mode; int debug; } config_type; /* FUNCTION PROTOTYPES */ void protein_post_process(void); int nearest_majority_vote(int k, int nearest[]); int nearest_weighted_distance_vote(int k, 
int nearest[], float distances[]); void update_nearest_list(int i, float dist, int k, int nearest[], float distances[]); void nearest_neighbor(int example, int k, int nearest_list[], float distances[], int weighting); void train_instance(int); void leave_one_out(void); void train_subset(); void train_specified_group(void); void train(void); void test(void); void initialize_data(void); void initialize_configuration(void); void print_configuration(void); void check_configuration(void); int constant_translate(char *); void process_configuration_entry(char line[]); void read_configuration_file(char filename[]); void initialize(char filename[]); void print_count(void); void initialize_training(void); void print_distance_tables(void); float dtable_entry(int f, int v1, int v2); float MVDM(int x, int y, int weighting); void build_distance_tables(void); void set_feature_weights(int shape); float W(int i); void print_exemplar_weights(void); void exemplar_weights_used_correct(void); void exemplar_weights_one_pass(void); void set_exemplar_weights(void); int hash(char symbol[]); void initialize_classtab(void); int classtab_lookup(char filename[]); void classtab_insert(char class_name[], int value); void initialize_symtab(void); int symtab_lookup(int feature, char symbol[]); void symtab_insert(int feature_index, char symbol[], int value); void standard_reader(void); void subunit_to_instances(subunit_type *subunit, int length, int training); void subunit_reader(void); float f_random(float max); int i_random(int max); void DEBUG(int i); int round(float x); float corr_coeff(double p, double n, double u, double o); void print_instances(void); void shuffle(int arr[], int tinst); void error(int code, char *string); void initialize_output(void); void compute_output_totals(void); float get_average(int class, int count); void print_averages(void); void print_output(void); void print_classification_results(void); void update_single_output(int instance, int nearest_class, int trial); void 
update_output(int trial);