;;; -*- Mode:Common-Lisp; Package:USER; Base:10 -*- ;;;; Standard data file: A standard example data file (possibly with a theory) sets a set of standard ;;;; global variables. The UNIVERSAL-TESTER only crucially relies on *RAW-EXAMPLES* but the rest are ;;;; standard for many systems. UNIVERSAL-TESTER makes use of *GOAL* and *THEORY* in some situations. ;;;; *** Learning systems should not alter the values of these variables or destructively modify them *** ;;;; *CATEGORIES*: A list of all categories (classes) present in the data. ;;;; *FEATURE-NAMES*: An ordered list of names for the features used to describe examples. ;;;; *DOMAINS*: An ordered list of domains for each feature where a domain is either a list ;;;; of possible values or the symbol LINEAR to indicate a real valued feature. ;;;; *RAW-EXAMPLES*: A list of examples where the first element of an example is its class. ;;;; The two standard formats for examples assumed by many systems are: ;;;; Ordered example: ( ) ;;;; e.g. (+ (big red square)) ;;;; Alist example: ( ( ) ... ) ;;;; e.g. (+ (size big) (color red) (shape square)) ;;;; ID3-ALL works with both where a sequence of feature values can be a list or an array. ;;;; UNIVERSAL-TESTER only assumes that the first element is the class and as long as ;;;; the learners are happy with the example format it is too. ;;;; *THEORY*: For theory revision problems a list of rules suitable for DEDUCE. ;;;; *GOAL*: A goal to be proven for positive examples using deduce, e.g. (CUP) ;;; Global variables used in data files (defvar *raw-examples* nil "List of examples") (defvar *feature-names* nil "A list of names for each feature") (defvar *domains* nil "A list defining the domain of each feature in the vector") (defvar *categories* '(+ -) "A list of all categories in the data") (defvar *theory* nil "Initial domain theory") (defvar *goal* nil "Top-level goal for domain theory") (defvar *missing-value* '? "Feature value representing missing") (defvar *noise* nil "Perform extra processing to handle noise and/or prevent over-fitting") (defvar *tie-breaker* 'random "How to break ties (random or ordered)") (defmacro trace-print (test-var &rest format-form) ;;; When test-var (usually a *-trace variable) is set then use formated print `(if ,test-var (format t ,@format-form))) (defmacro make-variant (new-system system parameters) `(progn (defun ,(append-symbols 'train- new-system) (examples) (let , parameters (, (append-symbols 'train- system) examples))) (defun ,(append-symbols 'test- new-system) (example result) (let , parameters (, (append-symbols 'test- system) example result))) (if (fboundp (quote , (append-symbols 'train- system '-output))) (defun ,(append-symbols 'train- new-system '-output) (result examples) (let , parameters (, (append-symbols 'train- system '-output) result examples)))) (if (fboundp (quote ,(append-symbols 'test- system '-output))) (defun ,(append-symbols 'test- new-system '-output) (result examples) (let , parameters (, (append-symbols 'test- system '-output) result examples)))) (if (fboundp (quote ,(append-symbols system '-concept-complexity))) (defun ,(append-symbols new-system '-concept-complexity) (result) (let , parameters (, (append-symbols system '-concept-complexity) result)))) (setf (get (quote , new-system) 'parent-systems) (quote (, system))) (setf (get (quote , new-system) 'expect-training-error) (get (quote , system) 'expect-training-error)) (setf (get (quote , new-system) 'incremental) (get (quote , system) 'incremental)))) ;;;;---------------------------------------------------------------------------- --------------------------------- ;;;; Access functions for various info about data ;;;;---------------------------------------------------------------------------- --------------------------------- (defun feature-name (feature-num) "Returns the name of a feature given its number" (or (elt *feature-names* feature-num) feature-num)) (defun feature-domain (feature) "Returns the domain of a feature given its number" (if (numberp feature) (elt *domains* feature) (elt *domains* (position feature *feature-names*)))) (defun feature-number (feature) (position feature *feature-names*)) (defun feature-value (feature example) (elt (second example) (if (numberp feature) feature (feature-number feature)))) (defun binary-feature-p (feature) (member (feature-domain feature) '((true false)(false true)) :test #'equal)) (defun linear-feature-p (feature) "Returns T is given feature is linear" (eq (feature-domain feature) 'linear)) (defun pos-neg? () "Returns T if categories represent positives and negatives" (and (eq (length *categories*) 2) (or (member 'negative *categories*) (member '- *categories*)))) (defun positive-category () (if (member (first *categories*) '(- negative)) (second *categories*) (first *categories*)))) (defun negative-category () (if (member (first *categories*) '(- negative)) (first *categories*) (second *categories*)))) ;;;;---------------------------------------------------------------------------- --------------------------------- ;;;; Functions for checking data for consistency ;;;;---------------------------------------------------------------------------- --------------------------------- (defun check-data (&optional (examples *raw-examples*)) (unless (eq (length *domains*)(length *feature-names*)) (format t "~%~%Length of *domains* not same as *feature-names*")) (dolist (feature-name *feature-names*) (unless (symbolp feature-name) (format t "~%~%Illegal entry in *feature-names*: ~A" feature-name))) (dolist (domain *domains*) (unless (or (eq domain 'linear) (and domain (every #'atom domain))) (format t "~%~%Illegal entry in *domains**: ~A" domain))) (cond ((alist-example-p (first examples)) (format t "~%~% Looks like a-list examples") (mapc #'check-alist-example examples)) (t (format t "~%~%Looks like ordered examples") (mapc #'check-ordered-example examples))) nil) (defun check-alist-example (example) (unless (member (first example) *categories*) (format t "~%~%Unknown class for: ~A" example)) (dolist (pair (rest example)) (unless (and (listp pair) (member (first pair) *feature-names*) (null (rest (rest pair))) (or (and (rest pair) (or (and (numberp (second pair)) (linear-feature-p (first pair))) (member (second pair) (feature-domain (first pair))) (eq (second pair) *missing-value*))) (binary-feature-p (first pair)))) (format t "~%~%Illegal feature: ~A in ~%~A" pair example)))) (defun check-ordered-example (example) (unless (member (first example) *categories*) (format t "~%~%Unknown class for: ~A" example)) (unless (and (or (consp (second example)) (arrayp (second example))) (= (length (second example)) (length *domains*))) (format t "~%~%Illegal example: ~A" example)) (dotimes (i (length *domains*)) (let ((value (elt (second example) i)) (domain (elt *domains* i))) (unless (or (eq value *missing-value*) (and (numberp value) (eq domain 'linear)) (and (listp domain) (member value domain))) (format t "~%~%Illegal feature: ~A = ~A in ~%~A" (feature-name i) value example))))) (defun find-conflicting-examples (&optional (examples *raw-examples*)) "Return list of lists of examples that have all same features but not all same class" (when examples (let (conflict matches (test (first examples))) (dolist (ex (rest examples)) (when (equalp (rest test) (rest ex)) (push ex matches) (if (not (eq (first test) (first ex))) (setf conflict t)))) (if conflict (cons (cons (first examples) matches) (find-conflicting-examples (set-difference (rest examples) matches))) (find-conflicting-examples (rest examples)))))) ;;;;---------------------------------------------------------------------------- --------------------------------- ;;;; Functions for checking theory ;;;;---------------------------------------------------------------------------- --------------------------------- (defvar *augmented-feature-names* nil) (defun check-theory-antecedents (&optional (theory *theory*)) (setf *augmented-feature-names* (append *feature-names* '(< <= = >= >))) (let* (bad-rules-and-antecedents bad-consequents good-consequents (answer (dolist (rule theory (list (reverse bad-rules-and-antecedents) (reverse bad-consequents))) (dolist (antecedent (antecedents rule)) (when (eq (first antecedent) 'not) (setf antecedent (second antecedent))) (let ((predicate (first antecedent))) (if (member predicate *augmented-feature-names*) (when (and (= (length antecedent) 2) (not (eq (domain predicate) 'linear))) (let ((value (second antecedent))) (unless (or (pcvar-p value) (member value (domain predicate))) (format t "~%The value for predicate ~A is not within the domain of its attribute in rule ~%~1T~A" predicate rule) (setf bad-rules-and-antecedents (cons (list rule antecedent) bad-rules-and-antecedents))))) (unless (find antecedent (remove rule theory) :key 'consequent :test 'equal) (format t "~%Antecedent ~A is not implied by any rule and is not a feature in rule ~%~A." antecedent rule) (setf bad-rules-and-antecedents (cons (list rule antecedent) bad-rules-and-antecedents)))))) (let ((consequent (consequent rule))) (or (member consequent good-consequents :test 'equal-or-av) (member consequent bad-consequents :test 'equal-or-av) (if (find-consequent-in-theory consequent theory) (setf good-consequents (cons consequent good-consequents)) (setf bad-consequents (cons consequent bad-consequents)))))))) (if (and (null (first answer)) (null (second answer))) (format t "~%Theory is O.K.") answer))) (defun find-consequent-in-theory (consequent theory) (if (or (member (first consequent) *categories*) (dolist (rule theory) (when (member consequent (antecedents rule) :test 'equal-or-av) (return t)))) consequent (let ((rules (find-rules consequent theory))) (format t "~%~%The rule~P ~{~%~1T~A ~} ~%~[is ~;are ~] not used by the theory." (length rules) rules (if (> (length rules) 1) 1 0))))) (defun equal-or-av (list1 list2) (or (equal list1 list2) (and (consp list1) (consp list2) (> (length list1) 1) (> (length list2) 1) (set-equal list1 list2 :test #'alphabetic-variant)))) (defun alphabetic-variant (literal1 literal2) (or (equal literal1 literal2) (and (consp literal1) (consp literal2) (eq (first literal1) (first literal2)) (pcvar-p (second literal1)) (pcvar-p (second literal2)) (if (and (third literal1)(third literal2)) (= (third literal1) (third literal2)) (and (null (third literal1)) (null (third literal2))))))) ;using ;fact that (third '(a b)) ;=> nil. ;;;;---------------------------------------------------------------------------- --------------------------------- ;;;; Functions for computing information about data ;;;;---------------------------------------------------------------------------- --------------------------------- (defun compute-domains (&optional (features *feature-names*) (examples *raw-examples*)) "Compute feature domains for alist examples" (mapcar #'(lambda (feature) (compute-domain feature examples)) features)) (defun compute-domain (feature &optional (examples *raw-examples*)) (let (domain) (if (alist-example-p (first examples)) (dolist (ex examples domain) (let ((pair (assoc feature (rest ex)))) (if pair (pushnew (second pair) domain)))) (dolist (ex examples domain) (pushnew (feature-value feature ex) domain))))) (defun compute-ranges (&optional (features *feature-names*) (examples *raw-examples*)) "For each fetaure in features with undefined range, compute it from the examples. Return list of (feature-name min max)" (setf examples (make-ordered-examples examples)) (let (ranges) (dolist (feature features (nreverse ranges)) (when (and (linear-feature-p feature) (null (get feature 'range))) (let ((min most-positive-fixnum)(max most-negative-fixnum) value) (dolist (example examples (push (list feature min max) ranges)) (setf value (feature-value feature example)) (unless (eq value *missing-value*) (when (< value min) (setf min value)) (when (> value max) (setf max value))))))))) (defun describe-theory (&optional (theory *theory*)) (let* ((groups (group-rules theory)) (num-rules (length theory)) (num-conses (length groups)) (num-antes (count-antecedents theory))) (format t "~%~%Number of rules: ~A" num-rules) (format t "~%Number of consequents: ~A" num-conses) (format t "~%Number of symbols: ~D" (+ num-antes num-rules)) (format t "~%Average number of disjuncts: ~,2F" (/ num-rules num-conses)) (format t "~%Average number of antecedents: ~,2F" (/ num-antes num-rules)))) (defun count-antecedents (rules) (let ((sum 0)) (dolist (rule rules sum) (incf sum (length (brule-antecedents rule)))))) (defun group-rules (rules) (let (alist) (dolist (rule rules alist) (let ((set (assoc (brule-consequent rule) alist :test #'equal))) (if set (nconc set (list rule)) (push (list (brule-consequent rule) rule) alist)))))) (defun describe-data (&optional count-missing (examples *raw-examples*)) (let* ((groups (group-examples examples)) (num-examples (length examples)) (num-categories (length groups)) (unrep-cats (set-difference *categories* (mapcar #'first groups))) (num-feature-values (reduce #'+ (mapcar #'(lambda (d) (if (atom d) 0 (length d))) *domains*))) (num-features (length *feature-names*)) (num-linear-features (count-if #'(lambda (d) (eq d 'linear)) *domains*)) (num-binary-features (count-if #'(lambda (d) (or (equal d '(true false)) (equal d '(false true)))) *domains*)) (num-nominal-features (- num-features num-linear-features num-binary-features))) (format t "~%Number of examples: ~A" num-examples) (format t "~%Number of features: ~A" num-features) (format t "~%Number of linear features: ~A (~,2F%)" num-linear-features (* 100 (/ num-linear-features num-features))) (format t "~%Number of binary features: ~A (~,2F%)" num-binary-features (* 100 (/ num-binary-features num-features))) (format t "~%Number of nominal features: ~A (~,2F%)" num-nominal-features (* 100 (/ num-nominal-features num-features))) (format t "~%Number of categories: ~A (~,2F% random guess)" num-categories (* 100 (/ 1 num-categories))) (format t "~%Unrepresented categories: ~A" unrep-cats) (format t "~%Number of examples per category:") (dolist (group (sort groups #'(lambda (g1 g2) (member (first g2) (member (first g1) *categories*))))) (let ((n (length (rest group)))) (format t "~% ~A: ~A (~,2F%)" (first group) n (* 100 (/ n num-examples))))) (format t "~%Average number of examples per category: ~,2F" (/ num-examples num-categories)) (unless (zerop (- num-features num-linear-features)) (format t "~%Average number of feature values: ~,2F" (/ num-feature-values (- num-features num-linear-features)))) (if count-missing (count-missing examples)))) (defun group-examples (examples) (let (alist) (dolist (example examples alist) (let ((set (assoc (first example) alist))) (if set (nconc set (list example)) (push (list (first example) example) alist)))))) (defun print-theory (&optional (theory *theory*)) (dolist (rule theory) (format t "~%~(~A~) <-~{ ~(~A~)~}" (brule-consequent rule) (brule-antecedents rule))) (format t "~%~%Observable features:~{ ~(~A~)~}" *feature-names*) (format t "~%~%Categories: ~{ ~(~A~)~}" *categories*)) (defun count-missing (&optional (examples *raw-examples*)) "Give info about missing values" (setf examples (make-ordered-examples examples)) (let ((value-count 0) (example-count 0) (num-examples (length examples)) (num-features (length (second (first examples))))) (dolist (example examples) (let ((count (count *missing-value* (second example)))) (unless (zerop count) (incf value-count count) (incf example-count)))) (format t "~%There are ~A missing values (~,2F%)" value-count (* 100 (/ value-count (* num-examples num-features)))) (format t "~%There are ~A examples with missing values (~,2F%)" example-count (* 100 (/ example-count num-examples))))) (defun delete-examples-with-missing (&optional (examples *raw-examples*)) "Delete examples with missing values for ordered examples" (delete-if #'(lambda (ex) (member *missing-value* (second ex))) (make-ordered-examples examples))) (defun remove-categories (categories &optional (examples *raw-examples*)) "Delete examples in these categories" (remove-if #'(lambda (ex) (member (first ex) categories)) examples)) ;;;;---------------------------------------------------------------------------- --------------------------------- ;;;; Functions for Testing Systems and Theory ;;;;---------------------------------------------------------------------------- --------------------------------- (defun test-system (system training-result test-examples &optional (print-results t)) (let ((test-function (append-symbols 'test- system)) (num-examples (length test-examples)) (num-correct 0) answer) (dolist (example test-examples) (setf answer (funcall test-function example training-result)) (when (eq answer (first example)) (incf num-correct)) (trace-print print-results "~%~AReal category: ~A; Classified as: ~A" (if (eq answer (first example)) " " "**") (first example) answer)) (format t "~%~%~A classified ~,2F% of the ~D test cases correctly." test-function (* 100 (/ num-correct num-examples)) num-examples))) (defvar *number-nils* 0) (defvar *number-overgeneral* 0) (defvar *number-exactly-right* 0) (defun test-theory (&optional (theory *theory*) (examples *raw-examples*) (print-results t)) "Test the accuracy of a theory on a set of examples" (setf *number-nils* 0) (setf *number-overgeneral* 0) (setf *number-exactly-right* 0) (let* ((num-exs (length examples)) (accuracy (if examples (let ((number-right 0)) (dolist (example examples (* 100.0 (/ number-right num-exs)) ) (setf number-right (+ number-right (test-theory-example example theory print-results))))) 100))) (format t "~%~%Theory classified ~,2F% of the ~D test cases correctly." accuracy num-exs) (format t "~%Number not classified in any category: ~D (~,2F%)" *number-nils* (* 100 (/ *number-nils* num-exs))) (format t "~%Number classified exactly right: ~D (~,2F%)" *number-exactly-right* (* 100 (/ *number-exactly-right* num-exs))) (format t "~%Number classified overly-general: ~D (~,2F%)" *number-overgeneral* (* 100 (/ *number-overgeneral* num-exs))) (format t "~%Number classified overly-specific: ~D (~,2F%)" (- num-exs *number-overgeneral* *number-exactly-right*) (* 100 (/ (- num-exs *number-overgeneral* *number-exactly-right*) num-exs))) )) (defun test-theory-example (example &optional (theory *theory*) (print-results t)) "Return probability that example is classified correctly by theory" (setf example (make-alist-example example)) (let ((provable-categories (provable-example-categories example theory *categories* print-results))) (cond ((null provable-categories) (incf *number-nils*) (if (member 'negative *categories*) (if (eq (first example) 'negative) 1 0) (/ 1 (length *categories*)))) ((member (first example) provable-categories) (if (rest provable-categories) (incf *number-overgeneral*) (incf *number-exactly-right*)) (/ 1 (length provable-categories))) (t 0)))) (defun provable-example-categories (example &optional (theory *theory*) (categories *categories*) (print-results t)) "Reutrn the list of categories example is provable in" (if (or (equal categories '(+ -)) (equal categories '(- +))) (if (prove theory example) '(+) '(-)) (let (proved-categories) (dolist (category categories) (if (prove theory example (list category)) (push category proved-categories))) (trace-print print-results "~%~AReal category: ~A; Proved categories: ~A" (if (or (and (eq (first example) (first proved-categories)) (null (rest proved-categories))) (and (eq (first example) 'negative) (null proved-categories))) " " "**") (first example) proved-categories) proved-categories))) ;;;;---------------------------------------------------------------------------- --------------------------------- ;;;; Functions for converting between different data formats ;;;;---------------------------------------------------------------------------- --------------------------------- (defun alist-example-p (example) "Return T if example is in alist form" (or (rest (rest example)) (null (rest example)) (let ((second-elt (second example))) (and (consp second-elt) (null (rest (rest second-elt))) (member (first second-elt) *feature-names*))))) (defun make-ordered-examples (examples &optional (array-flag 'prefer)) (cond ((alist-example-p (first examples)) (mapcar #'(lambda (ex) (convert-to-ordered-example ex array-flag)) examples)) ((and (null array-flag) (arrayp (second (first examples)))) (mapcar #'(lambda (ex) (list (first ex) (coerce (second ex) 'list))) examples)) ((and (eq array-flag t) (listp (second (first examples)))) (mapcar #'(lambda (ex) (list (first ex) (coerce (second ex) 'array))) examples)) (t examples))) (defun make-ordered-example (example &optional (array-flag 'prefer)) (cond ((alist-example-p example) (convert-to-ordered-example example array-flag)) ((and (null array-flag) (arrayp (second example))) (list (first example) (coerce (second example) 'list))) ((and (eq array-flag t) (listp (second example))) (list (first example) (coerce (second example) 'array))) (t example))) (defun convert-to-ordered-example (example &optional (array-flag t)) "Convert an example from alist form to ordered feature form (an array if flag set)" (let ((counter 0) output-example (array (if array-flag (make-array (length *domains*))))) (dolist (feature-name *feature-names* (list (first example) (if array-flag array (reverse output-example)))) (let* ((item (find feature-name (cdr example) :key #'car)) (value (if (and item (second item)) (second item) (let ((domain (elt *domains* counter))) (if (and (consp domain)(member 'true domain)) (if item 'true 'false) (if item (error "No value for ~A" item) *missing-value*)))))) (if array-flag (setf (aref array counter) value) (push value output-example)) (incf counter))))) (defun make-alist-examples (examples) (if (not (alist-example-p (first examples))) (mapcar #'convert-to-alist-example examples) examples)) (defun make-alist-example (example) (if (not (alist-example-p example)) (convert-to-alist-example example) example)) (defun convert-to-alist-example (example) (let (instance) (dotimes (i (length *feature-names*) (cons (first example) (reverse instance))) (let ((value (elt (second example) i))) (unless (or (eq value *missing-value*) (eq value 'false)) (push (if (eq value 'true) (list (elt *feature-names* i)) (list (elt *feature-names* i) value)) instance)))))) (defun convert-category-examples () (mapcan #'(lambda (cat) (mapcar #'(lambda (instance) (list cat instance)) (eval cat))) *categories*)) (defun make-deduce-rules (theory) (mapcar #'(lambda (rule) (cons '<- (cons (first rule) (rest (rest rule))))) theory)) (defun make-array-examples (examples) (mapcar #'(lambda (ex) (list (first ex) (let ((array (make-array (length *feature-names*)))) (dotimes (i (length *feature-names*) array) (setf (aref array i) (elt (second ex) i)))))) examples)) ;;;;---------------------------------------------------------------------------- --------------------------------- ;;;; Accessory functions for data ;;;;---------------------------------------------------------------------------- --------------------------------- (defun make-domains (levels-list) ;;; If features values are simply integers 0 to n then the number of values of a ;;; feature is sufficient for determining its domain. This function creates a ;;; list suitable for *domains* given a list of the number of values for each feature. ;;; See the file SOYBEAN-RDATA for a sample use. (mapcar #'(lambda (levels) (let ((domain nil)) (dotimes (i levels domain) (setf domain (nconc domain (list i)))))) levels-list)) (defun make-examples (pos-instances neg-instances) ;;; Converts lists of positive and negative instances into a list of examples ;;; suitable for ID3. (append (mapcar #'(lambda (instance) (list '+ instance)) pos-instances) (mapcar #'(lambda (instance) (list '- instance)) neg-instances))) (defun make-ranges (range-list) (dolist (range-element range-list) (setf (get (first range-element) 'range) (rest range-element)))) ;;;;---------------------------------------------------------------------------- --------------------------------- ;;;; Miscellaneous functions ;;;;---------------------------------------------------------------------------- --------------------------------- (defun maximum-label (count-alist &optional tie-breaker-list) "Returns the label in count-alist ((label . count) ...) with the maximum count. Break ties according to *tie-breaker*" (let (max-labels (max-count 0)) (dolist (count-cons count-alist) (cond ((> (cdr count-cons) max-count) (setf max-count (cdr count-cons)) (setf max-labels (list (car count-cons)))) ((= (cdr count-cons) max-count) (push (first count-cons) max-labels)))) (if (or (eq *tie-breaker* 'random) (null tie-breaker-list)) (pick-one max-labels) (dolist (item tie-breaker-list) (when (member item max-labels) (return item)))))) (defun pick-one (list) "Pick an item randomly from the list" (nth (random (length list)) list)) (defun read-file (file-name) (with-open-file (input file-name :direction :input) (read input nil nil))) (defun write-data-file (filename &optional (var-list '(*feature-names* *domains* *categories* *theory* *raw-examples*)) (pretty-print t)) (let ((*print-pretty* pretty-print)) (with-open-file (file filename :direction :output :if-exists :new-version) (format file ";;; -*- Mode:Common-Lisp; Package:USER -*-~%") (dolist (var var-list) (if (eq var '*raw-examples*) (progn (format file "~%(setf *raw-examples* '(") (dolist (ex (eval var)) (format file "~% ~A" ex)) (format file "~%))")) (format file "~%(setf ~A~% ~A)~%" var (list 'quote (eval var)))))))) (defun read-line-list (stream) (read-from-string (concatenate 'string "(" (read-line stream nil) ")"))) (defun square (x) (* x x)) (defun /-float (a b) "Division forcing a floating point output" (/ (coerce a 'single-float) (coerce b 'single-float))) (defun append-symbols (&rest symbols) (intern (format nil "~{~A~}" symbols))) (defun seconds-since (time) ;;; Return seconds elapsed since given time (initially set by get-internal-run-time) (/ (- (get-internal-run-time) time) internal-time-units-per-second))