;;;; UNIMEM.LISP - Lebowitz's incremental conceptual clustering program. ;;;; (See Lebowitz's ML2 article and MLJ 2:2 article for further details.) ;;;; This system incrementally processes unclassified examples, ;;;; dynamically modifying a hierarchy of concepts as it does so. ;;;; First, the system determines all of the places in the current ;;;; concept hierarchy that a new instance belongs. (There may be ;;;; more than one.) This is done "pragmatically" - matches do not ;;;; have to be exact. Once the locations for a new instance are determined, ;;;; UNIMEM determines if the new instance is similar enough to any instances ;;;; previously stored. If so, a new node in the concept hierarchy is formed, ;;;; using commonalities between the matching instances to define the new concept. ;;;; Statistics are kept on the features that describe a concept. Unreliable features ;;;; are discarded, and if enough features of a concept are discarded, the concept ;;;; itself is also discarded (along with all of the instances stored there). ;;;; (Discarding of features and concepts is not implemented and is left as a homework exercise.) ;;;; Copyright (C) 1988 by Jude William Shavlik and Raymond Joseph Mooney ;;;; This program may be freely copied, used, or ;;;; modified provided that this copyright notice ;;;; is included in each copy of this code and parts thereof. ;;;; The following properties of concepts are maintained: ;;; features - a conjuction description of the concept (feature/value pairs) ;;; sub-concepts - concepts directly below this one in the concept hierarchy ;;; parent - the concept directly above ;;; instances - instances located at this concept ;;; feature-confidences - confidence values for the features of this concept ;;; features-discarded - features of this concept deemed irrelevant ;;; frozen-features - features deemed definitely significant ;;;; Also, features have the property "type" (one of nominal, linear, or structured). ;;;; If a feature does not have a type, it is assumed to be nominal. ;;;; Currently, the system only handles nominal and linear features. ;;;; Sample data for the system can be found in the file DISCOVERY-DATA. ;;;; This file also describes the expected representation of instances. ;;;; A strength of UNIMEM is that is handles inconsistent and incorrect data. ;;;; All of the feature values of an instance do not have to be specified. ;;;; -------------------------------------------------------------------------------------- ;;;; Global Variables ;;;; -------------------------------------------------------------------------------------- (defvar *root* nil "The root of the concept hierarchy.") (defvar *trace-UNIMEM?* t "Report on the progress of the program?") (defvar *concept-match-threshold* 0.8 "Min partial match score for category membership.") (defvar *acceptable-match-threshold* 0.75 "Min partial match score between two features.") (defvar *new-concept-threshold* 0.50 "Min match between two ex's if new concept formed.") (defvar *missing-feature-score* 0.10 "Partial match score for absent feature.") (defvar *features-needed-to-gen* 1 "Min common features if new concept is to be created.") (defvar *delete-confidence-level* -1 "Confidence level at which a feature is deleted.") (defvar *freeze-confidence-level* 5 "Confidence level at which a feature is permanent.") (defvar *keep-concept-threshold* 0.33 "Frac of initial features remaining to keep concept.") ;;;; -------------------------------------------------------------------------------------- ;;;; The Main Functions ;;;; -------------------------------------------------------------------------------------- (defun unimem (example) ;;; Insert this example into memory, altering the structure of memory as necessary. (format t "~%Considering ~A, which has features~% " example) (print-list-contents (eval example) t t) (dolist (answer (unimem-search *root* (eval example))) (update (first answer) example (second answer)))) (defun run-unimem (examples &optional (randomize t)) ;;; Run a collection of examples through the UNIMEM program. ;;; Randomize example order if flag is T (format t "~%~%Initiating UNIMEM ... ~%~%") (gensym 0) (setf *root* (gensym "Concept-")) (initialize-unimem-concept *root*) (dolist (example (if randomize (permute examples) examples)) (unimem example)) (format t "~%Finished processing the provided examples.~%~%The final memory structure:~%~%") (report-memory *root*) (format t "~%~%") *root*) (defun unimem-search (concept unexplained-features) ;;; Determine where in the current tree this concept belongs. ;;; For each most specific concepts found, return a list of ( ) (evaluate-concept-features concept unexplained-features) ; Update feature scores. (unless (remove-concept? concept) ; Check for deleted concept (if (concept-member? concept unexplained-features) ; Close enough match? (let ((remaining-unexplained-features (collect-unexplained-features concept unexplained-features))) (or (mapcan #'(lambda (sub-concept) (unimem-search sub-concept remaining-unexplained-features)) (get concept 'sub-concepts)) ; See if concept also fits lower in tree. (list (list concept remaining-unexplained-features))))))) ; If not, here's the place. (defun update (concept new-instance new-features &aux inserted?) ;;; Insert this instance into memory. Currently it fits here. However, ;;; see if any new concepts should be created. (when *trace-UNIMEM?* (format t " ~A matches the concept defined by the features~% " new-instance) (print-list-contents (get-all-features concept) t t) (format t " Other instances stored here:") (print-list-contents (get concept 'instances) t)) (dolist (old-instance (get concept 'instances)) ; Look at all of the instances already stored here. (if (close-enough-to-build-new-concept? new-instance old-instance) (setf inserted? (or inserted? ; Record if the new instance has been inserted. (build-new-concept concept ; See if a new concept possible. old-instance new-instance new-features))))) (unless inserted? (push new-instance (get concept 'instances)) (if *trace-UNIMEM?* (format t " Inserting ~A into this concept.~%" new-instance)))) (defun build-new-concept (parent old-instance new-instance features-to-consider) ;;; Build a new concept under this parent concept. ;;; Put these two instance in it. The features of the new node are those ;;; features in the feature list that have "close-enough" values in the two instances. ;;; Make sure there are enough features for the the new concept. ;;; (The features to consider are a subset of new-instance's features.) (let* ((old-instance-features (eval old-instance)) (common-features (remove-if-not #'(lambda (feature) (>= (feature-match-score feature old-instance-features) *acceptable-match-threshold*)) features-to-consider))) (if (>= (length common-features) *features-needed-to-gen*) ; Enough common features? (let ((new-concept (gensym "Concept-"))) (initialize-unimem-concept new-concept) (setf (get parent 'instances) (remove old-instance (get parent 'instances))) (push new-concept (get parent 'sub-concepts)) (setf (get new-concept 'parent) parent) (setf (get new-concept 'instances) (list old-instance new-instance)) (setf (get new-concept 'features) (combine-feature-values common-features old-instance-features)) (setf (get new-concept 'feature-confidences) (mapcar #'(lambda (feature-value-pair) ; Count the two "creating" instances. (list (first feature-value-pair) 2)) common-features)) (when *trace-UNIMEM?* (format t " New concept formed from ~A and ~A.~%" new-instance old-instance) (format t " Its features are~% ") (print-list-contents (get new-concept 'features) t t)) new-concept)))) ; Indicate a new concept was formed. (defun evaluate-concept-features (concept unexplained-features) ;;; Keep statistics on the value of the features of this concept. ;;; Discard low-scoring features and "freeze" high scoring ones. ;;; If *trace-UNIMEM?* is set, report deletions and freezings. nil) ; TO BE WRITTEN, USING THE RELEVANT GLOBAL VARIABLES DEFINED ABOVE. (defun remove-concept? (concept) ;;; If a concept loses too many of its features, discard the concept. ;;; Report deletion if *trace-UNIMEM?* is set. ;;; TO BE WRITTEN USING THE RELEVANT GLOBAL VARIABLES DEFINED ABOVE. nil) (defun collect-unexplained-features (concept instance-features &aux (concept-features (get concept 'features))) ;;; Collect all of the instance features that do NOT match a feature of this concept. (remove-if #'(lambda (instance-feature) (>= (feature-match-score instance-feature concept-features) *acceptable-match-threshold*)) instance-features)) (defun close-enough-to-build-new-concept? (instance1 instance2 &aux (features1 (eval instance1)) (features2 (eval instance2))) ;;; Determine if these instances are close enough to merit creating a new concept. (>= (min (concept-match-score features1 features2) ; The match is unsymmetrical when the two (concept-match-score features2 features1)) ; instances have different features *new-concept-threshold*)) ; (some may be missing). (defun concept-match-score (concept-features instance-features &aux (number-of-features (length concept-features))) ;;; Calculate a partial match of this instance to this concept. ;;; This is done by summing the scores for matching each feature of the concept ;;; and then dividing by the maximum possible score (to normalize). (if (> number-of-features 0) (/ (reduce #'+ (mapcar #'(lambda (concept-feature) (feature-match-score concept-feature instance-features)) concept-features)) number-of-features) 1)) (defun feature-match-score (feature possible-matches &aux (match-feature (assoc (first feature) possible-matches))) ;;; Return the scoring match of this feature when compared to the possible matches. (if match-feature (score-two-features feature match-feature) *missing-feature-score*)) (defun score-two-features (feature1 feature2) ;;; Score the match between these two feature/value pairs. ;;; Return a result in [0..1]. (if (eql (second feature1) (second feature2)) 1 ; Perfect match. (case (get (first feature1) 'type) (nominal 0) ; No partial score here. (linear (partially-score-linear-feature feature1 feature2)) ; Allow partial scoring. (structured (partially-score-structured-feature feature1 feature2)) ; Allow partial scoring. (otherwise 0)))) ; Nominal is the default. (defun partially-score-linear-feature (feature1 feature2) ;;; Return a number between 0 and 1 indicating the degree to which ;;; two linear features match. Calculated as the ratio of the distance ;;; between to values over the maximum distance possible based on the ;;; min and max values stored on the linear-domain prop of the feature (let ((domain (get (first feature1) 'linear-domain))) (- 1 (/ (abs (- (second feature1)(second feature2))) (- (second domain) (first domain)))))) (defun partially-score-structured-feature (feature1 feature2) ;;; Return a number between 0 and 1 indicating the degree to which ;;; two structured feature values match (error "TO BE WRITTEN.")) (defun combine-feature-values (features1 features2) ;;; For the feature/value pairs in features1, merge ;;; the values with the corresponding values in features2. (mapcar #'(lambda (feature1 &aux (feature2 (assoc (first feature1) features2))) (if (equal feature1 feature2) ; See if an exact match. feature1 (case (get (first feature1) 'type) (nominal (error "Shouldnt occur.")) ; Shouldnt have matched partially. (linear (list (first feature1) (/ (+ (second feature1)(second feature2)) 2))) ; Combine two linear values by taking average (structured (error "TO BE WRITTEN.")) ; Combine two structured values. (otherwise (error "Shouldnt occur."))))) ; Nominal is the default. features1)) (defun concept-member? (concept instance-features) ;;; Determine if these instance features indicate that ;;; the instance is a member of this concept. (>= (concept-match-score (get concept 'features) instance-features) *concept-match-threshold*)) ;;;; -------------------------------------------------------------------------------------- ;;;; UTILITY FUNCTIONS ;;;; -------------------------------------------------------------------------------------- (defun initialize-unimem-concept (concept) ;;; Initialize this concept. (setf (get concept 'features) nil) ; A list of feature/value pairs. (setf (get concept 'features-discarded) nil) ; A list of feature names. (setf (get concept 'frozen-features) nil) ; A list of feature names. (setf (get concept 'feature-confidences) nil) ; A list ( ... (feature confidence) ... ) (setf (get concept 'instances) nil) ; A list of examples names. (setf (get concept 'parent) nil) ; The name of the parent concept. (setf (get concept 'sub-concepts) nil)) ; A list of sub-concepts. (defun hierarchy-instances (&optional (concept *root*)) ;;; Construct a nested list illustrating the instances stored in the ;;; entire memory under concept. (append (get concept 'instances) (mapcar #'(lambda (sub-concept) (hierarchy-instances sub-concept)) (get concept 'sub-concepts)))) (defun report-memory (concept &optional (indent 0) &aux (instances (get concept 'instances))) ;;; Report the contents of memory. (format t "~vT~A " indent concept) (if instances (progn (format t "has instances") (print-list-contents instances t)) (format t "has no associated instances.~%")) (report-concept-feature-status concept indent) (mapc #'(lambda (sub-concept) (report-memory sub-concept (+ 3 indent))) (reverse (get concept 'sub-concepts)))) (defun report-concept-feature-status (concept indent) ;;; Report the status of the features of this concept. ;;; Describe confidence values and report discarded features. ;;; (Frozen features are considered to have infinite confidence ;;; values, represented by ^'s.) (let ((active (get concept 'features)) (discarded (get concept 'features-discarded)) (frozen (get concept 'frozen-features)) (confidences (get concept 'feature-confidences))) (format t "~vT " indent) (format t "Its features [and confidences] are~%") (format t "~vT" indent) (if active (mapc #'(lambda (pair) (let ((name (first pair)) (value (second pair))) (format t " ~A=~A[~A]" name value (if (member name frozen) '^ (second (assoc name confidences)))))) active) (format t " ")) (when discarded (format t "~%~vT Discarded features:" indent) (print-list-contents discarded)) (format t "~%~%"))) (defun print-list-contents (list &optional linefeed? feature-value-pairs?) ;;; Print this list without the encapsulating parantheses. (if list (if feature-value-pairs? ; Use the notation feature=value (dolist (pair list) (format t " ~A=~A" (first pair) (second pair))) (dolist (item list) (format t " ~A" item))) (format t " ")) (if linefeed? (format t "~%"))) (defun get-all-features (concept) ;;; Collect all of the features of this concept, removing overridden ones. (remove-if-overridden (collect-every-feature concept))) (defun collect-every-feature (concept) ;;; Collect all the features of this concept, including inherited ones. ;;; Later features in the list should override earlier ones. (append (if (get concept 'parent) (collect-every-feature (get concept 'parent))) (get concept 'features))) (defun remove-if-overridden (features) ;;; Remove all of those features in this list that have another value later down in the list. (cond ((null features) nil) ((assoc (first (first features)) (rest features)) (remove-if-overridden (rest features))) (t (cons (first features) (remove-if-overridden (rest features)))))) (defun permute (list) "Randomize the order of the elements in this list." (mapcar #'(lambda (pair) (rest pair)) (sort (mapcar #'(lambda (item) (cons (random 1000) item)) list) #'(lambda (a b) (> (first a) (first b))))))