; *********************************************************************
; DATAGEN.L
; Copyright (1989) Patrick W. Langley
; Common Lisp code for generating artificial data sets.
; This code supports both logical and probabilistic categories,
; and handles both one-level and structured concepts and instances.
; It currently handles only attribute-value representations.
; *********************************************************************

; *********************************************************************
; FUNCTIONS FOR INITIALIZING DISTRIBUTIONAL INFORMATION.
; *********************************************************************

; GENERATE-DIST transforms the probabilistic descriptions in the global
; variables CATEGORIES* and DEFAULT-ATTRIBUTES* into a form the function
; GENERATE-INSTANCE can use, storing this information in the global
; variables CATEGORY-LIST* and DEFAULT-VALUE-LIST*.  These contain
; information about the distributions of categories and values.
; It takes no arguments, prints a confirmation message, and returns NIL.

(defun generate-dist ()
  ; Discard any previously expanded categories before rebuilding.
  (setq category-list* nil)
  ; Expanding TOP-LEVEL expands every category reachable from it.
  (make-component 'top-level)
  (setq default-value-list* (mapcar #'make-default default-attributes*))
  (princ '|Distributions generated.|)
  nil)

; MAKE-DEFAULT generates a distribution for one of the irrelevant
; attributes stored on the global variable DEFAULT-ATTRIBUTES*.  AVLIST
; has the form (A V1 P1 V2 P2 ...); the result is A consed onto the
; expanded value list produced by MAKE-DIST.

(defun make-default (avlist)
  (cons (car avlist) (make-dist (cdr avlist))))

; GENERATE-CAT-DIST accepts a list of the form (C (A1 V11 P11 V12 P12 ...)
; (A2 V21 P21 ...) ... (An Vn1 Pn1 ...)), where C is a category name, Ai
; is an attribute, Vij is a possible value, and Pij is the probability of
; a given value.  It returns an expanded form of this description, where
; each value Vij occurs Pij*R times, where R is the resolution at which
; random numbers are generated (as specified by the global variable
; RESOLUTION*).
(defun generate-cat-dist (clist)
  ; CLIST is (NAME (ATTR V1 P1 ...) ...).  Returns (NAME (ATTR . DIST) ...),
  ; with the attribute entries in reverse order of appearance; they are
  ; looked up with ASSOC, so the order does not matter.
  (let ((name (car clist))
        (result nil))
    (dolist (avlist (cdr clist) (cons name result))
      (let ((attribute (car avlist))
            (vlist (cdr avlist)))
        (push (if (null vlist)
                  (list attribute)
                  (cons attribute (make-dist vlist)))
              result)))))

; MAKE-DIST expands a list of the form (V1 P1 V2 P2 ...) into a list in
; which each Vi occurs (ROUND (* Pi RESOLUTION*)) times.  A value that
; names a category in CATEGORIES* is replaced by its expanded description
; (see MAKE-COMPONENT).  Later values end up nearer the front of the list.

(defun make-dist (vlist)
  (let ((result nil))
    (loop while vlist
          do (let* ((value (make-component (pop vlist)))
                    (score (pop vlist)))
               ; MAKE-N conses the copies directly onto RESULT.
               (setq result
                     (make-n value (round (* score resolution*)) result))))
    result))

; MAKE-COMPONENT returns VALUE itself if it does not name a category in
; CATEGORIES*.  Otherwise it returns the expanded description of that
; category, generating it and caching it on CATEGORY-LIST* the first
; time the category is encountered.

(defun make-component (value)
  (let ((dist (assoc value categories*)))
    (cond ((null dist) value)
          ; Already expanded: reuse the cached entry.
          ((assoc value category-list*))
          (t (let ((expanded (generate-cat-dist dist)))
               (push expanded category-list*)
               expanded)))))

; MAKE-N returns a list consisting of N copies of X followed by the
; list R.  E.g., the top-level call (MAKE-N 'A 4 NIL) would give
; '(A A A A) as a result.  If N is zero or negative, R is returned
; unchanged.

(defun make-n (x n r)
  (dotimes (i n r)
    (declare (ignorable i))
    (push x r)))

; *********************************************************************
; FUNCTIONS FOR GENERATING INSTANCES
; *********************************************************************

; GENERATE-INSTANCE is the top-level function for generating an instance.
; It should ONLY be called AFTER invoking GENERATE-DIST.  The current
; function assumes an expanded form of the probability distributions are
; available for use in random selection.
(defun generate-instance ()
  ; GENERATE-COMPONENTS returns (TOP-LEVEL <object>); strip the wrapper.
  (cadr (generate-components (assoc 'top-level category-list*))))

; GENERATE-COMPONENTS takes an expanded category description of the form
; (CATEGORY (ATTR . DIST) ...) and returns an instance of the form
; (CATEGORY V1 V2 ...), with one value per attribute listed for CATEGORY
; in ATTRIBUTE-TEMPLATE*, in template order.

(defun generate-components (avlist)
  (let* ((category (car avlist))
         (bindings (cdr avlist))
         (instance (list category)))
    (dolist (attribute (cdr (assoc category attribute-template*))
                       (nreverse instance))
      (push (select-attribute attribute (cdr (assoc attribute bindings)))
            instance))))

; SELECT-ATTRIBUTE picks a value from VLIST, falling back on the default
; distribution for ATTRIBUTE when the category specifies none.

(defun select-attribute (attribute vlist)
  (cond ((null vlist) (select-default attribute))
        (t (select-random vlist))))

; SELECT-RANDOM picks one element of the expanded distribution VLIST
; uniformly at random.  An atomic element is returned as is; a
; non-atomic element is an expanded category description, for which an
; instance is generated recursively.  Returns NIL for an empty VLIST.

(defun select-random (vlist)
  (if (null vlist)
      nil
      ; Index by the actual list length rather than RESOLUTION*: rounding
      ; can leave the list shorter than RESOLUTION*, and indexing past
      ; the end would yield spurious NIL values.
      (let ((value (nth (random (length vlist)) vlist)))
        (if (atom value)
            value
            (generate-components value)))))

; SELECT-DEFAULT picks a value for ATTRIBUTE from its entry in
; DEFAULT-VALUE-LIST*.

(defun select-default (attribute)
  (select-random (cdr (assoc attribute default-value-list*))))

; *********************************************************************
; SAMPLE INPUTS, GIVEN AS GLOBAL VARIABLES
; *********************************************************************

; The variable RESOLUTION* specifies the degree to which the random
; selection mechanism approximates the given probabilities.  E.g., if
; the probability of category X is 0.543, then setting RESOLUTION* to
; 10 would generate X with 0.5 probability, setting it to 100 would
; produce it with 0.54 probability, and setting it to 1000 would give
; a 0.543 probability of occurrence.  The first setting would store 10
; values for each attribute or role, the second would store 100, and
; the last would store 1000 values, making it somewhat space intensive.

(setq resolution* 10)

; ATTRIBUTE-TEMPLATE* specifies the order in which attribute values
; should occur for instances of each category.  E.g., the entry (X ROLE1
; ROLE2) states that the object in ROLE1 should occur first in all
; instances of X, followed by the object in ROLE2.
; The entry (A TEXTURE SIZE COLOR SHAPE) states that, for instances of
; category A, the texture should be followed by the size, then by color,
; and finally by shape.  Thus, one instance might be
; (A SHINY LARGE RED CIRCLE).
; Note that the category name appears as the CAR of the instance; this
; is included for use by the programmer, not by a learning system.

(setq attribute-template*
      '((top-level object)
        (X role1 role2)
        (Y role1 role2)
        (A texture size color shape)
        (B texture size color shape)
        (C texture size color shape)))

; CATEGORIES* specifies the probabilistic description for each category,
; along with the probability of its occurrence in a given context.  Each
; entry in this list has the form (C (A1 V11 P11 V12 P12 ...) (A2 V21
; P21 ...) ... (An Vn1 Pn1 ...)), where C is a category name, Ai is an
; attribute or role, Vij is a possible value or component category, and
; Pij is the probability of a given value or component.  This list should
; ALWAYS contain a pseudo-category named TOP-LEVEL, which has a single
; role/attribute named OBJECT.  The values of this object specify the
; possible top-level categories and their probabilities.  The symbols
; TOP-LEVEL and OBJECT never appear in actual instance descriptions,
; but they must be included in CATEGORIES*.

; Here is an example involving structured, probabilistic concepts.

(setq categories*
      '((top-level (object X 0.7 Y 0.3))
        (X (role1 A 1.0 B 0.0 C 0.0)
           (role2 A 0.0 B 0.5 C 0.5))
        (Y (role1 A 0.0 B 1.0 C 0.0)
           (role2 A 0.5 B 0.0 C 0.5))
        (A (size large 1.0 small 0.0)
           (color blue 0.0 red 0.5 green 0.5))
        (B (size large 0.0 small 1.0)
           (color blue 1.0 red 0.0 green 0.0))
        (C (size large 1.0 small 0.0)
           (color blue 1.0 red 0.0 green 0.0))))

; Here is another example involving one-level, logical concepts.
; (setq categories*
;       '((top-level (object A 0.5 B 0.5))
;         (A (size large 0.0 small 1.0)
;            (color blue 1.0 red 0.0 green 0.0))
;         (B (size large 1.0 small 0.0)
;            (color blue 0.0 red 1.0 green 0.0))))

; The global variable DEFAULT-ATTRIBUTES* specifies the probability
; distributions for "irrelevant" attributes and roles; i.e., ones that
; occur in every instance, but do not differ across categories.  Each
; entry has the form (A V1 P1 V2 P2 ...).

(setq default-attributes*
      '((shape circle 0.7 square 0.3)
        (texture furry 0.5 shiny 0.5)))