// JRip.java — Weka source distribution (weka.classifiers.rules.JRip)

/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 *    JRip.java
 *    Copyright (C) 2001 University of Waikato, Hamilton, New Zealand
 */

package weka.classifiers.rules;

import weka.classifiers.Classifier;
import weka.core.AdditionalMeasureProducer;
import weka.core.Attribute;
import weka.core.Capabilities;
import weka.core.Copyable;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.RevisionHandler;
import weka.core.RevisionUtils;
import weka.core.TechnicalInformation;
import weka.core.TechnicalInformationHandler;
import weka.core.Utils;
import weka.core.WeightedInstancesHandler;
import weka.core.Capabilities.Capability;
import weka.core.TechnicalInformation.Field;
import weka.core.TechnicalInformation.Type;
import weka.filters.Filter;
import weka.filters.supervised.attribute.ClassOrder;

import java.io.Serializable;
import java.util.Enumeration;
import java.util.Random;
import java.util.Vector;

/**
 <!-- globalinfo-start -->
 * This class implements a propositional rule learner, Repeated Incremental Pruning to Produce Error Reduction (RIPPER), which was proposed by William W. Cohen as an optimized version of IREP. <br/>
 * <br/>
 * The algorithm is briefly described as follows: <br/>
 * <br/>
 * Initialize RS = {}, and for each class from the less prevalent one to the more frequent one, DO: <br/>
 * <br/>
 * 1. Building stage:<br/>
 * Repeat 1.1 and 1.2 until the description length (DL) of the ruleset and examples is 64 bits greater than the smallest DL met so far, or there are no positive examples, or the error rate &gt;= 50%. <br/>
 * <br/>
 * 1.1. Grow phase:<br/>
 * Grow one rule by greedily adding antecedents (or conditions) to the rule until the rule is perfect (i.e. 100% accurate).  The procedure tries every possible value of each attribute and selects the condition with highest information gain: p(log(p/t)-log(P/T)).<br/>
 * <br/>
 * 1.2. Prune phase:<br/>
 * Incrementally prune each rule and allow the pruning of any final sequences of the antecedents;The pruning metric is (p-n)/(p+n) -- but it's actually 2p/(p+n) -1, so in this implementation we simply use p/(p+n) (actually (p+1)/(p+n+2), thus if p+n is 0, it's 0.5).<br/>
 * <br/>
 * 2. Optimization stage:<br/>
 *  after generating the initial ruleset {Ri}, generate and prune two variants of each rule Ri from randomized data using procedure 1.1 and 1.2. But one variant is generated from an empty rule while the other is generated by greedily adding antecedents to the original rule. Moreover, the pruning metric used here is (TP+TN)/(P+N).Then the smallest possible DL for each variant and the original rule is computed.  The variant with the minimal DL is selected as the final representative of Ri in the ruleset.After all the rules in {Ri} have been examined and if there are still residual positives, more rules are generated based on the residual positives using Building Stage again. <br/>
 * 3. Delete the rules from the ruleset that would increase the DL of the whole ruleset if it were in it. and add resultant ruleset to RS. <br/>
 * ENDDO<br/>
 * <br/>
 * Note that there seem to be 2 bugs in the original ripper program that would affect the ruleset size and accuracy slightly.  This implementation avoids these bugs and thus is a little bit different from Cohen's original implementation. Even after fixing the bugs, since the order of classes with the same frequency is not defined in ripper, there still seems to be some trivial difference between this implementation and the original ripper, especially for audiology data in UCI repository, where there are lots of classes of few instances.<br/>
 * <br/>
 * Details please see:<br/>
 * <br/>
 * William W. Cohen: Fast Effective Rule Induction. In: Twelfth International Conference on Machine Learning, 115-123, 1995.<br/>
 * <br/>
 * PS.  We have compared this implementation with the original ripper implementation in aspects of accuracy, ruleset size and running time on both artificial data "ab+bcd+defg" and UCI datasets.  In all these aspects it seems to be quite comparable to the original ripper implementation.  However, we didn't consider memory consumption optimization in this implementation.<br/>
 * <br/>
 * <p/>
 <!-- globalinfo-end -->
 *
 <!-- technical-bibtex-start -->
 * BibTeX:
 * <pre>
 * &#64;inproceedings{Cohen1995,
 *    author = {William W. Cohen},
 *    booktitle = {Twelfth International Conference on Machine Learning},
 *    pages = {115-123},
 *    publisher = {Morgan Kaufmann},
 *    title = {Fast Effective Rule Induction},
 *    year = {1995}
 * }
 * </pre>
 * <p/>
 <!-- technical-bibtex-end -->
 *
 <!-- options-start -->
 * Valid options are: <p/>
 * 
 * <pre> -F &lt;number of folds&gt;
 *  Set number of folds for REP
 *  One fold is used as pruning set.
 *  (default 3)</pre>
 * 
 * <pre> -N &lt;min. weights&gt;
 *  Set the minimal weights of instances
 *  within a split.
 *  (default 2.0)</pre>
 * 
 * <pre> -O &lt;number of runs&gt;
 *  Set the number of runs of
 *  optimizations. (Default: 2)</pre>
 * 
 * <pre> -D
 *  Set whether turn on the
 *  debug mode (Default: false)</pre>
 * 
 * <pre> -S &lt;seed&gt;
 *  The seed of randomization
 *  (Default: 1)</pre>
 * 
 * <pre> -E
 *  Whether NOT check the error rate&gt;=0.5
 *  in stopping criteria  (default: check)</pre>
 * 
 * <pre> -P
 *  Whether NOT use pruning
 *  (default: use pruning)</pre>
 * 
 <!-- options-end -->
 *
 * @author Xin Xu (xx5@cs.waikato.ac.nz)
 * @author Eibe Frank (eibe@cs.waikato.ac.nz)
 * @version $Revision: 1.22 $
 */
00137 public class JRip 
  extends Classifier 
  implements AdditionalMeasureProducer, 
           WeightedInstancesHandler,
           TechnicalInformationHandler {    

  /** for serialization */
00144   static final long serialVersionUID = -6589312996832147161L;
  
  /** The limit of description length surplus in ruleset generation */
00147   private static double MAX_DL_SURPLUS = 64.0;
    
  /** The class attribute of the data*/
00150   private Attribute m_Class; 
    
  /** The ruleset */
00153   private FastVector m_Ruleset;
  
  /** The predicted class distribution */
00156   private FastVector m_Distributions;
  
  /** Runs of optimizations */
00159   private int m_Optimizations = 2;
    
  /** Random object used in this class */
00162   private Random m_Random = null;
    
  /** # of all the possible conditions in a rule */
00165   private double m_Total = 0;

  /** The seed to perform randomization */
00168   private long m_Seed = 1;

  /** The number of folds to split data into Grow and Prune for IREP */
00171   private int m_Folds = 3;
    
  /** The minimal number of instance weights within a split*/
00174   private double m_MinNo = 2.0;

  /** Whether in a debug mode */
00177   private boolean m_Debug = false;

  /** Whether check the error rate >= 0.5 in stopping criteria */
00180   private boolean m_CheckErr = true;

  /** Whether use pruning, i.e. the data is clean or not */
00183   private boolean m_UsePruning = true;

  /** The filter used to randomize the class order */
00186   private Filter m_Filter = null;

  /** The RuleStats for the ruleset of each class value */
00189   private FastVector m_RulesetStats;
    
  /**
   * Returns a string describing classifier
   * @return a description suitable for
   * displaying in the explorer/experimenter gui
   */
00196   public String globalInfo() {

    return "This class implements a propositional rule learner, Repeated Incremental "
      + "Pruning to Produce Error Reduction (RIPPER), which was proposed by William "
      + "W. Cohen as an optimized version of IREP. \n\n"
      + "The algorithm is briefly described as follows: \n\n"
      + "Initialize RS = {}, and for each class from the less prevalent one to "
      + "the more frequent one, DO: \n\n"
      + "1. Building stage:\nRepeat 1.1 and 1.2 until the descrition length (DL) "
      + "of the ruleset and examples is 64 bits greater than the smallest DL "
      + "met so far, or there are no positive examples, or the error rate >= 50%. "
      + "\n\n"
      + "1.1. Grow phase:\n"
      + "Grow one rule by greedily adding antecedents (or conditions) to "
      + "the rule until the rule is perfect (i.e. 100% accurate).  The "
      + "procedure tries every possible value of each attribute and selects "
      + "the condition with highest information gain: p(log(p/t)-log(P/T))."
      + "\n\n"
      + "1.2. Prune phase:\n"
      + "Incrementally prune each rule and allow the pruning of any "
      + "final sequences of the antecedents;"
      + "The pruning metric is (p-n)/(p+n) -- but it's actually "
      + "2p/(p+n) -1, so in this implementation we simply use p/(p+n) "
      + "(actually (p+1)/(p+n+2), thus if p+n is 0, it's 0.5).\n\n"
      + "2. Optimization stage:\n after generating the initial ruleset {Ri}, "
      + "generate and prune two variants of each rule Ri from randomized data "
      + "using procedure 1.1 and 1.2. But one variant is generated from an "
      + "empty rule while the other is generated by greedily adding antecedents "
      + "to the original rule. Moreover, the pruning metric used here is "
      + "(TP+TN)/(P+N)."
      + "Then the smallest possible DL for each variant and the original rule "
      + "is computed.  The variant with the minimal DL is selected as the final "
      + "representative of Ri in the ruleset."
      + "After all the rules in {Ri} have been examined and if there are still "
      + "residual positives, more rules are generated based on the residual "
      + "positives using Building Stage again. \n"
      + "3. Delete the rules from the ruleset that would increase the DL of the "
      + "whole ruleset if it were in it. and add resultant ruleset to RS. \n"
      + "ENDDO\n\n"
      + "Note that there seem to be 2 bugs in the original ripper program that would "
      + "affect the ruleset size and accuracy slightly.  This implementation avoids "
      + "these bugs and thus is a little bit different from Cohen's original "
      + "implementation. Even after fixing the bugs, since the order of classes with "
      + "the same frequency is not defined in ripper, there still seems to be "
      + "some trivial difference between this implementation and the original ripper, "
      + "especially for audiology data in UCI repository, where there are lots of "
      + "classes of few instances.\n\n"
      + "Details please see:\n\n"
      + getTechnicalInformation().toString() + "\n\n"
      + "PS.  We have compared this implementation with the original ripper "
      + "implementation in aspects of accuracy, ruleset size and running time "
      + "on both artificial data \"ab+bcd+defg\" and UCI datasets.  In all these "
      + "aspects it seems to be quite comparable to the original ripper "
      + "implementation.  However, we didn't consider memory consumption "
      + "optimization in this implementation.\n\n";    
  }

  /**
   * Returns an instance of a TechnicalInformation object, containing 
   * detailed information about the technical background of this class,
   * e.g., paper reference or book this class is based on.
   * 
   * @return the technical information about this class
   */
00260   public TechnicalInformation getTechnicalInformation() {
    TechnicalInformation      result;
    
    result = new TechnicalInformation(Type.INPROCEEDINGS);
    result.setValue(Field.AUTHOR, "William W. Cohen");
    result.setValue(Field.TITLE, "Fast Effective Rule Induction");
    result.setValue(Field.BOOKTITLE, "Twelfth International Conference on Machine Learning");
    result.setValue(Field.YEAR, "1995");
    result.setValue(Field.PAGES, "115-123");
    result.setValue(Field.PUBLISHER, "Morgan Kaufmann");
    
    return result;
  }
    
  /**
   * Returns an enumeration describing the available options
   * Valid options are: <p>
   *  
   * -F number <br>
   * The number of folds for reduced error pruning. One fold is
   * used as the pruning set. (Default: 3) <p>
   * 
   * -N number <br>
   * The minimal weights of instances within a split.
   * (Default: 2) <p>
   *    
   * -O number <br>
   * Set the number of runs of optimizations. (Default: 2)<p>
   *
   * -D <br>
   * Whether turn on the debug mode
   *
   * -S number <br>
   * The seed of randomization used in Ripper.(Default: 1)<p>
   *
   * -E <br>
   * Whether NOT check the error rate >= 0.5 in stopping criteria.
   * (default: check)<p> 
   *
   * -P <br>
   * Whether NOT use pruning. (default: use pruning)<p>
   *
   * @return an enumeration of all the available options
   */
00304   public Enumeration listOptions() {
    Vector newVector = new Vector(3);
    newVector.addElement(new Option("\tSet number of folds for REP\n" +
                            "\tOne fold is used as pruning set.\n" +
                            "\t(default 3)","F", 1, "-F <number of folds>"));
    newVector.addElement(new Option("\tSet the minimal weights of instances\n" +
                            "\twithin a split.\n" +
                            "\t(default 2.0)","N", 1, "-N <min. weights>"));         
    newVector.addElement(new Option("\tSet the number of runs of\n"+
                            "\toptimizations. (Default: 2)", "O",
                            1,"-O <number of runs>"));
      
    newVector.addElement(new Option("\tSet whether turn on the\n"+
                            "\tdebug mode (Default: false)", "D",
                            0,"-D"));
      
    newVector.addElement(new Option("\tThe seed of randomization\n"+
                            "\t(Default: 1)", "S",
                            1,"-S <seed>"));
      
    newVector.addElement(new Option("\tWhether NOT check the error rate>=0.5\n"
                            +"\tin stopping criteria "
                            +"\t(default: check)", "E", 
                            0, "-E")); 
      
    newVector.addElement(new Option("\tWhether NOT use pruning\n"
                            +"\t(default: use pruning)", "P", 
                            0, "-P")); 
    return newVector.elements();
  }
    
  /**
   * Parses a given list of options. <p/>
   * 
   <!-- options-start -->
   * Valid options are: <p/>
   * 
   * <pre> -F &lt;number of folds&gt;
   *  Set number of folds for REP
   *  One fold is used as pruning set.
   *  (default 3)</pre>
   * 
   * <pre> -N &lt;min. weights&gt;
   *  Set the minimal weights of instances
   *  within a split.
   *  (default 2.0)</pre>
   * 
   * <pre> -O &lt;number of runs&gt;
   *  Set the number of runs of
   *  optimizations. (Default: 2)</pre>
   * 
   * <pre> -D
   *  Set whether turn on the
   *  debug mode (Default: false)</pre>
   * 
   * <pre> -S &lt;seed&gt;
   *  The seed of randomization
   *  (Default: 1)</pre>
   * 
   * <pre> -E
   *  Whether NOT check the error rate&gt;=0.5
   *  in stopping criteria  (default: check)</pre>
   * 
   * <pre> -P
   *  Whether NOT use pruning
   *  (default: use pruning)</pre>
   * 
   <!-- options-end -->
   *
   * @param options the list of options as an array of strings
   * @throws Exception if an option is not supported
   */
00376   public void setOptions(String[] options) throws Exception {
    String numFoldsString = Utils.getOption('F', options);
    if (numFoldsString.length() != 0) 
      m_Folds = Integer.parseInt(numFoldsString);
    else 
      m_Folds = 3;   
      
    String minNoString = Utils.getOption('N', options);
    if (minNoString.length() != 0) 
      m_MinNo = Double.parseDouble(minNoString);
    else 
      m_MinNo = 2.0;
      
    String seedString = Utils.getOption('S', options);
    if (seedString.length() != 0)
      m_Seed = Long.parseLong(seedString);
    else 
      m_Seed = 1;

    String runString = Utils.getOption('O', options);
    if (runString.length() != 0)
      m_Optimizations = Integer.parseInt(runString);
    else 
      m_Optimizations = 2;

    m_Debug = Utils.getFlag('D', options);
    m_CheckErr = !Utils.getFlag('E', options);
    m_UsePruning = !Utils.getFlag('P', options);
  }
    
  /**
   * Gets the current settings of the Classifier.
   *
   * @return an array of strings suitable for passing to setOptions
   */
00411   public String [] getOptions() {

    String [] options = new String [11];
    int current = 0;
    options[current++] = "-F"; options[current++] = "" + m_Folds;
    options[current++] = "-N"; options[current++] = "" + m_MinNo;
    options[current++] = "-O"; options[current++] = "" + m_Optimizations;
    options[current++] = "-S"; options[current++] = "" + m_Seed;
      
    if(m_Debug)
      options[current++] = "-D";

    if(!m_CheckErr)
      options[current++] = "-E";
      
    if(!m_UsePruning)
      options[current++] = "-P";
      
    while(current < options.length)
      options[current++] = "";
      
    return options;
  }

  /**
   * Returns an enumeration of the additional measure names
   * @return an enumeration of the measure names
   */
00439   public Enumeration enumerateMeasures() {
    Vector newVector = new Vector(1);
    newVector.addElement("measureNumRules");
    return newVector.elements();
  }
    
  /**
   * Returns the value of the named measure
   * @param additionalMeasureName the name of the measure to query for its value
   * @return the value of the named measure
   * @throws IllegalArgumentException if the named measure is not supported
   */
00451   public double getMeasure(String additionalMeasureName) {
    if (additionalMeasureName.compareToIgnoreCase("measureNumRules") == 0) 
      return m_Ruleset.size();
    else 
      throw new IllegalArgumentException(additionalMeasureName+" not supported (RIPPER)");
  }  

  /**
   * Returns the tip text for this property
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
00463   public String foldsTipText() {
    return "Determines the amount of data used for pruning. One fold is used for "
      + "pruning, the rest for growing the rules.";
  }

  /**
   * Sets the number of folds to use
   * 
   * @param fold the number of folds
   */
00473   public void setFolds(int fold) { 
    m_Folds = fold; 
  }
  
  /**
   * Gets the number of folds
   * 
   * @return the number of folds
   */
00482   public int getFolds(){ 
    return m_Folds; 
  }

  /**
   * Returns the tip text for this property
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
00491   public String minNoTipText() {
    return "The minimum total weight of the instances in a rule.";
  }

  /**
   * Sets the minimum total weight of the instances in a rule
   * 
   * @param m the minimum total weight of the instances in a rule
   */
00500   public void setMinNo(double m) {
    m_MinNo = m;
  }
  
  /**
   * Gets the minimum total weight of the instances in a rule
   * 
   * @return the minimum total weight of the instances in a rule
   */
00509   public double getMinNo(){ 
    return m_MinNo;
  }

  /**
   * Returns the tip text for this property
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
00518   public String seedTipText() {
    return "The seed used for randomizing the data.";
  }

  /**
   * Sets the seed value to use in randomizing the data
   * 
   * @param s the new seed value
   */
00527   public void setSeed(long s) {
    m_Seed = s;
  }
  
  /**
   * Gets the current seed value to use in randomizing the data
   * 
   * @return the seed value
   */
00536   public long getSeed(){
    return m_Seed;
  }

  /**
   * Returns the tip text for this property
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
00545   public String optimizationsTipText() {
    return "The number of optimization runs.";
  }

  /**
   * Sets the number of optimization runs
   * 
   * @param run the number of optimization runs
   */
00554   public void setOptimizations(int run) {
    m_Optimizations = run;
  }
  
  /**
   * Gets the the number of optimization runs
   * 
   * @return the number of optimization runs
   */
00563   public int getOptimizations() {
    return m_Optimizations;
  }

  /**
   * Returns the tip text for this property
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
00572   public String debugTipText() {
    return "Whether debug information is output to the console.";
  }

  /**
   * Sets whether debug information is output to the console
   * 
   * @param d whether debug information is output to the console
   */
00581   public void setDebug(boolean d) {
    m_Debug = d;
  }
  
  /**
   * Gets whether debug information is output to the console
   * 
   * @return whether debug information is output to the console
   */
00590   public boolean getDebug(){
    return m_Debug;
  }

  /**
   * Returns the tip text for this property
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
00599   public String checkErrorRateTipText() {
    return "Whether check for error rate >= 1/2 is included" +
      " in stopping criterion.";
  }

  /**
   * Sets whether to check for error rate is in stopping criterion
   * 
   * @param d whether to check for error rate is in stopping criterion
   */
00609   public void setCheckErrorRate(boolean d) { 
    m_CheckErr = d;
  }
  
  /**
   * Gets whether to check for error rate is in stopping criterion
   * 
   * @return true if checking for error rate is in stopping criterion
   */
00618   public boolean getCheckErrorRate(){ 
    return m_CheckErr; 
  }

  /**
   * Returns the tip text for this property
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
00627   public String usePruningTipText() {
    return "Whether pruning is performed.";
  }

  /**
   * Sets whether pruning is performed
   * 
   * @param d Whether pruning is performed
   */
00636   public void setUsePruning(boolean d) { 
    m_UsePruning = d;
  }
  
  /**
   * Gets whether pruning is performed
   * 
   * @return true if pruning is performed
   */
00645   public boolean getUsePruning(){ 
    return m_UsePruning; 
  }
    
  /** 
   * Get the ruleset generated by Ripper 
   *
   * @return the ruleset
   */
00654   public FastVector getRuleset(){ return m_Ruleset; }

  /** 
   * Get the statistics of the ruleset in the given position
   *
   * @param pos the position of the stats, assuming correct
   * @return the statistics of the ruleset in the given position
   */
00662   public RuleStats getRuleStats(int pos) {
    return (RuleStats)m_RulesetStats.elementAt(pos);
  }
    
  /** 
   * The single antecedent in the rule, which is composed of an attribute and 
   * the corresponding value.  There are two inherited classes, namely NumericAntd
   * and NominalAntd in which the attributes are numeric and nominal respectively.
   */    
00671   private abstract class Antd 
    implements WeightedInstancesHandler, Copyable, Serializable, RevisionHandler {

    /** for serialization */
00675     private static final long serialVersionUID = -8929754772994154334L;
      
    /** The attribute of the antecedent */
00678     protected Attribute att;
      
    /** The attribute value of the antecedent.  
       For numeric attribute, value is either 0(1st bag) or 1(2nd bag) */
00682     protected double value; 
      
    /** The maximum infoGain achieved by this antecedent test 
     * in the growing data */
00686     protected double maxInfoGain;
      
    /** The accurate rate of this antecedent test on the growing data */
00689     protected double accuRate;
      
    /** The coverage of this antecedent in the growing data */
00692     protected double cover;
      
    /** The accurate data for this antecedent in the growing data */
00695     protected double accu;
      
    /** 
     * Constructor
     */
00700     public Antd(Attribute a){
      att=a;
      value=Double.NaN; 
      maxInfoGain = 0;
      accuRate = Double.NaN;
      cover = Double.NaN;
      accu = Double.NaN;
    }
      
    /* The abstract members for inheritance */
    public abstract Instances[] splitData(Instances data, double defAcRt, 
                                double cla);
    public abstract boolean covers(Instance inst);
    public abstract String toString();
      
    /** 
     * Implements Copyable
     * 
     * @return a copy of this object
     */
    public abstract Object copy(); 
         
    /* Get functions of this antecedent */
    public Attribute getAttr(){ return att; }
    public double getAttrValue(){ return value; }
    public double getMaxInfoGain(){ return maxInfoGain; }
    public double getAccuRate(){ return accuRate; } 
    public double getAccu(){ return accu; } 
    public double getCover(){ return cover; } 
    
    /**
     * Returns the revision string.
     * 
     * @return          the revision
     */
00735     public String getRevision() {
      return RevisionUtils.extract("$Revision: 1.22 $");
    }
  }
    
  /** 
   * The antecedent with numeric attribute
   */
00743   private class 
    NumericAntd extends Antd {
    
    /** for serialization */
00747     static final long serialVersionUID = 5699457269983735442L;
      
    /** The split point for this numeric antecedent */
00750     private double splitPoint;
    
    /** 
     * Constructor
     */
00755     public NumericAntd(Attribute a){ 
      super(a);
      splitPoint = Double.NaN;
    }    
      
    /** 
     * Get split point of this numeric antecedent
     * 
     * @return the split point of this numeric antecedent
     */
00765     public double getSplitPoint(){ 
      return splitPoint;
    }
      
    /** 
     * Implements Copyable
     * 
     * @return a copy of this object
     */
00774     public Object copy(){ 
      NumericAntd na = new NumericAntd(getAttr());
      na.value = this.value;
      na.splitPoint = this.splitPoint;
      return na;
    }
      
    /**
     * Implements the splitData function.  
     * This procedure is to split the data into two bags according 
     * to the information gain of the numeric attribute value
     * The maximum infoGain is also calculated.  
     * 
     * @param insts the data to be split
     * @param defAcRt the default accuracy rate for data
     * @param cl the class label to be predicted
     * @return the array of data after split
     */
00792     public Instances[] splitData(Instances insts, double defAcRt, 
                         double cl){
      Instances data = insts;
      int total=data.numInstances();// Total number of instances without 
      // missing value for att
          
      int split=1;                  // Current split position
      int prev=0;                   // Previous split position
      int finalSplit=split;         // Final split position
      maxInfoGain = 0;
      value = 0;  
          
      double fstCover=0, sndCover=0, fstAccu=0, sndAccu=0;
          
      data.sort(att);
      // Find the las instance without missing value 
      for(int x=0; x<data.numInstances(); x++){
      Instance inst = data.instance(x);
      if(inst.isMissing(att)){
        total = x;
        break;
      }
            
      sndCover += inst.weight();
      if(Utils.eq(inst.classValue(), cl))
        sndAccu += inst.weight();         
      }         

      if(total == 0) return null; // Data all missing for the attribute
      splitPoint = data.instance(total-1).value(att); 
          
      for(; split <= total; split++){
      if((split == total) ||
         (data.instance(split).value(att) > // Can't split within
          data.instance(prev).value(att))){ // same value       
                
        for(int y=prev; y<split; y++){
          Instance inst = data.instance(y);
          fstCover += inst.weight(); 
          if(Utils.eq(data.instance(y).classValue(), cl)){
            fstAccu += inst.weight();  // First bag positive# ++
          }                
        }
                
        double fstAccuRate = (fstAccu+1.0)/(fstCover+1.0),
          sndAccuRate = (sndAccu+1.0)/(sndCover+1.0);
                
        /* Which bag has higher information gain? */
        boolean isFirst; 
        double fstInfoGain, sndInfoGain;
        double accRate, infoGain, coverage, accurate;
                
        fstInfoGain = 
          //Utils.eq(defAcRt, 1.0) ? 
          //fstAccu/(double)numConds : 
          fstAccu*(Utils.log2(fstAccuRate)-Utils.log2(defAcRt));
                
        sndInfoGain = 
          //Utils.eq(defAcRt, 1.0) ? 
          //sndAccu/(double)numConds : 
          sndAccu*(Utils.log2(sndAccuRate)-Utils.log2(defAcRt));
                
        if(fstInfoGain > sndInfoGain){
          isFirst = true;
          infoGain = fstInfoGain;
          accRate = fstAccuRate;
          accurate = fstAccu;
          coverage = fstCover;
        }
        else{
          isFirst = false;
          infoGain = sndInfoGain;
          accRate = sndAccuRate;
          accurate = sndAccu;
          coverage = sndCover;
        }
                
        /* Check whether so far the max infoGain */
        if(infoGain > maxInfoGain){
          splitPoint = data.instance(prev).value(att);
          value = (isFirst) ? 0 : 1;
          accuRate = accRate;
          accu = accurate;
          cover = coverage;
          maxInfoGain = infoGain;
          finalSplit = (isFirst) ? split : prev;
        }
                
        for(int y=prev; y<split; y++){
          Instance inst = data.instance(y);
          sndCover -= inst.weight(); 
          if(Utils.eq(data.instance(y).classValue(), cl)){
            sndAccu -= inst.weight();  // Second bag positive# --
          }                
        }             
        prev=split;
      }
      }
          
      /* Split the data */
      Instances[] splitData = new Instances[2];
      splitData[0] = new Instances(data, 0, finalSplit);
      splitData[1] = new Instances(data, finalSplit, total-finalSplit);
          
      return splitData;
    }
      
    /**
     * Whether the instance is covered by this antecedent
     * 
     * @param inst the instance in question
     * @return the boolean value indicating whether the instance is covered
     *         by this antecedent
     */
00906     public boolean covers(Instance inst){
      boolean isCover=true;
      if(!inst.isMissing(att)){
      if((int)value == 0){ // First bag
        if(inst.value(att) > splitPoint)
          isCover=false;
      }
      else if(inst.value(att) < splitPoint) // Second bag
        isCover=false;
      }
      else
      isCover = false;
          
      return isCover;
    }
      
    /**
     * Prints this antecedent
     *
     * @return a textual description of this antecedent
     */
00927     public String toString() {
      String symbol = ((int)value == 0) ? " <= " : " >= ";
      return (att.name() + symbol + Utils.doubleToString(splitPoint, 6));
    }   
    
    /**
     * Returns the revision string.
     * 
     * @return          the revision
     */
00937     public String getRevision() {
      return RevisionUtils.extract("$Revision: 1.22 $");
    }
  }
    
    
  /** 
   * The antecedent with nominal attribute
   */
00946   private class NominalAntd 
    extends Antd {
      
    /** for serialization */
00950     static final long serialVersionUID = -9102297038837585135L;
    
    /* The parameters of infoGain calculated for each attribute value
     * in the growing data */
    private double[] accurate;
    private double[] coverage;
      
    /** 
     * Constructor
     */
00960     public NominalAntd(Attribute a){ 
      super(a);    
      int bag = att.numValues();
      accurate = new double[bag];
      coverage = new double[bag];
    }   

    /** 
     * Implements Copyable
     * 
     * @return a copy of this object
     */
00972     public Object copy(){
      Antd antec = new NominalAntd(getAttr());
      antec.value = this.value;
      return antec;         
    }
      
    /**
     * Implements the splitData function.  
     * This procedure is to split the data into bags according 
     * to the nominal attribute value
     * The infoGain for each bag is also calculated.  
     * 
     * @param data the data to be split
     * @param defAcRt the default accuracy rate for data
     * @param cl the class label to be predicted
     * @return the array of data after split
     */
00989     public Instances[] splitData(Instances data, double defAcRt, 
                         double cl){
      int bag = att.numValues();
      Instances[] splitData = new Instances[bag];
          
      for(int x=0; x<bag; x++){
      splitData[x] = new Instances(data, data.numInstances());
      accurate[x] = 0;
      coverage[x] = 0;
      }
          
      for(int x=0; x<data.numInstances(); x++){
      Instance inst=data.instance(x);
      if(!inst.isMissing(att)){
        int v = (int)inst.value(att);
        splitData[v].add(inst);
        coverage[v] += inst.weight();
        if((int)inst.classValue() == (int)cl)
          accurate[v] += inst.weight();
      }
      }
          
      for(int x=0; x<bag; x++){
      double t = coverage[x]+1.0;
      double p = accurate[x] + 1.0;       
      double infoGain = 
        //Utils.eq(defAcRt, 1.0) ? 
        //accurate[x]/(double)numConds : 
        accurate[x]*(Utils.log2(p/t)-Utils.log2(defAcRt));
            
      if(infoGain > maxInfoGain){
        maxInfoGain = infoGain;
        cover = coverage[x];
        accu = accurate[x];
        accuRate = p/t;
        value = (double)x;
      }
      }
          
      return splitData;
    }
      
    /**
     * Whether the instance is covered by this antecedent
     * 
     * @param inst the instance in question
     * @return the boolean value indicating whether the instance is
     *         covered by this antecedent
     */
01038     public boolean covers(Instance inst){
      boolean isCover=false;
      if(!inst.isMissing(att)){
      if((int)inst.value(att) == (int)value)
        isCover=true;       
      }
      return isCover;
    }
      
    /**
     * Prints this antecedent
     *
     * @return a textual description of this antecedent
     */
01052     public String toString() {
      return (att.name() + " = " +att.value((int)value));
    } 
    
    /**
     * Returns the revision string.
     * 
     * @return          the revision
     */
01061     public String getRevision() {
      return RevisionUtils.extract("$Revision: 1.22 $");
    }
  }

    
  /**
   * This class implements a single rule that predicts a specified class.
   *
   * A rule consists of antecedents "AND"ed together and the consequent
   * (class value) for the classification.
   * In this class, the Information Gain (p*[log(p/t) - log(P/T)]) is used to
   * select an antecedent and Reduced Error Pruning (REP) with the metric
   * of accuracy rate p/(p+n) or (TP+TN)/(P+N) is used to prune the rule.
   */
  protected class RipperRule
    extends Rule {

    /** for serialization */
    static final long serialVersionUID = -2410020717305262952L;

    /** The internal representation of the class label to be predicted;
     * -1 means "not set yet" (grow() refuses to run in that state) */
    private double m_Consequent = -1;

    /** The vector of antecedents of this rule */
    protected FastVector m_Antds = null;

    /** Constructor: starts with an empty antecedent list (default rule) */
    public RipperRule(){
      m_Antds = new FastVector();
    }

    /**
     * Sets the internal representation of the class label to be predicted
     *
     * @param cl the internal representation of the class label to be predicted
     */
    public void setConsequent(double cl) {
      m_Consequent = cl;
    }

    /**
     * Gets the internal representation of the class label to be predicted
     *
     * @return the internal representation of the class label to be predicted
     */
    public double getConsequent() {
      return m_Consequent;
    }

    /**
     * Get a shallow copy of this rule: the consequent is copied and the
     * antecedent vector is copied element-wise (Antd.copy()).
     *
     * @return the copy
     */
    public Object copy(){
      RipperRule copy = new RipperRule();
      copy.setConsequent(getConsequent());
      copy.m_Antds = (FastVector)this.m_Antds.copyElements();
      return copy;
    }

    /**
     * Whether the instance is covered by this rule, i.e. by the
     * conjunction of all its antecedents.  A rule with no antecedents
     * covers everything.
     *
     * @param datum the instance in question
     * @return the boolean value indicating whether the instance
     *         is covered by this rule
     */
    public boolean covers(Instance datum){
      boolean isCover=true;

      for(int i=0; i<m_Antds.size(); i++){
        Antd antd = (Antd)m_Antds.elementAt(i);
        if(!antd.covers(datum)){
          isCover = false;
          break;
        }
      }

      return isCover;
    }

    /**
     * Whether this rule has antecedents, i.e. whether it is a default rule
     *
     * @return the boolean value indicating whether the rule has antecedents
     */
    public boolean hasAntds(){
      if (m_Antds == null)
        return false;
      else
        return (m_Antds.size() > 0);
    }

    /**
     * The number of antecedents of the rule
     *
     * @return the size of this rule
     */
    public double size(){ return (double)m_Antds.size(); }

    /**
     * Private function to compute the default number of accurate instances
     * (sum of weights of instances whose class equals the consequent)
     * in the specified data.
     *
     * @param data the data in question
     * @return the default accuracy number
     */
    private double computeDefAccu(Instances data){
      double defAccu=0;
      for(int i=0; i<data.numInstances(); i++){
        Instance inst = data.instance(i);
        if((int)inst.classValue() == (int)m_Consequent)
          defAccu += inst.weight();
      }
      return defAccu;
    }

    /**
     * Build one rule using the growing data.  Antecedents are added
     * greedily, one per iteration, choosing the candidate with the
     * highest information gain; growing stops when no candidate has
     * positive gain, when accuracy coverage falls below m_MinNo, or
     * when the rule is already pure (defAcRt == 1).
     *
     * @param data the growing data used to build the rule
     * @throws Exception if the consequent is not set yet
     */
    public void grow(Instances data) throws Exception {
      if(m_Consequent == -1)
        throw new Exception(" Consequent not set yet.");

      Instances growData = data;
      double sumOfWeights = growData.sumOfWeights();
      if(!Utils.gr(sumOfWeights, 0.0))
        return;

      /* Compute the default accuracy rate of the growing data
       * (Laplace-smoothed) */
      double defAccu = computeDefAccu(growData);
      double defAcRt = (defAccu+1.0)/(sumOfWeights+1.0);

      /* Keep the record of which attributes have already been used;
       * only nominal attributes are single-use */
      boolean[] used=new boolean [growData.numAttributes()];
      for (int k=0; k<used.length; k++)
        used[k]=false;
      int numUnused=used.length;

      // Mark nominal attributes already consumed by existing antecedents
      // (e.g. when re-growing a pruned rule during optimization)
      for(int j=0; j < m_Antds.size(); j++){
        Antd antdj = (Antd)m_Antds.elementAt(j);
        if(!antdj.getAttr().isNumeric()){
          used[antdj.getAttr().index()]=true;
          numUnused--;
        }
      }

      double maxInfoGain;
      while (Utils.gr(growData.numInstances(), 0.0) &&
             (numUnused > 0)
             && Utils.sm(defAcRt, 1.0)
             ){

        // We require that infoGain be positive
        maxInfoGain = 0.0;

        /* Best antecedent found in this iteration and the data it covers */
        Antd oneAntd=null;
        Instances coverData = null;
        Enumeration enumAttr=growData.enumerateAttributes();

        /* Build one condition based on all attributes not used yet */
        while (enumAttr.hasMoreElements()){
          Attribute att= (Attribute)(enumAttr.nextElement());

          if(m_Debug)
            System.err.println("\nOne condition: size = "
                               + growData.sumOfWeights());

          Antd antd =null;
          if(att.isNumeric())
            antd = new NumericAntd(att);
          else
            antd = new NominalAntd(att);

          if(!used[att.index()]){
            /* Compute the best information gain for each attribute,
               it's stored in the antecedent formed by this attribute.
               This procedure returns the data covered by the antecedent */
            Instances coveredData = computeInfoGain(growData, defAcRt,
                                                    antd);
            if(coveredData != null){
              double infoGain = antd.getMaxInfoGain();
              if(m_Debug)
                System.err.println("Test of \'"+antd.toString()+
                                   "\': infoGain = "+
                                   infoGain + " | Accuracy = " +
                                   antd.getAccuRate()+
                                   "="+antd.getAccu()
                                   +"/"+antd.getCover()+
                                   " def. accuracy: "+defAcRt);

              if(infoGain > maxInfoGain){
                oneAntd=antd;
                coverData = coveredData;
                maxInfoGain = infoGain;
              }
            }
          }
        }

        if(oneAntd == null) break; // Cannot find antds
        if(Utils.sm(oneAntd.getAccu(), m_MinNo)) break;// Too low coverage

        // Numeric attributes can be used more than once
        if(!oneAntd.getAttr().isNumeric()){
          used[oneAntd.getAttr().index()]=true;
          numUnused--;
        }

        m_Antds.addElement(oneAntd);
        growData = coverData;// Grow data size is shrinking
        defAcRt = oneAntd.getAccuRate();
      }
    }

    /**
     * Compute the best information gain for the specified antecedent.
     * The gain itself is stored inside the antecedent (see
     * Antd.getMaxInfoGain()); this method returns the covered bag.
     *
     * @param instances the data based on which the infoGain is computed
     * @param defAcRt the default accuracy rate of data
     * @param antd the specific antecedent
     * @return the data covered by the antecedent
     */
    private Instances computeInfoGain(Instances instances, double defAcRt,
                                      Antd antd){
      Instances data = instances;

      /* Split the data into bags.
         The information gain of each bag is also calculated in this procedure */
      Instances[] splitData = antd.splitData(data, defAcRt,
                                             m_Consequent);

      /* Get the bag of data to be used for next antecedents */
      if(splitData != null)
        return splitData[(int)antd.getAttrValue()];
      else return null;
    }

    /**
     * Prune all the possible final sequences of the rule using the
     * pruning data.  The rule is truncated after the antecedent whose
     * prefix maximizes the chosen worth metric; ties favor the shorter
     * rule.
     *
     * @param pruneData the pruning data used to prune the rule
     * @param useWhole flag to indicate whether to use the error rate of
     *                 the whole pruning data instead of the data covered
     */
    public void prune(Instances pruneData, boolean useWhole){
      Instances data = pruneData;

      double total = data.sumOfWeights();
      if(!Utils.gr(total, 0.0))
        return;

      /* The default accurate # and rate on pruning data */
      double defAccu=computeDefAccu(data);

      if(m_Debug)
        System.err.println("Pruning with " + defAccu +
                           " positive data out of " + total +
                           " instances");

      int size=m_Antds.size();
      if(size == 0) return; // Default rule before pruning

      /* worthRt[x]: worth metric of the prefix ending at antecedent x;
       * coverage[x]/worthValue[x]: covered / correct weights of that prefix */
      double[] worthRt = new double[size];
      double[] coverage = new double[size];
      double[] worthValue = new double[size];
      for(int w=0; w<size; w++){
        worthRt[w]=coverage[w]=worthValue[w]=0.0;
      }

      /* Calculate accuracy parameters for all the antecedents in this rule.
       * Note: `data` is rebound each iteration to the instances covered so
       * far, so statistics are cumulative over the antecedent prefix. */
      double tn = 0.0; // True negative if useWhole
      for(int x=0; x<size; x++){
        Antd antd=(Antd)m_Antds.elementAt(x);
        Instances newData = data;
        data = new Instances(newData, 0); // Make data empty

        for(int y=0; y<newData.numInstances(); y++){
          Instance ins=newData.instance(y);

          if(antd.covers(ins)){   // Covered by this antecedent
            coverage[x] += ins.weight();
            data.add(ins);                 // Add to data for further pruning
            if((int)ins.classValue() == (int)m_Consequent) // Accurate prediction
              worthValue[x] += ins.weight();
          }
          else if(useWhole){ // Not covered
            if((int)ins.classValue() != (int)m_Consequent)
              tn += ins.weight();
          }
        }

        if(useWhole){
          worthValue[x] += tn;
          worthRt[x] = worthValue[x] / total;
        }
        else // Note if coverage is 0, accuracy is 0.5 (Laplace smoothing)
          worthRt[x] = (worthValue[x]+1.0)/(coverage[x]+2.0);
      }

      // Baseline is the worth of the empty (default) rule
      double maxValue = (defAccu+1.0)/(total+2.0);
      int maxIndex = -1;
      for(int i=0; i<worthValue.length; i++){
        if(m_Debug){
          double denom = useWhole ? total : coverage[i];
          System.err.println(i+"(useAccuray? "+!useWhole+"): "
                             + worthRt[i] +
                             "="+worthValue[i]+
                             "/"+denom);
        }
        // Strict '>' prefers the shorter rule on ties
        if(worthRt[i] > maxValue){
          maxValue = worthRt[i];
          maxIndex = i;
        }
      }

      /* Prune the antecedents according to the accuracy parameters */
      for(int z=size-1;z>maxIndex;z--)
        m_Antds.removeElementAt(z);
    }

    /**
     * Prints this rule as "(antd1) and (antd2) ... => class=value".
     *
     * @param classAttr the class attribute in the data
     * @return a textual description of this rule
     */
    public String toString(Attribute classAttr) {
      StringBuffer text =  new StringBuffer();
      if(m_Antds.size() > 0){
        for(int j=0; j< (m_Antds.size()-1); j++)
          text.append("(" + ((Antd)(m_Antds.elementAt(j))).toString()+ ") and ");
        text.append("("+((Antd)(m_Antds.lastElement())).toString() + ")");
      }
      text.append(" => " + classAttr.name() +
                  "=" + classAttr.value((int)m_Consequent));

      return text.toString();
    }

    /**
     * Returns the revision string.
     *
     * @return          the revision
     */
    public String getRevision() {
      return RevisionUtils.extract("$Revision: 1.22 $");
    }
  }

  /**
   * Returns default capabilities of the classifier.
   *
   * @return      the capabilities of this classifier
   */
01434   public Capabilities getCapabilities() {
    Capabilities result = super.getCapabilities();

    // attributes
    result.enable(Capability.NOMINAL_ATTRIBUTES);
    result.enable(Capability.NUMERIC_ATTRIBUTES);
    result.enable(Capability.DATE_ATTRIBUTES);
    result.enable(Capability.MISSING_VALUES);

    // class
    result.enable(Capability.NOMINAL_CLASS);
    result.enable(Capability.MISSING_CLASS_VALUES);
   
    // instances
    result.setMinimumNumberInstances(m_Folds);
    
    return result;
  }
    
  /**
   * Builds Ripper in the order of class frequencies.  For each class
   * it's built in two stages: building and optimization 
   *
   * @param instances the training data
   * @throws Exception if classifier can't be built successfully
   */
01460   public void buildClassifier(Instances instances) throws Exception {
     
    // can classifier handle the data?
    getCapabilities().testWithFail(instances);

    // remove instances with missing class
    instances = new Instances(instances);
    instances.deleteWithMissingClass();
    
    m_Random = instances.getRandomNumberGenerator(m_Seed); 
    m_Total = RuleStats.numAllConditions(instances);
    if(m_Debug)
      System.err.println("Number of all possible conditions = "+m_Total);
    
    Instances data = null;
    m_Filter = new ClassOrder();
    ((ClassOrder)m_Filter).setSeed(m_Random.nextInt());     
    ((ClassOrder)m_Filter).setClassOrder(ClassOrder.FREQ_ASCEND);
    m_Filter.setInputFormat(instances);
    data = Filter.useFilter(instances, m_Filter);
      
    if(data == null)
      throw new Exception(" Unable to randomize the class orders.");
    
    m_Class = data.classAttribute();      
    m_Ruleset = new FastVector();
    m_RulesetStats = new FastVector();
    m_Distributions = new FastVector();

    // Sort by classes frequency
    double[] orderedClasses = ((ClassOrder)m_Filter).getClassCounts();
    if(m_Debug){
      System.err.println("Sorted classes:");
      for(int x=0; x < m_Class.numValues(); x++)
      System.err.println(x+": "+m_Class.value(x) + " has " +
                     orderedClasses[x] + " instances.");
    }
    // Iterate from less prevalent class to more frequent one
  oneClass: 
    for(int y=0; y < data.numClasses()-1; y++){ // For each class       
          
      double classIndex = (double)y;
      if(m_Debug){
      int ci = (int)classIndex;
      System.err.println("\n\nClass "+m_Class.value(ci)+"("+ci+"): "
                     + orderedClasses[y] + "instances\n"+
                     "=====================================\n");
      }
            
      if(Utils.eq(orderedClasses[y],0.0)) // No data for this class
      continue oneClass;            
          
      // The expected FP/err is the proportion of the class
      double all = 0;
      for(int i=y; i<orderedClasses.length; i++)
      all += orderedClasses[i];
      double expFPRate = orderedClasses[y] / all;         
            
      double classYWeights = 0, totalWeights = 0;
      for(int j=0; j < data.numInstances(); j++){
        Instance datum = data.instance(j);
        totalWeights += datum.weight();
        if((int)datum.classValue() == y){
            classYWeights += datum.weight();
        }             
      }     
          
      // DL of default rule, no theory DL, only data DL
      double defDL;
      if(classYWeights > 0)
        defDL = RuleStats.dataDL(expFPRate, 
                           0.0,
                           totalWeights,
                           0.0,
                           classYWeights);          
      else
        continue oneClass; // Subsumed by previous rules

      if(Double.isNaN(defDL) || Double.isInfinite(defDL))
      throw new Exception("Should never happen: "+
                      "defDL NaN or infinite!");
      if(m_Debug)
      System.err.println("The default DL = "+defDL);
          
      data = rulesetForOneClass(expFPRate, data, classIndex, defDL);
    }

    // Set the default rule
    RipperRule defRule = new RipperRule();
    defRule.setConsequent((double)(data.numClasses()-1));
    m_Ruleset.addElement(defRule);
      
    RuleStats defRuleStat = new RuleStats();
    defRuleStat.setData(data);
    defRuleStat.setNumAllConds(m_Total);
    defRuleStat.addAndUpdate(defRule);
    m_RulesetStats.addElement(defRuleStat);

    for(int z=0; z < m_RulesetStats.size(); z++){
      RuleStats oneClass = (RuleStats)m_RulesetStats.elementAt(z);
      for(int xyz=0; xyz < oneClass.getRulesetSize(); xyz++){
          double[] classDist = oneClass.getDistributions(xyz);
          Utils.normalize(classDist);
          if(classDist != null)
            m_Distributions.addElement(((ClassOrder)m_Filter).distributionsByOriginalIndex(classDist));
      }     
    }    
  }
    
  /**
   * Classify the test instance with the rule learner and provide
   * the class distributions 
   *
   * @param datum the instance to be classified
   * @return the distribution
   */
01576     public double[] distributionForInstance(Instance datum){
      try{
          for(int i=0; i < m_Ruleset.size(); i++){
            RipperRule rule = (RipperRule)m_Ruleset.elementAt(i);
            if(rule.covers(datum))
                return (double[])m_Distributions.elementAt(i); 
          }
      }catch(Exception e){
          System.err.println(e.getMessage());
          e.printStackTrace();
      }
      
      System.err.println("Should never happen!");
      return new double[datum.classAttribute().numValues()];
    }

  /** Build a ruleset for the given class according to the given data
   *
   * @param expFPRate the expected FP/(FP+FN) used in DL calculation
   * @param data the given data
   * @param classIndex the given class index
   * @param defDL the default DL in the data
   * @throws Exception if the ruleset can be built properly
   */
01600   protected Instances rulesetForOneClass(double expFPRate, 
                               Instances data, 
                               double classIndex,
                               double defDL)
    throws Exception {
      
    Instances newData = data, growData, pruneData;    
    boolean stop = false;
    FastVector ruleset = new FastVector();            
      
    double dl = defDL, minDL = defDL;
    RuleStats rstats = null;
    double[] rst;
      
    // Check whether data have positive examples
    boolean defHasPositive = true; // No longer used
    boolean hasPositive = defHasPositive;
      
    /********************** Building stage ***********************/     
    if(m_Debug)
      System.err.println("\n*** Building stage ***");
    
    while((!stop) && hasPositive){ // Generate new rules until
      // stopping criteria met
      RipperRule oneRule;
      if(m_UsePruning){       
      /* Split data into Grow and Prune*/

      // We should have stratified the data, but ripper seems
      // to have a bug that makes it not to do so.  In order
      // to simulate it more precisely, we do the same thing.
      //newData.randomize(m_Random);      
      newData = RuleStats.stratify(newData, m_Folds, m_Random);         
      Instances[] part = RuleStats.partition(newData, m_Folds);
      growData=part[0];
      pruneData=part[1];
      //growData=newData.trainCV(m_Folds, m_Folds-1);
      //pruneData=newData.testCV(m_Folds, m_Folds-1); 
            
      oneRule = new RipperRule();
      oneRule.setConsequent(classIndex);  // Must set first
            
      if(m_Debug)
        System.err.println("\nGrowing a rule ...");  
      oneRule.grow(growData);             // Build the rule
      if(m_Debug)
        System.err.println("One rule found before pruning:"+
                       oneRule.toString(m_Class));
            
      if(m_Debug)
        System.err.println("\nPruning the rule ...");  
      oneRule.prune(pruneData, false);    // Prune the rule
      if(m_Debug)
        System.err.println("One rule found after pruning:"+
                       oneRule.toString(m_Class));
      }
      else{
      oneRule = new RipperRule();
      oneRule.setConsequent(classIndex);  // Must set first
      if(m_Debug)
        System.err.println("\nNo pruning: growing a rule ...");
      oneRule.grow(newData);             // Build the rule
      if(m_Debug)
        System.err.println("No pruning: one rule found:\n"+
                       oneRule.toString(m_Class));
      }
          
      // Compute the DL of this ruleset
      if(rstats == null){ // First rule
      rstats = new RuleStats();
      rstats.setNumAllConds(m_Total);
      rstats.setData(newData);
      }
          
      rstats.addAndUpdate(oneRule);           
      int last = rstats.getRuleset().size()-1; // Index of last rule
      dl += rstats.relativeDL(last, expFPRate, m_CheckErr);
          
      if(Double.isNaN(dl) || Double.isInfinite(dl))
      throw new Exception("Should never happen: dl in "+
                      "building stage NaN or infinite!");
      if(m_Debug)
      System.err.println("Before optimization("+last+
                     "): the dl = "+dl+" | best: "+minDL);
          
      if(dl < minDL)
      minDL = dl;  // The best dl so far  
          
      rst = rstats.getSimpleStats(last);      
      if(m_Debug)
      System.err.println("The rule covers: "+rst[0]+
                     " | pos = " + rst[2] + 
                     " | neg = " + rst[4]+
                     "\nThe rule doesn't cover: "+rst[1]+
                     " | pos = " + rst[5]);
          
      stop = checkStop(rst, minDL, dl);
          
      if(!stop){              
      ruleset.addElement(oneRule);          // Accepted 
      newData = rstats.getFiltered(last)[1];// Data not covered
      hasPositive = Utils.gr(rst[5], 0.0);  // Positives remaining?
      if(m_Debug)
        System.err.println("One rule added: has positive? "
                       +hasPositive);
      }
      else{
      if(m_Debug)
        System.err.println("Quit rule");
      rstats.removeLast(); // Remove last to be re-used
      }
    }// while !stop     
      
    /******************** Optimization stage *******************/
    RuleStats finalRulesetStat = null;
    if(m_UsePruning){    
      for(int z=0; z < m_Optimizations; z++){
      if(m_Debug)
        System.err.println("\n*** Optimization: run #"
                       +z+" ***");
            
      newData = data;             
      finalRulesetStat = new RuleStats();
      finalRulesetStat.setData(newData);
      finalRulesetStat.setNumAllConds(m_Total);
      int position=0;
      stop = false;
      boolean isResidual = false;       
      hasPositive = defHasPositive;           
      dl = minDL = defDL;
            
      oneRule:    
      while(!stop && hasPositive){              
                
        isResidual = (position>=ruleset.size()); // Cover residual positive examples  
        // Re-do shuffling and stratification    
        //newData.randomize(m_Random);    
        newData = RuleStats.stratify(newData, m_Folds, m_Random);
        Instances[] part = RuleStats.partition(newData, m_Folds);
        growData=part[0];
        pruneData=part[1];
        //growData=newData.trainCV(m_Folds, m_Folds-1);
        //pruneData=newData.testCV(m_Folds, m_Folds-1);        
        RipperRule finalRule;
                
        if(m_Debug)
          System.err.println("\nRule #"+position +
                         "| isResidual?" + isResidual+
                         "| data size: "+newData.sumOfWeights());
                
        if(isResidual){
          RipperRule newRule = new RipperRule();   
          newRule.setConsequent(classIndex);
          if(m_Debug)
            System.err.println("\nGrowing and pruning"+
                         " a new rule ..."); 
          newRule.grow(growData);
          newRule.prune(pruneData, false);
          finalRule = newRule;
          if(m_Debug)
            System.err.println("\nNew rule found: "+
                         newRule.toString(m_Class));
        }
        else{
          RipperRule oldRule = (RipperRule)ruleset.elementAt(position);
          boolean covers = false;
          // Test coverage of the next old rule
          for(int i=0; i<newData.numInstances(); i++)
            if(oldRule.covers(newData.instance(i))){
            covers = true;
            break;
            }
                  
          if(!covers){// Null coverage, no variants can be generated
            finalRulesetStat.addAndUpdate(oldRule);
            position++;
            continue oneRule;
          }  
                  
          // 2 variants 
          if(m_Debug)
            System.err.println("\nGrowing and pruning"+
                         " Replace ..."); 
          RipperRule replace = new RipperRule();   
          replace.setConsequent(classIndex);
          replace.grow(growData);
                  
          // Remove the pruning data covered by the following
          // rules, then simply compute the error rate of the
          // current rule to prune it.  According to Ripper,
          // it's equivalent to computing the error of the 
          // whole ruleset -- is it true?
          pruneData = RuleStats.rmCoveredBySuccessives(pruneData,ruleset, position);            
          replace.prune(pruneData, true);
                  
          if(m_Debug)
            System.err.println("\nGrowing and pruning"+
                         " Revision ..."); 
          RipperRule revision = (RipperRule)oldRule.copy(); 
                  
          // For revision, first rm the data covered by the old rule
          Instances newGrowData = new Instances(growData, 0);
          for(int b=0; b<growData.numInstances(); b++){
            Instance inst = growData.instance(b);
            if(revision.covers(inst))
            newGrowData.add(inst);
          }
          revision.grow(newGrowData);           
          revision.prune(pruneData, true);
                  
          double[][] prevRuleStats = new double[position][6];
          for(int c=0; c < position; c++)
            prevRuleStats[c] = finalRulesetStat.getSimpleStats(c);

          // Now compare the relative DL of variants
          FastVector tempRules = (FastVector)ruleset.copyElements();
          tempRules.setElementAt(replace, position);
                  
          RuleStats repStat = new RuleStats(data, tempRules);
          repStat.setNumAllConds(m_Total);
          repStat.countData(position, newData, prevRuleStats);
          //repStat.countData();
          rst = repStat.getSimpleStats(position);         
          if(m_Debug)
            System.err.println("Replace rule covers: "+rst[0]+
                         " | pos = " + rst[2] + 
                         " | neg = " + rst[4]+
                         "\nThe rule doesn't cover: "+rst[1]+
                         " | pos = " + rst[5]);
                  
          double repDL = repStat.relativeDL(position, expFPRate,
                                    m_CheckErr);
          if(m_Debug)
            System.err.println("\nReplace: "+
                         replace.toString(m_Class)
                         +" |dl = "+repDL); 
                  
          if(Double.isNaN(repDL) || Double.isInfinite(repDL))
            throw new Exception("Should never happen: repDL"+
                          "in optmz. stage NaN or "+
                          "infinite!");
                  
          tempRules.setElementAt(revision, position);
          RuleStats revStat = new RuleStats(data, tempRules);
          revStat.setNumAllConds(m_Total);
          revStat.countData(position, newData, prevRuleStats);
          //revStat.countData();
          double revDL = revStat.relativeDL(position, expFPRate,
                                    m_CheckErr);
                  
          if(m_Debug)
            System.err.println("Revision: "
                         + revision.toString(m_Class)
                         +" |dl = "+revDL);
                  
          if(Double.isNaN(revDL) || Double.isInfinite(revDL))
            throw new Exception("Should never happen: revDL"+
                          "in optmz. stage NaN or "+
                          "infinite!");
                  
          rstats = new RuleStats(data, ruleset);
          rstats.setNumAllConds(m_Total);
          rstats.countData(position, newData, prevRuleStats);
          //rstats.countData();
          double oldDL = rstats.relativeDL(position, expFPRate,
                                   m_CheckErr);
                  
          if(Double.isNaN(oldDL) || Double.isInfinite(oldDL))
            throw new Exception("Should never happen: oldDL"+
                          "in optmz. stage NaN or "+
                          "infinite!");
          if(m_Debug)
            System.err.println("Old rule: "+
                         oldRule.toString(m_Class)
                         +" |dl = "+oldDL); 
                  
          if(m_Debug)
            System.err.println("\nrepDL: "+repDL+ 
                         "\nrevDL: "+revDL+
                         "\noldDL: "+oldDL);
                  
          if((oldDL <= revDL) && (oldDL <= repDL))
            finalRule = oldRule; // Old the best
          else if(revDL <= repDL)
            finalRule = revision; // Revision the best
          else
            finalRule = replace; // Replace the best  
        }         
                
        finalRulesetStat.addAndUpdate(finalRule);      
        rst = finalRulesetStat.getSimpleStats(position);
                
        if(isResidual){
                  
          dl += finalRulesetStat.relativeDL(position, 
                                    expFPRate,
                                    m_CheckErr);
          if(m_Debug)
            System.err.println("After optimization: the dl"
                         +"="+dl+" | best: "+minDL);
                  
          if(dl < minDL)
            minDL = dl;  // The best dl so far
                  
          stop = checkStop(rst, minDL, dl);
          if(!stop)
            ruleset.addElement(finalRule); // Accepted 
          else{
            finalRulesetStat.removeLast(); // Remove last to be re-used
            position--;
          }
        }
        else
          ruleset.setElementAt(finalRule, position); // Accepted 

        if(m_Debug){
          System.err.println("The rule covers: "+rst[0]+
                         " | pos = " + rst[2] + 
                         " | neg = " + rst[4]+
                         "\nThe rule doesn't cover: "+rst[1]+
                         " | pos = " + rst[5]);       
          System.err.println("\nRuleset so far: ");
          for(int x=0; x<ruleset.size(); x++)
            System.err.println(x+": "+((RipperRule)ruleset.elementAt(x)).toString(m_Class));
          System.err.println();
        }
                
        //Data not covered    
        if(finalRulesetStat.getRulesetSize() > 0)// If any rules  
          newData = finalRulesetStat.getFiltered(position)[1]; 
        hasPositive = Utils.gr(rst[5], 0.0); //Positives remaining? 
        position++;
      } // while !stop && hasPositive
            
      if(ruleset.size() > (position+1)){ // Hasn't gone through yet
        for(int k=position+1; k<ruleset.size(); k++)
          finalRulesetStat.addAndUpdate((Rule)ruleset.elementAt(k));
      }
      if(m_Debug)
        System.err.println("\nDeleting rules to decrease"+
                       " DL of the whole ruleset ..."); 
      finalRulesetStat.reduceDL(expFPRate, m_CheckErr);
      if(m_Debug){
        int del = ruleset.size() -
          finalRulesetStat.getRulesetSize(); 
        System.err.println(del+" rules are deleted"+
                       " after DL reduction procedure");
      }
      ruleset = finalRulesetStat.getRuleset();
      rstats = finalRulesetStat;                    
            
      } // For each run of optimization
    } // if pruning is used
      
    // Concatenate the ruleset for this class to the whole ruleset
    if(m_Debug){
      System.err.println("\nFinal ruleset: ");
      for(int x=0; x<ruleset.size(); x++)
      System.err.println(x+": "+((RipperRule)ruleset.elementAt(x)).toString(m_Class));
      System.err.println();
    }
      
    m_Ruleset.appendElements(ruleset);
    m_RulesetStats.addElement(rstats);
      
    if(ruleset.size() > 0)// If any rules for this class
      return rstats.getFiltered(ruleset.size()-1)[1]; // Data not 
    else                                                // covered
      return data; 
  }   
    
  /**
   * Check whether the stopping criterion meets
   *
   * @param rst the statistic of the ruleset
   * @param minDL the min description length so far
   * @param dl the current description length of the ruleset
   * @return true if stop criterion meets, false otherwise
   */
01979   private boolean checkStop(double[] rst, double minDL, double dl){
      
    if(dl > minDL+MAX_DL_SURPLUS){
      if(m_Debug)
      System.err.println("DL too large: "+dl+" | "+minDL);
      return true;
    }
    else if(!Utils.gr(rst[2], 0.0)){// Covered positives
      if(m_Debug)
      System.err.println("Too few positives.");
      return true;
    } 
    else if((rst[4]/rst[0]) >= 0.5){// Err rate
      if(m_CheckErr){
      if(m_Debug)
        System.err.println("Error too large: "+
                       rst[4] + "/" + rst[0]);
      return  true;
      }
      else
      return false;
    }       
    else{// Not stops
      if(m_Debug)
      System.err.println("Continue.");
      return  false;
    }                   
  }
 
  /**
   * Prints the all the rules of the rule learner.
   *
   * @return a textual description of the classifier
   */
02013   public String toString() {
    if (m_Ruleset == null) 
      return "JRIP: No model built yet.";
      
    StringBuffer sb = new StringBuffer("JRIP rules:\n"+
                               "===========\n\n"); 
    for(int j=0; j<m_RulesetStats.size(); j++){
      RuleStats rs = (RuleStats)m_RulesetStats.elementAt(j);
      FastVector rules = rs.getRuleset();
      for(int k=0; k<rules.size(); k++){
      double[] simStats = rs.getSimpleStats(k);
      sb.append(((RipperRule)rules.elementAt(k)).toString(m_Class)
              + " ("+simStats[0]+"/"+simStats[4]+")\n");
      }                     
    }
    if(m_Debug){
      System.err.println("Inside m_Ruleset");
      for(int i=0; i<m_Ruleset.size(); i++)
      System.err.println(((RipperRule)m_Ruleset.elementAt(i)).toString(m_Class));
    }
    sb.append("\nNumber of Rules : " 
            + m_Ruleset.size() + "\n");
    return sb.toString();
  }
  
  /**
   * Returns the revision string.
   * 
   * @return            the revision
   */
02043   public String getRevision() {
    return RevisionUtils.extract("$Revision: 1.22 $");
  }
    
  /**
   * Main method.
   *
   * @param args the options for the classifier
   */
02052   public static void main(String[] args) {      
    runClassifier(new JRip(), args);
  } 
}

Generated by  Doxygen 1.6.0   Back to index