Logo Search packages:      
Sourcecode: weka version File versions

XMeans.java

/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 *    XMeans.java
 *    Copyright (C) 2000 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.clusterers;

import weka.core.AlgVector;
import weka.core.Capabilities;
import weka.core.DistanceFunction;
import weka.core.EuclideanDistance;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.RevisionUtils;
import weka.core.neighboursearch.KDTree;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.TechnicalInformation;
import weka.core.TechnicalInformationHandler;
import weka.core.Utils;
import weka.core.Capabilities.Capability;
import weka.core.TechnicalInformation.Field;
import weka.core.TechnicalInformation.Type;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.ReplaceMissingValues;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.PrintWriter;
import java.io.Reader;
import java.util.Enumeration;
import java.util.Random;
import java.util.Vector;

/**
 <!-- globalinfo-start -->
 * Cluster data using the X-means algorithm.<br/>
 * <br/>
 * X-Means is K-Means extended by an Improve-Structure part In this part of the algorithm the centers are attempted to be split in its region. The decision between the children of each center and itself is done comparing the BIC-values of the two structures.<br/>
 * <br/>
 * For more information see:<br/>
 * <br/>
 * Dan Pelleg, Andrew W. Moore: X-means: Extending K-means with Efficient Estimation of the Number of Clusters. In: Seventeenth International Conference on Machine Learning, 727-734, 2000.
 * <p/>
 <!-- globalinfo-end -->
 *
 <!-- technical-bibtex-start -->
 * BibTeX:
 * <pre>
 * &#64;inproceedings{Pelleg2000,
 *    author = {Dan Pelleg and Andrew W. Moore},
 *    booktitle = {Seventeenth International Conference on Machine Learning},
 *    pages = {727-734},
 *    publisher = {Morgan Kaufmann},
 *    title = {X-means: Extending K-means with Efficient Estimation of the Number of Clusters},
 *    year = {2000}
 * }
 * </pre>
 * <p/>
 <!-- technical-bibtex-end -->
 *
 <!-- options-start -->
 * Valid options are: <p/>
 * 
 * <pre> -I &lt;num&gt;
 *  maximum number of overall iterations
 *  (default 1).</pre>
 * 
 * <pre> -M &lt;num&gt;
 *  maximum number of iterations in the kMeans loop in
 *  the Improve-Parameter part 
 *  (default 1000).</pre>
 * 
 * <pre> -J &lt;num&gt;
 *  maximum number of iterations in the kMeans loop
 *  for the splitted centroids in the Improve-Structure part 
 *  (default 1000).</pre>
 * 
 * <pre> -L &lt;num&gt;
 *  minimum number of clusters
 *  (default 2).</pre>
 * 
 * <pre> -H &lt;num&gt;
 *  maximum number of clusters
 *  (default 4).</pre>
 * 
 * <pre> -B &lt;value&gt;
 *  distance value for binary attributes
 *  (default 1.0).</pre>
 * 
 * <pre> -use-kdtree
 *  Uses the KDTree internally
 *  (default no).</pre>
 * 
 * <pre> -K &lt;KDTree class specification&gt;
 *  Full class name of KDTree class to use, followed
 *  by scheme options.
 *  eg: "weka.core.neighboursearch.kdtrees.KDTree -P"
 *  (default no KDTree class used).</pre>
 * 
 * <pre> -C &lt;value&gt;
 *  cutoff factor, takes the given percentage of the splitted 
 *  centroids if none of the children win
 *  (default 0.0).</pre>
 * 
 * <pre> -D &lt;distance function class specification&gt;
 *  Full class name of Distance function class to use, followed
 *  by scheme options.
 *  (default weka.core.EuclideanDistance).</pre>
 * 
 * <pre> -N &lt;file name&gt;
 *  file to read starting centers from (ARFF format).</pre>
 * 
 * <pre> -O &lt;file name&gt;
 *  file to write centers to (ARFF format).</pre>
 * 
 * <pre> -U &lt;int&gt;
 *  The debug level.
 *  (default 0)</pre>
 * 
 * <pre> -Y &lt;file name&gt;
 *  The debug vectors file.</pre>
 * 
 * <pre> -S &lt;num&gt;
 *  Random number seed.
 *  (default 10)</pre>
 * 
 <!-- options-end -->
 *
 * @author Gabi Schmidberger (gabi@cs.waikato.ac.nz)
 * @author Mark Hall (mhall@cs.waikato.ac.nz)
 * @author Malcolm Ware (mfw4@cs.waikato.ac.nz)
 * @version $Revision: 1.24 $
 * @see RandomizableClusterer
 */
00155 public class XMeans 
  extends RandomizableClusterer
  implements TechnicalInformationHandler {

  /*
   * major TODOS:
   *
   * make BIC-Score replaceable by other scores
   */

  /** for serialization. */
00166   private static final long serialVersionUID = -7941793078404132616L;
  
  /** training instances. */
00169   protected Instances m_Instances = null;

  /** model information, should increase readability. */
00172   protected Instances m_Model = null;
  
  /** replace missing values in training instances. */
00175   protected ReplaceMissingValues m_ReplaceMissingFilter;

  /**
   * Distance value between true and false of binary attributes and 
   * "same" and "different" of nominal attributes (default = 1.0).
   */
00181   protected double m_BinValue = 1.0;

  /** BIC-Score of the current model. */
00184   protected double m_Bic = Double.MIN_VALUE;

  /** Distortion.  */
00187   protected double[] m_Mle = null;

  /** maximum overall iterations. */
00190   protected int m_MaxIterations = 1;

  /**
   * maximum iterations to perform Kmeans part 
   * if negative, iterations are not checked.
   */
00196   protected int m_MaxKMeans = 1000;

  /** see above, but for kMeans of splitted clusters.
   */
00200   protected int m_MaxKMeansForChildren = 1000;

  /** The actual number of clusters. */
00203   protected int m_NumClusters = 2;

  /** min number of clusters to generate. */
00206   protected int m_MinNumClusters = 2;

  /** max number of clusters to generate. */
00209   protected int m_MaxNumClusters = 4;

  /** the distance function used. */
00212   protected DistanceFunction m_DistanceF = new EuclideanDistance();

  /** cluster centers. */
00215   protected Instances m_ClusterCenters;

  /** file name of the output file for the cluster centers. */
00218   protected File m_InputCenterFile = new File(System.getProperty("user.dir"));

  /* --> DebugVectors - USED FOR DEBUGGING */
  /** input file for the random vectors --> USED FOR DEBUGGING. */
00222   protected Reader m_DebugVectorsInput = null;
  /** the index for the current debug vector. */
00224   protected int m_DebugVectorsIndex = 0;
  /** all the debug vectors. */
00226   protected Instances m_DebugVectors = null;

  /** file name of the input file for the random vectors. */
00229   protected File m_DebugVectorsFile = new File(System.getProperty("user.dir"));

  /** input file for the cluster centers. */
00232   protected Reader m_CenterInput = null;
    
  /** file name of the output file for the cluster centers. */
00235   protected File m_OutputCenterFile = new File(System.getProperty("user.dir"));
  
  /** output file for the cluster centers. */
00238   protected PrintWriter m_CenterOutput = null;
    
  /**
   * temporary variable holding cluster assignments while iterating.
   */
00243   protected int[] m_ClusterAssignments;

  /** cutoff factor - percentage of splits done in Improve-Structure part
     only relevant, if all children lost. */ 
00247   protected double m_CutOffFactor = 0.5;

  /** Index in ranges for LOW. */
00250   public static int R_LOW = 0;
  /** Index in ranges for HIGH. */
00252   public static int R_HIGH = 1;
  /** Index in ranges for WIDTH. */
00254   public static int R_WIDTH = 2;

  /**
   * KDTrees class if KDTrees are used.
   */
00259   protected KDTree m_KDTree = new KDTree();
  
  /** whether to use the KDTree (the KDTree is only initialized to be 
   * configurable from the GUI). */
00263   protected boolean m_UseKDTree = false;

  /** counts iterations done in main loop. */
00266   protected int m_IterationCount = 0;

  /** counter to say how often kMeans was stopped by loop counter. */
00269   protected int m_KMeansStopped = 0;

  /** Number of splits prepared. */
00272   protected int m_NumSplits = 0;

  /** Number of splits accepted (including cutoff factor decisions). */
00275   protected int m_NumSplitsDone = 0;

  /** Number of splits accepted just because of cutoff factor. */
00278   protected int m_NumSplitsStillDone = 0;

  /**
   * level of debug output, 0 is no output.
   */
00283   protected int m_DebugLevel = 0;
  
  /** print the centers. */
00286   public static int D_PRINTCENTERS = 1;
  /** follows the splitting of the centers. */
00288   public static int D_FOLLOWSPLIT = 2;
  /** have a closer look at converge children. */
00290   public static int D_CONVCHCLOSER = 3;
  /** check on random vectors. */
00292   public static int D_RANDOMVECTOR = 4;
  /** check on kdtree. */
00294   public static int D_KDTREE = 5;
  /** follow iterations. */
00296   public static int D_ITERCOUNT = 6;
  /** functions were maybe misused.  */
00298   public static int D_METH_MISUSE = 80; 
  /** for current debug.  */
00300   public static int D_CURR = 88;
  /** general debugging. */
00302   public static int D_GENERAL = 99;

  /** Flag: I'm debugging. */
00305   public boolean m_CurrDebugFlag = true;

  /**
   * the default constructor.
   */
00310   public XMeans() {
    super();
    
    m_SeedDefault = 10;
    setSeed(m_SeedDefault);
  }
  
  /**
   * Returns a string describing this clusterer.
   * @return a description of the evaluator suitable for
   * displaying in the explorer/experimenter gui
   */
00322   public String globalInfo() {
    return 
        "Cluster data using the X-means algorithm.\n\n" 
      + "X-Means is K-Means extended by an Improve-Structure part In this "
      + "part of the algorithm the centers are attempted to be split in "
      + "its region. The decision between the children of each center and "
      + "itself is done comparing the BIC-values of the two structures.\n\n"
      + "For more information see:\n\n"
      + getTechnicalInformation().toString();
  }

  /**
   * Returns an instance of a TechnicalInformation object, containing 
   * detailed information about the technical background of this class,
   * e.g., paper reference or book this class is based on.
   * 
   * @return the technical information about this class
   */
00340   public TechnicalInformation getTechnicalInformation() {
    TechnicalInformation      result;
    
    result = new TechnicalInformation(Type.INPROCEEDINGS);
    result.setValue(Field.AUTHOR, "Dan Pelleg and Andrew W. Moore");
    result.setValue(Field.TITLE, "X-means: Extending K-means with Efficient Estimation of the Number of Clusters");
    result.setValue(Field.BOOKTITLE, "Seventeenth International Conference on Machine Learning");
    result.setValue(Field.YEAR, "2000");
    result.setValue(Field.PAGES, "727-734");
    result.setValue(Field.PUBLISHER, "Morgan Kaufmann");
    
    return result;
  }

  /**
   * Returns default capabilities of the clusterer.
   *
   * @return      the capabilities of this clusterer
   */
00359   public Capabilities getCapabilities() {
    Capabilities result = super.getCapabilities();

    // attributes
    result.enable(Capability.NUMERIC_ATTRIBUTES);
    result.enable(Capability.DATE_ATTRIBUTES);
    result.enable(Capability.MISSING_VALUES);

    return result;
  }
 
  /**
   * Generates the X-Means clusterer. 
   *
   * @param data set of instances serving as training data 
   * @throws Exception if the clusterer has not been 
   * generated successfully
   */
00377   public void buildClusterer(Instances data) throws Exception {

    // can clusterer handle the data?
    getCapabilities().testWithFail(data);

    m_NumSplits = 0;
    m_NumSplitsDone = 0;
    m_NumSplitsStillDone = 0;

    // replace missing values
    m_ReplaceMissingFilter = new ReplaceMissingValues();
    m_ReplaceMissingFilter.setInputFormat(data);
    m_Instances = Filter.useFilter(data, m_ReplaceMissingFilter);
    
    // initialize random function
    Random random0 = new Random(m_Seed);

    // num of clusters to start with
    m_NumClusters =  m_MinNumClusters;

    // set distance function to default
    if (m_DistanceF == null) {
      m_DistanceF = new EuclideanDistance();
    }

    m_DistanceF.setInstances(m_Instances);
    checkInstances();

    if (m_DebugVectorsFile.exists() && m_DebugVectorsFile.isFile())
      initDebugVectorsInput();

    // make list of indexes for m_Instances
    int[] allInstList = new int[m_Instances.numInstances()]; 
    for (int i = 0; i < m_Instances.numInstances(); i++) {
      allInstList[i] = i;
    }
    
    // set model used (just for convenience)
    m_Model = new Instances(m_Instances, 0);

    // produce the starting centers
    if (m_CenterInput != null) {
      // read centers from file
      m_ClusterCenters = new Instances(m_CenterInput);
      m_NumClusters = m_ClusterCenters.numInstances();
    }
    else
      // makes the first centers randomly
      m_ClusterCenters = makeCentersRandomly(random0,
                                   m_Instances, m_NumClusters);
    PFD(D_FOLLOWSPLIT, "\n*** Starting centers ");
    for (int k = 0; k < m_ClusterCenters.numInstances(); k++) {
      PFD(D_FOLLOWSPLIT, "Center " + k + ": " + m_ClusterCenters.instance(k));
    }

    PrCentersFD(D_PRINTCENTERS);

    boolean finished = false;
    Instances children; 

    // builds up a KDTree
    if (m_UseKDTree)
      m_KDTree.setInstances(m_Instances);
  
    // loop counter of main loop
    m_IterationCount = 0;

    /**
     * "finished" does get true as soon as:
     * 1. number of clusters gets >= m_MaxClusters, 
     * 2. in the last round, none of the centers have been split
     * 
     * if number of clusters is already >= m_MaxClusters 
     * part 1 (= Improve-Params) is done at least once.
     */
    while (!finished &&
           !stopIteration(m_IterationCount, m_MaxIterations)) {
      
      /* ====================================================================
       * 1. Improve-Params                  
       *    conventional K-means
       */


      PFD(D_FOLLOWSPLIT, "\nBeginning of main loop - centers:");
      PrCentersFD(D_FOLLOWSPLIT);

      PFD(D_ITERCOUNT, "\n*** 1. Improve-Params " + m_IterationCount + 
        ". time");
      m_IterationCount++;

      // prepare to converge
      boolean converged = false;

      // initialize assignments to -1
      m_ClusterAssignments = initAssignments(m_Instances.numInstances());
      // stores a list of indexes of instances belonging to each center
      int[][] instOfCent = new int[m_ClusterCenters.numInstances()][];

      // KMeans loop counter
      int kMeansIteration = 0;

      // converge in conventional K-means ----------------------------------
      PFD(D_FOLLOWSPLIT, "\nConverge in K-Means:");
      while (!converged && 
           !stopKMeansIteration(kMeansIteration, m_MaxKMeans)) {
      
      kMeansIteration++;
      converged = true;
      
        // assign instances to centers -------------------------------------
        converged = assignToCenters(m_UseKDTree ? m_KDTree : null,
                            m_ClusterCenters, 
                            instOfCent,
                            allInstList, 
                            m_ClusterAssignments,
                            kMeansIteration);
      
      PFD(D_FOLLOWSPLIT, "\nMain loop - Assign - centers:");
      PrCentersFD(D_FOLLOWSPLIT);
      // compute new centers = centers of mass of points
        converged = recomputeCenters(m_ClusterCenters, // clusters
                             instOfCent,       // their instances
                             m_Model);         // model information
      PFD(D_FOLLOWSPLIT, "\nMain loop - Recompute - centers:");
      PrCentersFD(D_FOLLOWSPLIT);
      }
      PFD(D_FOLLOWSPLIT, "");
      PFD(D_FOLLOWSPLIT, "End of Part: 1. Improve-Params - conventional K-means");

      /** =====================================================================
       * 2. Improve-Structur
       */

      // BIC before split distortioning the centres
      m_Mle = distortion(instOfCent, m_ClusterCenters);
      m_Bic = calculateBIC(instOfCent, m_ClusterCenters, m_Mle);
      PFD(D_FOLLOWSPLIT, "m_Bic " + m_Bic);

      int currNumCent = m_ClusterCenters.numInstances();
      Instances splitCenters = new Instances(m_ClusterCenters, 
                                   currNumCent * 2);
      
      // store BIC values of parent and children
      double[] pbic = new double [currNumCent];
      double[] cbic = new double [currNumCent];
            
      // split each center
      for (int i = 0; i < currNumCent 
         // this could help to optimize the algorithm
         //      && currNumCent + numSplits <= m_MaxNumClusters
           ; 
         i++) {
      
      PFD(D_FOLLOWSPLIT, "\nsplit center " + i +
                  " " + m_ClusterCenters.instance(i));
      Instance currCenter = m_ClusterCenters.instance(i);
      int[] currInstList = instOfCent[i];
      int currNumInst = instOfCent[i].length;
      
      // not enough instances; than continue with next
      if (currNumInst <= 2) {
        pbic[i] = Double.MAX_VALUE;
        cbic[i] = 0.0;
        // add center itself as dummy
        splitCenters.add(currCenter);
        splitCenters.add(currCenter);
        continue;
      }
      
      // split centers  ----------------------------------------------
      double variance = m_Mle[i] / (double)currNumInst;
      children = splitCenter(random0, currCenter, variance, m_Model);
      
      // initialize assignments to -1
      int[] oneCentAssignments = initAssignments(currNumInst);
      int[][] instOfChCent = new int [2][]; // todo maybe split didn't work
      
      // converge the children  --------------------------------------
      converged = false;
      int kMeansForChildrenIteration = 0;
      PFD(D_FOLLOWSPLIT, "\nConverge, K-Means for children: " + i);
      while (!converged && 
          !stopKMeansIteration(kMeansForChildrenIteration, 
                         m_MaxKMeansForChildren)) {
        kMeansForChildrenIteration++;
        
        converged =
          assignToCenters(children, instOfChCent,
                      currInstList, oneCentAssignments);

        if (!converged) {       
          recomputeCentersFast(children, instOfChCent, m_Model);
        }
      } 

      // store new centers for later decision if they are taken
      splitCenters.add(children.instance(0));
      splitCenters.add(children.instance(1));

      PFD(D_FOLLOWSPLIT, "\nconverged cildren ");
      PFD(D_FOLLOWSPLIT, " " + children.instance(0));
      PFD(D_FOLLOWSPLIT, " " + children.instance(1));

      // compare parent and children model by their BIC-value
      pbic[i] = calculateBIC(currInstList, currCenter,  m_Mle[i], m_Model);
      double[] chMLE = distortion(instOfChCent, children);
      cbic[i] = calculateBIC(instOfChCent, children, chMLE);

      } // end of loop over clusters

      // decide which one to split and make new list of cluster centers
      Instances newClusterCenters = null;
      newClusterCenters = newCentersAfterSplit(pbic, cbic, m_CutOffFactor,
                                                 splitCenters);
      /**
       * Compare with before Improve-Structure
       */
      int newNumClusters = newClusterCenters.numInstances();
      if (newNumClusters != m_NumClusters) {
      
      PFD(D_FOLLOWSPLIT, "Compare with non-split");

      // initialize assignments to -1
      int[] newClusterAssignments = 
        initAssignments(m_Instances.numInstances());
      
      // stores a list of indexes of instances belonging to each center
      int[][] newInstOfCent = new int[newClusterCenters.numInstances()][];
      
      // assign instances to centers -------------------------------------
      converged = assignToCenters(m_UseKDTree ? m_KDTree : null,
                            newClusterCenters, 
                            newInstOfCent,
                            allInstList, 
                            newClusterAssignments,
                            m_IterationCount);
      
      double[] newMle = distortion(newInstOfCent, newClusterCenters);
      double newBic = calculateBIC(newInstOfCent, newClusterCenters, newMle);
      PFD(D_FOLLOWSPLIT, "newBic " + newBic);
      if (newBic > m_Bic) {
          PFD(D_FOLLOWSPLIT, "*** decide for new clusters");
        m_Bic = newBic;
        m_ClusterCenters = newClusterCenters;
        m_ClusterAssignments = newClusterAssignments;
      } else {
          PFD(D_FOLLOWSPLIT, "*** keep old clusters");
        }
      }

      newNumClusters = m_ClusterCenters.numInstances();
      // decide if finished: max num cluster reached 
      // or last centers where not split at all 
      if ((newNumClusters >= m_MaxNumClusters) 
        || (newNumClusters == m_NumClusters)) {
      finished = true;
      }
      m_NumClusters = newNumClusters;
    }
  }

  /**
   * Checks for nominal attributes in the dataset.
   * Class attribute is ignored.
   * @param data the data to check
   * @return false if no nominal attributes are present
   */
00645   public boolean checkForNominalAttributes(Instances data) {

    int i = 0;
    while (i < data.numAttributes()) {
      if ((i != data.classIndex()) && data.attribute(i++).isNominal()) {
      return true;
      }
    }
    return false;
  }

  /**
   * Set array of int, used to store assignments, to -1.
   * @param ass integer array used for storing assignments
   * @return integer array used for storing assignments
   */
00661   protected int[] initAssignments(int[] ass) {
    for (int i = 0; i < ass.length; i++)
      ass[i] = -1;
    return ass;
  }    
 
  /**
   * Creates and initializes integer array, used to store assignments.
   * @param numInstances length of array used for assignments
   * @return integer array used for storing assignments
   */
00672   protected int[] initAssignments(int numInstances) {
    int[] ass = new int[numInstances];
    for (int i = 0; i < numInstances; i++)
      ass[i] = -1;
    return ass;
  }    
  
  /**
   * Creates and initializes boolean array.
   * @param len length of new array
   * @return the new array
   */
00684   boolean[] initBoolArray(int len) {
    boolean[] boolArray = new boolean [len];
    for (int i = 0; i < len; i++) {
      boolArray[i] = false;
    }
    return boolArray;
  }

  /**
   * Returns new center list.
   *
   * The following steps 1. and 2. both take care that the number of centers
   * does not exceed maxCenters.
   *
   * 1. Compare BIC values of parent and children and takes the one as
   * new centers which do win (= BIC-value is smaller).
   *
   * 2. If in 1. none of the children are chosen 
   *    && and cutoff factor is > 0
   * cutoff factor is taken as the percentage of "best" centers that are
   * still taken.
   * @param pbic array of parents BIC-values
   * @param cbic array of childrens BIC-values
   * @param cutoffFactor cutoff factor 
   * @param splitCenters all children 
   * @return the new centers
   */
00711   protected Instances newCentersAfterSplit(double[] pbic, 
                               double[] cbic,
                               double cutoffFactor,
                               Instances splitCenters) {

    // store if split won
    boolean splitPerCutoff = false;
    boolean takeSomeAway = false;
    boolean[] splitWon = initBoolArray(m_ClusterCenters.numInstances());
    int numToSplit = 0;
    Instances newCenters = null;
    
    // how many would be split, because the children have a better bic value
    for (int i = 0; i < cbic.length; i++) {
      if (cbic[i] > pbic[i]) {
      // decide for splitting ----------------------------------------
      splitWon[i] = true; numToSplit++;
      PFD(D_FOLLOWSPLIT, "Center " + i + " decide for children");
      }
      else {
      // decide for parents and finished stays true  -----------------
      PFD(D_FOLLOWSPLIT, "Center " + i + " decide for parent");
      }
    }

    // no splits yet so split per cutoff factor
    if ((numToSplit == 0) && (cutoffFactor > 0)) {
      splitPerCutoff = true;
      
      // how many to split per cutoff factor
      numToSplit = (int) 
        ((double) m_ClusterCenters.numInstances() * m_CutOffFactor); 
    }

    // prepare indexes of values in ascending order  
    double[] diff = new double [m_NumClusters];
    for (int j = 0; j < diff.length; j++) {
      diff[j] = pbic[j] - cbic[j];
    }    
    int[] sortOrder = Utils.sort(diff);
    
    // check if maxNumClusters would be exceeded
    int possibleToSplit = m_MaxNumClusters - m_NumClusters; 

    if (possibleToSplit > numToSplit) {
      // still enough possible, do the whole amount
      possibleToSplit = numToSplit;
    }
    else
      takeSomeAway = true;

    // prepare for splitting the one that are supposed to be split
    if (splitPerCutoff) {
      for (int j = 0; (j < possibleToSplit) && (cbic[sortOrder[j]] > 0.0);
         j++) {
      splitWon[sortOrder[j]] = true;
      }
      m_NumSplitsStillDone += possibleToSplit;
    } 
    else {
      // take some splits away if max number of clusters would be exceeded
      if (takeSomeAway) {
      int count = 0;
      int j = 0;
      for (;j < splitWon.length && count < possibleToSplit; j++){
        if (splitWon[sortOrder[j]] == true) count++;
      }
      
      while (j < splitWon.length) {
        splitWon[sortOrder[j]] = false;
        j++;
      }
      }
    }
   
    // finally split
    if (possibleToSplit > 0) 
      newCenters = newCentersAfterSplit(splitWon, splitCenters);
    else
      newCenters = m_ClusterCenters;
    return newCenters;
  }

  /**
   * Returns new centers. Depending on splitWon: if true takes children, if
   * false takes parent = current center.
   * 
   * @param splitWon
   *          array of boolean to indicate to take split or not
   * @param splitCenters
   *          list of splitted centers
   * @return the new centers
   */
00804   protected Instances newCentersAfterSplit(boolean[] splitWon,
      Instances splitCenters) {
    Instances newCenters = new Instances(splitCenters, 0);

    int sIndex = 0;
    for (int i = 0; i < splitWon.length; i++) {
      if (splitWon[i]) {
        m_NumSplitsDone++;
        newCenters.add(splitCenters.instance(sIndex++));
        newCenters.add(splitCenters.instance(sIndex++));
      } else {
        sIndex++;
        sIndex++;
        newCenters.add(m_ClusterCenters.instance(i));
      }
    }
    return newCenters;
  }

  /**
   * Controls that counter does not exceed max iteration value. Special function
   * for kmeans iterations.
   * 
   * @param iterationCount
   *          current value of counter
   * @param max
   *          maximum value for counter
   * @return true if iteration should be stopped
   */ 
00833   protected boolean stopKMeansIteration(int iterationCount, int max) {
    boolean stopIterate = false;
    if (max >= 0) 
      stopIterate = (iterationCount >= max);
    if (stopIterate) 
      m_KMeansStopped++;
    return stopIterate;
  }

  /**
   * Checks if iterationCount has to be checked and if yes
   * (this means max is > 0) compares it with max.
   * 
   * @param iterationCount the current iteration count
   * @param max the maximum number of iterations
   * @return true if maximum has been reached
   */ 
00850   protected boolean stopIteration(int iterationCount, int max) {
    boolean stopIterate = false;
    if (max >= 0) 
      stopIterate = (iterationCount >= max);
    return stopIterate;
  }

  /**
   * Recompute the new centers. New cluster center is center of mass of its 
   * instances. Returns true if cluster stays the same.
   * @param centers the input and output centers
   * @param instOfCent the instances to the centers 
   * @param model data model information
   * @return true if converged.
   */
00865    protected boolean recomputeCenters(Instances centers,          
                           int[][] instOfCent, 
                           Instances model) {
    boolean converged = true;
    
    for (int i = 0; i < centers.numInstances(); i++) {
      double val;
      for (int j = 0; j < model.numAttributes(); j++) {
      val = meanOrMode(m_Instances, instOfCent[i], j);

      for (int k = 0; k < instOfCent[i].length; k++)

      if (converged && m_ClusterCenters.instance(i).value(j) != val) 
        converged = false;
      if (!converged)
        m_ClusterCenters.instance(i).setValue(j, val);
      }
    }
    return converged;
  }

  /**
   * Recompute the new centers - 2nd version 
   * Same as recomputeCenters, but does not check if center stays the same.
   * 
   * @param centers the input center and output centers
   * @param instOfCentIndexes the indexes of the instances to the centers 
   * @param model data model information
   */
00894   protected void recomputeCentersFast(Instances centers,          
                            int[][] instOfCentIndexes, 
                            Instances model   
                            ) {
    for (int i = 0; i < centers.numInstances(); i++) {
      double val;
      for (int j = 0; j < model.numAttributes(); j++) {
      val = meanOrMode(m_Instances, instOfCentIndexes[i], j);
      centers.instance(i).setValue(j, val);
      }
    }
  }

  /**
   * Computes Mean Or Mode of one attribute on a subset of m_Instances. 
   * The subset is defined by an index list.
   * @param instances all instances
   * @param instList the indexes of the instances the mean is computed from
   * @param attIndex the index of the attribute
   * @return mean value
   */
00915   protected double meanOrMode(Instances instances, 
                      int[] instList, 
                      int attIndex) {
    double result, found;
    int[] counts;
    int numInst = instList.length;

    if (instances.attribute(attIndex).isNumeric()) {
      result = found = 0;
      for (int j = 0; j < numInst; j++) {
      Instance currInst = instances.instance(instList[j]);
      if (!currInst.isMissing(attIndex)) {
        found += currInst.weight();
        result += currInst.weight() * 
          currInst.value(attIndex);
      }
      }
      if (Utils.eq(found, 0)) {
      return 0;
      } else {
      return result / found;
      }
    } else if (instances.attribute(attIndex).isNominal()) {
      counts = new int[instances.attribute(attIndex).numValues()];
      for (int j = 0; j < numInst; j++) {
      Instance currInst = instances.instance(instList[j]);
      if (!currInst.isMissing(attIndex)) {
        counts[(int) currInst.value(attIndex)] += currInst.weight();
      }
      }
      return (double)Utils.maxIndex(counts);
    } else {
      return 0;
    }
  }

 
  /**
   * Assigns instances to centers.
   *
   * @param tree KDTree on all instances
   * @param centers all the input centers
   * @param instOfCent the instances to each center
   * @param allInstList list of all instances
   * @param assignments assignments of instances to centers
   * @param iterationCount the number of iteration 
   * @return true if converged
   * @throws Exception is something goes wrong
   */
00964   protected boolean assignToCenters(KDTree tree,
                                  Instances centers, 
                          int[][] instOfCent, 
                                  int[] allInstList,
                          int[] assignments,
                                  int iterationCount) throws Exception {
    
    boolean converged = true;
    if (tree != null) {
      // using KDTree structure for assigning
      converged = assignToCenters(tree,
                          centers, 
                          instOfCent,
                          assignments,
                          iterationCount);
    } else {
      converged = assignToCenters(centers, 
                          instOfCent,
                          allInstList, 
                          assignments);
    }
    return converged;
  }

  /**
   * Assign instances to centers using KDtree.
   * First part of conventionell K-Means, returns true if new assignment
   * is the same as the last one.
   *
   * @param kdtree KDTree on all instances
   * @param centers all the input centers
   * @param instOfCent the instances to each center
   * @param assignments assignments of instances to centers
   * @param iterationCount the number of iteration 
   * @return true if converged
   * @throws Exception in case instances are not assigned to cluster
   */
01001   protected boolean assignToCenters(KDTree kdtree,
                          Instances centers, 
                          int[][] instOfCent, 
                          int[] assignments,
                                  int iterationCount) throws Exception {

    int numCent = centers.numInstances();
    int numInst = m_Instances.numInstances(); 
    int[] oldAssignments = new int[numInst];
    
    // WARNING:  assignments is "input/output-parameter"
    // should not be null
    if (assignments == null) {
      assignments = new int[numInst];
      for (int i = 0; i < numInst; i++) {
      assignments[0] = -1;
      }
    }
    
    // WARNING:  instOfCent is "input/output-parameter"
    // should not be null
    if (instOfCent == null) {
      instOfCent = new int [numCent][];
    }
    
    // save old assignments
    for (int i = 0; i < assignments.length; i++) {
      oldAssignments[i] = assignments[i];
    }
    
    // use tree to get new assignments
    kdtree.centerInstances(centers, assignments,
                     Math.pow(.8, iterationCount));   
    boolean converged = true;
  
    // compare with previous assignment
    for (int i = 0; converged && (i < assignments.length); i++) {
      converged = (oldAssignments[i] == assignments[i]);
      if (assignments[i] == -1) 
      throw new Exception("Instance " + i + 
                      " has not been assigned to cluster.");
    }

    if (!converged) {
      int[] numInstOfCent = new int[numCent];
      for (int i = 0; i < numCent; i++)
      numInstOfCent[i] = 0;

      // count num of assignments per center
      for (int i = 0; i < numInst; i++)
      numInstOfCent[assignments[i]]++;
      
      // prepare instancelists per center
      for (int i = 0; i < numCent; i++){
      instOfCent[i] = new int[numInstOfCent[i]];
      }
      // write instance lists per center
      for (int i = 0; i < numCent; i++) {
      int index = -1;   
      for (int j = 0; j < numInstOfCent[i]; j++) {
        index = nextAssignedOne(i, index, assignments);
        instOfCent[i][j] = index;
      }
      }
    }
  
    return converged;
  }

  /**
   * Assign instances to centers.
   * Part of conventionell K-Means, returns true if new assignment
   * is the same as the last one.
   *
   * @param centers all the input centers
   * @param instOfCent the instances to each center
   * @param allInstList list of all indexes
   * @param assignments assignments of instances to centers
   * @return true if converged
   * @throws Exception if something goes wrong
   */
01082   protected boolean assignToCenters(Instances centers, 
                          int[][] instOfCent,
                          int[] allInstList,
                          int[] assignments) 
    throws Exception {
    
    // todo: undecided situations
    boolean converged = true; // true if new assignment is the same 
                              // as the old one

    int numInst = allInstList.length; 
    int numCent = centers.numInstances();
    int[] numInstOfCent = new int [numCent];
    for (int i = 0; i < numCent; i++) numInstOfCent[i] = 0;

    // WARNING:  assignments is "input/output-parameter"
    // should not be null
    if (assignments == null) {
      assignments = new int[numInst];
      for (int i = 0; i < numInst; i++) {
      assignments[i] = -1;
      }
    }

    // WARNING: instOfCent is "input/output-parameter"
    // should not be null
    if (instOfCent == null) {
      instOfCent = new int [numCent][];
    }

    // set assignments
    for (int i = 0; i < numInst; i++) {
      Instance inst = m_Instances.instance(allInstList[i]);
      int newC = clusterProcessedInstance(inst, centers);
      
      if (converged && newC != assignments[i]) {
      converged = false;
      }

      numInstOfCent[newC]++; 
      if (!converged)
      assignments[i] = newC;
    }

    // the following is only done
    // if assignments are not the same, because too much effort
    if (!converged) {
      PFD(D_FOLLOWSPLIT, "assignToCenters -> it has NOT converged");
      for (int i = 0; i < numCent; i++) {
      instOfCent[i] = new int [numInstOfCent[i]];
      }

      for (int i = 0; i < numCent; i++) {
      int index = -1;   
      for (int j = 0; j < numInstOfCent[i]; j++) {
        index = nextAssignedOne(i, index, assignments);
        instOfCent[i][j] = allInstList[index];
      }
      }
    }
    else
      PFD(D_FOLLOWSPLIT, "assignToCenters -> it has converged");

    return converged;
  }

  /**
   * Searches along the assignment array for the next entry of the center 
   * in question.
   * @param cent index of the center 
   * @param lastIndex index to start searching
   * @param assignments assignments
   * @return index of the instance the center cent is assigned to
   */
01156   protected int nextAssignedOne(int cent, int lastIndex, 
                        int[] assignments) {
    int len = assignments.length;
    int index = lastIndex + 1;
    while (index < len) {
      if (assignments[index] == cent) {
      return (index);
      }
      index++;
    }
    return (-1);
  }

  /**
   * Split centers in their region. Generates random vector of 
   * length = variance and
   * adds and substractsx to cluster vector to get two new clusters.
   * 
   * @param random random function
   * @param center the center that is split here
   * @param variance variance of the cluster 
   * @param model data model valid
   * @return a pair of new centers
   * @throws Exception something in AlgVector goes wrong
   */
01181   protected Instances splitCenter(Random random,
                          Instance center,
                          double variance,
                          Instances model) throws Exception {
    m_NumSplits++;
    AlgVector r = null;
    Instances children = new Instances(model, 2);

    if (m_DebugVectorsFile.exists() && m_DebugVectorsFile.isFile()) {
      Instance nextVector = getNextDebugVectorsInstance(model);
      PFD(D_RANDOMVECTOR, "Random Vector from File " + nextVector);
      r = new AlgVector(nextVector);
    }
    else {
      // random vector of length = variance
      r = new AlgVector(model, random);
    }
    r.changeLength(Math.pow(variance, 0.5));
    PFD(D_RANDOMVECTOR, "random vector *variance "+ r);
    
    // add random vector to center
    AlgVector c = new AlgVector(center);
    AlgVector c2 = (AlgVector) c.clone();
    c = c.add(r);
    Instance newCenter = c.getAsInstance(model, random);
    children.add(newCenter);
    PFD(D_FOLLOWSPLIT, "first child "+ newCenter);
    
    // substract random vector to center
    c2 = c2.substract(r);
    newCenter = c2.getAsInstance(model, random);
    children.add(newCenter);
    PFD(D_FOLLOWSPLIT, "second child "+ newCenter);

    return children;
  }

  /**
   * Split centers in their region.
   * (*Alternative version of splitCenter()*) 
   * 
   * @param random the random number generator
   * @param instances of the region
   * @param model the model for the centers 
   * (should be the same as that of instances)
   * @return a pair of new centers
   */
01228   protected Instances splitCenters(Random random,
                         Instances instances,
                         Instances model) {
    Instances children = new Instances(model, 2);
    int instIndex = Math.abs(random.nextInt()) % instances.numInstances();
    children.add(instances.instance(instIndex));
    int instIndex2 = instIndex;
    int count = 0;
    while ((instIndex2 == instIndex) && count < 10) {
      count++;
      instIndex2 = Math.abs(random.nextInt()) % instances.numInstances();
    }
    children.add(instances.instance(instIndex2));
    
    return children;
  }

  /**
   * Generates new centers randomly. Used for starting centers.
   *
   * @param random0 random number generator
   * @param model data model of the instances
   * @param numClusters number of clusters
   * @return new centers
   */
01253   protected Instances makeCentersRandomly(Random random0,
                              Instances model,
                              int numClusters) {
    Instances clusterCenters = new Instances(model, numClusters);
    m_NumClusters = numClusters;

    // makes the new centers randomly
    for (int i = 0; i < numClusters; i++) {
      int instIndex = Math.abs(random0.nextInt()) % m_Instances.numInstances();
      clusterCenters.add(m_Instances.instance(instIndex));
    }
    return clusterCenters;
  }

  /**
   * Returns the BIC-value for the given center and instances.
   * @param instList The indices of the instances that belong to the center
   * @param center the center.
   * @param mle maximum likelihood
   * @param model the data model
   * @return the BIC value 
   */   
01275   protected double calculateBIC(int[] instList, Instance center,
                        double mle, Instances model) {
    int[][] w1 = new int[1][instList.length];
    for (int i = 0; i < instList.length; i++) {
      w1[0][i] = instList[i];
    }
    double[] m = {mle};
    Instances w2 = new Instances(model, 1);
    w2.add(center);
    return calculateBIC(w1, w2, m);
    }
  
  /**
   * Calculates the BIC for the given set of centers and instances.
   * @param instOfCent The instances that belong to their respective centers
   * @param centers the centers
   * @param mle maximum likelihood
   * @return The BIC for the input.
   */
01294   protected double calculateBIC(int[][] instOfCent, Instances centers,
                        double[] mle) {
    double loglike = 0.0;
    int numInstTotal = 0;
    int numCenters = centers.numInstances();
    int numDimensions = centers.numAttributes();
    int numParameters = (numCenters - 1) + //probabilities
      numCenters * numDimensions + //means
      numCenters; // variance params
    for (int i = 0; i < centers.numInstances(); i++) {
      loglike += logLikelihoodEstimate(instOfCent[i].length, centers.instance(i),
                               mle[i], centers.numInstances() * 2);
      numInstTotal += instOfCent[i].length;
    }
    /* diff
       thats how we did it
    loglike -= ((centers.numAttributes() + 1.0) * centers.numInstances() * 1)
      * Math.log(count);
      */
    loglike -= numInstTotal * Math.log(numInstTotal);
    //System.out.println ("numInstTotal " + numInstTotal +
    //                    "calculateBIC res " + loglike);
    loglike -= (numParameters / 2.0) * Math.log(numInstTotal);
    //System.out.println ("numParam " +
    //                     + numParameters +
    //                  " calculateBIC res " + loglike);
    return loglike;
  }
  
  /**
   * Calculates the log-likelihood of the data for the given model, taken
   * at the maximum likelihood point.
   *
   * @param numInst number of instances that belong to the center
   * @param center the center
   * @param distortion distortion 
   * @param numCent number of centers 
   * @return the likelihood estimate
   */
01333   protected double logLikelihoodEstimate(int numInst, 
                               Instance center, 
                               double distortion, 
                               int numCent) {
    // R(n) num of instances of the center -> numInst
    // K num of centers -> not used
    //
    //todo take the diff comments away
    double loglike = 0;
    /* if is new */
    if (numInst > 1) {
      /* diff variance is new */
      //
      // distortion = Sum over instances x of the center(x-center)
      // different to paper; sum should be squared
      //
      // (Sum of distances to center) / R(n) - 1.0
      // different to paper; should be R(n)-K
      double variance =  distortion / (numInst - 1.0); 
  
      //
      //  -R(n)/2 * log(pi*2)
      //
      double p1 = - (numInst / 2.0) * Math.log(Math.PI * 2.0);
      /* diff
       thats how we had it
       double p2 = -((ni * center.numAttributes()) / 2) * distortion;
      */
      //
      // -(R(n)*M)/2 * log(variance) 
      //
      double p2 = - (numInst * center.numAttributes()) / 2 * Math.log(variance);
      
      /* diff
       thats how we had it, the difference is a bug in x-means
       double p3 = - (numInst - numCent) / 2;
      */
      //
      // -(R(n)-1)/2
      //
      double p3 = - (numInst - 1.0) / 2.0;
      
      //
      // R(n)*log(R(n))
      //
      double p4 = numInst * Math.log(numInst);
      
      /* diff x-means doesn't have this part 
       double p5 = - numInst * Math.log(numInstTotal);
      */
      
      /*
      loglike = -(ni / 2) * Math.log(Math.PI * 2) 
      - (ni * center.numAttributes()) / 2.0) * logdistortion
      - (ni - k) / 2.0 
      + ni * Math.log(ni) 
      - ni * Math.log(r);
      */
      loglike = p1 + p2 + p3 + p4; // diff + p5;
      //the log(r) is something that can be reused.
      //as is the log(2 PI), these could provide extra speed up later on.
      //since distortion is so expensive to compute, I only do that once.
    }
    return loglike;
  }
  
  /**
   * Calculates the maximum likelihood estimate for the variance.
   * @param instOfCent indices of instances to each center
   * @param centers the centers
   * @return the list of distortions distortion.
   */
01405   protected double[] distortion(int[][] instOfCent, Instances centers) {
    double[] distortion = new double[centers.numInstances()];
    for (int i = 0; i < centers.numInstances(); i++) {
      distortion[i] = 0.0;
      for (int j = 0; j < instOfCent[i].length; j++) {
        distortion[i] += m_DistanceF.distance(m_Instances
            .instance(instOfCent[i][j]), centers.instance(i));
      }
    }
    /*
     * diff not done in x-means res *= 1.0 / (count - centers.numInstances());
     */
    return distortion;
  }
  
  /**
   * Clusters an instance.
   * 
   * @param instance
   *          the instance to assign a cluster to.
   * @param centers
   *          the centers to cluster the instance to.
   * @return a cluster index.
   */
01429   protected int clusterProcessedInstance(Instance instance, Instances centers) {
    
    double minDist = Integer.MAX_VALUE;
    int bestCluster = 0;
    for (int i = 0; i < centers.numInstances(); i++) {
      double dist = m_DistanceF.distance(instance, centers.instance(i));

      if (dist < minDist) {
        minDist = dist;
        bestCluster = i;
      }
    }
    ;
    return bestCluster;
  }
  
  /**
   * Clusters an instance that has been through the filters.
   * 
   * @param instance
   *          the instance to assign a cluster to
   * @return a cluster number
   */
01452   protected int clusterProcessedInstance(Instance instance) {
    double minDist = Integer.MAX_VALUE;
    int bestCluster = 0;
    for (int i = 0; i < m_NumClusters; i++) {
      double dist = m_DistanceF
          .distance(instance, m_ClusterCenters.instance(i));
      if (dist < minDist) {
        minDist = dist;
        bestCluster = i;
      }
    }
    return bestCluster;
  }

  /**
   * Classifies a given instance.
   *
   * @param instance the instance to be assigned to a cluster
   * @return the number of the assigned cluster as an integer
   * if the class is enumerated, otherwise the predicted value
   * @throws Exception if instance could not be classified
   * successfully
   */
01475   public int clusterInstance(Instance instance) throws Exception {
    m_ReplaceMissingFilter.input(instance);
    Instance inst = m_ReplaceMissingFilter.output();

    return clusterProcessedInstance(inst);
  }


  /**
   * Returns the number of clusters.
   *
   * @return the number of clusters generated for a training dataset.
   */
01488   public int numberOfClusters() {
    return m_NumClusters;
  }


  /**
   * Returns an enumeration describing the available options. 
   * @return an enumeration of all the available options
   **/
01497   public Enumeration listOptions() {
    Vector result = new Vector();
    
    result.addElement(new Option(
      "\tmaximum number of overall iterations\n"
      + "\t(default 1).", 
      "I", 1, "-I <num>"));
    
    result.addElement(new Option(
      "\tmaximum number of iterations in the kMeans loop in\n"
      + "\tthe Improve-Parameter part \n"
      + "\t(default 1000).", 
      "M", 1, "-M <num>"));
    
    result.addElement(new Option(
      "\tmaximum number of iterations in the kMeans loop\n"
      + "\tfor the splitted centroids in the Improve-Structure part \n"
      + "\t(default 1000).",
      "J", 1, "-J <num>"));
    
    result.addElement(new Option(
      "\tminimum number of clusters\n"
      + "\t(default 2).", 
      "L", 1, "-L <num>"));
    
    result.addElement(new Option(
      "\tmaximum number of clusters\n"
      + "\t(default 4).",
      "H", 1, "-H <num>"));
    
    result.addElement(new Option(
      "\tdistance value for binary attributes\n"
      + "\t(default 1.0).",
      "B", 1, "-B <value>"));
    
    result.addElement(new Option(
      "\tUses the KDTree internally\n"
      + "\t(default no).",
      "use-kdtree", 0, "-use-kdtree"));
    
    result.addElement(new Option(
      "\tFull class name of KDTree class to use, followed\n"
      + "\tby scheme options.\n"
      + "\teg: \"weka.core.neighboursearch.kdtrees.KDTree -P\"\n"
      + "\t(default no KDTree class used).",
      "K", 1, "-K <KDTree class specification>"));
    
    result.addElement(new Option(
      "\tcutoff factor, takes the given percentage of the splitted \n"
      + "\tcentroids if none of the children win\n"
      + "\t(default 0.0).",
      "C", 1, "-C <value>"));
    
    result.addElement(new Option(
      "\tFull class name of Distance function class to use, followed\n"
      + "\tby scheme options.\n" +
      "\t(default weka.core.EuclideanDistance).",
      "D", 1, "-D <distance function class specification>"));
    
    result.addElement(new Option(
      "\tfile to read starting centers from (ARFF format).",
      "N", 1, "-N <file name>"));
    
    result.addElement(new Option(
      "\tfile to write centers to (ARFF format).",
      "O", 1, "-O <file name>"));
    
    result.addElement(new Option(
      "\tThe debug level.\n"
      + "\t(default 0)",
      "U", 1, "-U <int>"));
    
    result.addElement(new Option(
      "\tThe debug vectors file.",
      "Y", 1, "-Y <file name>"));
    
    Enumeration en = super.listOptions();
    while (en.hasMoreElements())
      result.addElement(en.nextElement());
    
    return result.elements();
  }

  /**
   * Returns the tip text for this property.
   * @return tip text for this property 
   */
01584   public String minNumClustersTipText() {
    return "set minimum number of clusters";
  }

  /**
   * Sets the minimum number of clusters to generate.
   *
   * @param n the minimum number of clusters to generate
   */
01593   public void setMinNumClusters(int n) {
    if (n <= m_MaxNumClusters) {
      m_MinNumClusters = n;
    }
  }

  /**
   * Gets the minimum number of clusters to generate.
   * @return the minimum number of clusters to generate
   */
01603   public int getMinNumClusters() {
    return m_MinNumClusters;
  }

  /**
   * Returns the tip text for this property.
   * @return tip text for this property 
   */
01611   public String maxNumClustersTipText() {
    return "set maximum number of clusters";
  }

  /**
   * Sets the maximum number of clusters to generate.
   * @param n the maximum number of clusters to generate
   */
01619   public void setMaxNumClusters(int n) {
    if (n >= m_MinNumClusters) {
      m_MaxNumClusters = n;
    }
  }
  
  /**
   * Gets the maximum number of clusters to generate.
   * @return the maximum number of clusters to generate
   */
01629   public int getMaxNumClusters() {
    return m_MaxNumClusters;
  }

  /**
   * Returns the tip text for this property.
   * @return tip text for this property 
   */
01637   public String maxIterationsTipText() {
    return "the maximum number of iterations to perform";
  }

  /**
   * Sets the maximum number of iterations to perform.
   * @param i the number of iterations
   * @throws Exception if i is less than 1
   */
01646   public void setMaxIterations(int i) throws Exception {
    if (i < 0) 
      throw new Exception("Only positive values for iteration number" +
                           " allowed (Option I)."); 
    m_MaxIterations = i;
  }

  /**
   * Gets the maximum number of iterations.
   * @return the number of iterations
   */
01657   public int getMaxIterations() {
    return  m_MaxIterations;
  }

  /**
   * Returns the tip text for this property.
   * @return tip text for this property 
   */
01665   public String maxKMeansTipText() {
    return "the maximum number of iterations to perform in KMeans";
  }

  /**
   * Set the maximum number of iterations to perform in KMeans.
   * @param i the number of iterations
   */
01673   public void setMaxKMeans(int i) {
    m_MaxKMeans = i;
    m_MaxKMeansForChildren = i;
  }

  /**
   * Gets the maximum number of iterations in KMeans.
   * @return the number of iterations
   */
01682   public int getMaxKMeans() {
    return  m_MaxKMeans;
  }

  /**
   * Returns the tip text for this property.
   * @return tip text for this property 
   */
01690   public String maxKMeansForChildrenTipText() {
    return "the maximum number of iterations KMeans that is performed on the child centers";
  }

  /**
   * Sets the maximum number of iterations KMeans that is performed 
   * on the child centers.
   * @param i the number of iterations
   */
01699   public void setMaxKMeansForChildren(int i) {
    m_MaxKMeansForChildren = i;
  }

  /**
   * Gets the maximum number of iterations in KMeans.
   * @return the number of iterations
   */
01707   public int getMaxKMeansForChildren() {
    return  m_MaxKMeansForChildren;
  }

  /**
   * Returns the tip text for this property.
   * @return tip text for this property 
   */
01715   public String cutOffFactorTipText() {
    return "the cut-off factor to use";
  }

  /**
   * Sets a new cutoff factor.
   * @param i the new cutoff factor
   */
01723   public void setCutOffFactor(double i) {
    m_CutOffFactor = i;
  }

  /**
   * Gets the cutoff factor.
   * @return the cutoff factor
   */
01731   public double getCutOffFactor() {
    return  m_CutOffFactor;
  }

  /**
   * Returns the tip text for this property.
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
01740   public String binValueTipText() {
    return "Set the value that represents true in the new attributes.";
  }
  
  /**
   * Gets value that represents true in a new numeric attribute.
   * (False is always represented by 0.0.)
   * @return the value that represents true in a new numeric attribute
   */
01749   public double getBinValue() {
    return m_BinValue;
  }

  /**
   * Sets the distance value between true and false of binary attributes.
   * and  "same" and "different" of nominal attributes    
   * @param value the distance
   */
01758   public void setBinValue(double value) {
    m_BinValue = value;
  }

  /**
   * Returns the tip text for this property.
   * 
   * @return            tip text for this property suitable for
   *              displaying in the explorer/experimenter gui
   */
01768   public String distanceFTipText() {
    return "The distance function to use.";
  }

  /**
   * gets the "binary" distance value.
   * @param distanceF the distance function with all options set
   */
01776   public void setDistanceF(DistanceFunction distanceF) {
    m_DistanceF = distanceF;
  }

  /**
   * Gets the distance function.
   * @return the distance function
   */
01784   public DistanceFunction getDistanceF() {
    return m_DistanceF;
  }

  /**
   * Gets the distance function specification string, which contains the 
   * class name of the distance function class and any options to it.
   *
   * @return the distance function specification string
   */
01794   protected String getDistanceFSpec() {
    
    DistanceFunction d = getDistanceF();
    if (d instanceof OptionHandler) {
      return d.getClass().getName() + " "
      + Utils.joinOptions(((OptionHandler) d).getOptions());
    }
    return d.getClass().getName();
  }

  /**
   * Returns the tip text for this property.
   * 
   * @return            tip text for this property suitable for
   *              displaying in the explorer/experimenter gui
   */
01810   public String debugVectorsFileTipText() {
    return "The file containing the debug vectors (only for debugging!).";
  }
  
  /**
   * Sets the file that has the random vectors stored.
   * Only used for debugging reasons.
   * @param value the file to read the random vectors from
   */
01819   public void setDebugVectorsFile(File value) {
    m_DebugVectorsFile = value;
  }

  /**
   * Gets the file name for a file that has the random vectors stored.
   * Only used for debugging purposes.
   * @return the file to read the vectors from
   */
01828   public File getDebugVectorsFile() {
    return m_DebugVectorsFile;
  }
  
  /**
   * Initialises the debug vector input.
   * @throws Exception if there is error 
   * opening the debug input file.
   */
01837   public void initDebugVectorsInput() throws Exception {
    m_DebugVectorsInput = 
      new BufferedReader(new FileReader(m_DebugVectorsFile));
    m_DebugVectors = new Instances(m_DebugVectorsInput);
    m_DebugVectorsIndex = 0;
  }

  /**
   * Read an instance from debug vectors file.
   * @param model the data model for the instance.
   * @throws Exception if there are no debug vector 
   * in m_DebugVectors.
   * @return the next debug vector.
   */
01851   public Instance getNextDebugVectorsInstance(Instances model) 
    throws Exception {
    if (m_DebugVectorsIndex >= m_DebugVectors.numInstances())
      throw new Exception("no more prefabricated Vectors");
    Instance nex = m_DebugVectors.instance(m_DebugVectorsIndex);
    nex.setDataset(model);
    m_DebugVectorsIndex++;
    return nex;
  }

  /**
   * Returns the tip text for this property.
   * 
   * @return            tip text for this property suitable for
   *              displaying in the explorer/experimenter gui
   */
01867   public String inputCenterFileTipText() {
    return "The file to read the list of centers from.";
  }

  /**
   * Sets the file to read the list of centers from.
   *
   * @param value the file to read centers from
   */
01876   public void setInputCenterFile(File value) {
    m_InputCenterFile = value;
  }
  
  /**
   * Gets the file to read the list of centers from.
   *
   * @return the file to read the centers from
   */
01885   public File getInputCenterFile() {
    return m_InputCenterFile;
  }

  /**
   * Returns the tip text for this property.
   * 
   * @return            tip text for this property suitable for
   *              displaying in the explorer/experimenter gui
   */
01895   public String outputCenterFileTipText() {
    return "The file to write the list of centers to.";
  }
    
  /**
   * Sets file to write the list of centers to. 
   *
   * @param value file to write centers to
   */
01904   public void setOutputCenterFile(File value) {
    m_OutputCenterFile = value;
  }

  /**
   * Gets the file to write the list of centers to. 
   * 
   * @return filename of the file to write centers to
   */
01913   public File getOutputCenterFile() {
    return m_OutputCenterFile;
  }

  /**
   * Returns the tip text for this property.
   * 
   * @return            tip text for this property suitable for
   *              displaying in the explorer/experimenter gui
   */
01923   public String KDTreeTipText() {
    return "The KDTree to use.";
  }
    
  /**
   * Sets the KDTree class.
   * @param k a KDTree object with all options set
   */
01931   public void setKDTree(KDTree k) {
    m_KDTree = k;
  }

  /**
   * Gets the KDTree class.
   * 
   * @return the configured KDTree
   */
01940   public KDTree getKDTree() {
    return m_KDTree;
  }

  /**
   * Returns the tip text for this property.
   * 
   * @return            tip text for this property suitable for
   *              displaying in the explorer/experimenter gui
   */
01950   public String useKDTreeTipText() {
    return "Whether to use the KDTree.";
  }
    
  /**
   * Sets whether to use the KDTree or not.
   * 
   * @param value if true the KDTree is used
   */
01959   public void setUseKDTree(boolean value) {
    m_UseKDTree = value;
  }

  /**
   * Gets whether the KDTree is used or not.
   * 
   * @return            true if KDTrees are used
   */
01968   public boolean getUseKDTree() {
    return m_UseKDTree;
  }

  /**
   * Gets the KDTree specification string, which contains the class name of
   * the KDTree class and any options to the KDTree.
   *
   * @return the KDTree string.
   */
01978   protected String getKDTreeSpec() {
    
    KDTree c = getKDTree();
    if (c instanceof OptionHandler) {
      return c.getClass().getName() + " "
      + Utils.joinOptions(((OptionHandler)c).getOptions());
    }
    return c.getClass().getName();
  }

  /**
   * Returns the tip text for this property.
   * 
   * @return            tip text for this property suitable for
   *              displaying in the explorer/experimenter gui
   */
01994   public String debugLevelTipText() {
    return "The debug level to use.";
  }

  /**
   * Sets the debug level.
   * debug level = 0, means no output
   * @param d debuglevel
   */
02003   public void setDebugLevel(int d) {
    m_DebugLevel = d;
  }

  /**
   * Gets the debug level.
   * @return debug level
   */
02011   public int getDebugLevel() {
    return m_DebugLevel;
  }

  /**
   * Checks the instances.
   * No checks in this KDTree but it calls the check of the distance function.
   */
02019   protected void checkInstances () {
    
   // m_DistanceF.checkInstances();
  }
  
  /**
   * Parses a given list of options. <p/>
   * 
   <!-- options-start -->
   * Valid options are: <p/>
   * 
   * <pre> -I &lt;num&gt;
   *  maximum number of overall iterations
   *  (default 1).</pre>
   * 
   * <pre> -M &lt;num&gt;
   *  maximum number of iterations in the kMeans loop in
   *  the Improve-Parameter part 
   *  (default 1000).</pre>
   * 
   * <pre> -J &lt;num&gt;
   *  maximum number of iterations in the kMeans loop
   *  for the splitted centroids in the Improve-Structure part 
   *  (default 1000).</pre>
   * 
   * <pre> -L &lt;num&gt;
   *  minimum number of clusters
   *  (default 2).</pre>
   * 
   * <pre> -H &lt;num&gt;
   *  maximum number of clusters
   *  (default 4).</pre>
   * 
   * <pre> -B &lt;value&gt;
   *  distance value for binary attributes
   *  (default 1.0).</pre>
   * 
   * <pre> -use-kdtree
   *  Uses the KDTree internally
   *  (default no).</pre>
   * 
   * <pre> -K &lt;KDTree class specification&gt;
   *  Full class name of KDTree class to use, followed
   *  by scheme options.
   *  eg: "weka.core.neighboursearch.kdtrees.KDTree -P"
   *  (default no KDTree class used).</pre>
   * 
   * <pre> -C &lt;value&gt;
   *  cutoff factor, takes the given percentage of the splitted 
   *  centroids if none of the children win
   *  (default 0.0).</pre>
   * 
   * <pre> -D &lt;distance function class specification&gt;
   *  Full class name of Distance function class to use, followed
   *  by scheme options.
   *  (default weka.core.EuclideanDistance).</pre>
   * 
   * <pre> -N &lt;file name&gt;
   *  file to read starting centers from (ARFF format).</pre>
   * 
   * <pre> -O &lt;file name&gt;
   *  file to write centers to (ARFF format).</pre>
   * 
   * <pre> -U &lt;int&gt;
   *  The debug level.
   *  (default 0)</pre>
   * 
   * <pre> -Y &lt;file name&gt;
   *  The debug vectors file.</pre>
   * 
   * <pre> -S &lt;num&gt;
   *  Random number seed.
   *  (default 10)</pre>
   * 
   <!-- options-end -->
   * 
   * @param options the list of options as an array of strings
   * @throws Exception if an option is not supported
   */
02098   public void setOptions(String[] options)
    throws Exception {
    
    String  optionString;
    String  funcString;

    optionString = Utils.getOption('I', options);
    if (optionString.length() != 0)
      setMaxIterations(Integer.parseInt(optionString));
    else
      setMaxIterations(1);
    
    optionString = Utils.getOption('M', options);
    if (optionString.length() != 0)
      setMaxKMeans(Integer.parseInt(optionString));
    else
      setMaxKMeans(1000);
    
    optionString = Utils.getOption('J', options);
    if (optionString.length() != 0)
      setMaxKMeansForChildren(Integer.parseInt(optionString));
    else
      setMaxKMeansForChildren(1000);
      
    optionString = Utils.getOption('L', options);
    if (optionString.length() != 0)
      setMinNumClusters(Integer.parseInt(optionString));
    else
      setMinNumClusters(2);
      
    optionString = Utils.getOption('H', options);
    if (optionString.length() != 0)
      setMaxNumClusters(Integer.parseInt(optionString));
    else
      setMaxNumClusters(4);
    
    optionString = Utils.getOption('B', options);
    if (optionString.length() != 0)
      setBinValue(Double.parseDouble(optionString));
    else
      setBinValue(1.0);

    setUseKDTree(Utils.getFlag("use-kdtree", options));
    
    if (getUseKDTree()) {
      funcString = Utils.getOption('K', options);
      if (funcString.length() != 0) {
      String[] funcSpec = Utils.splitOptions(funcString);
      if (funcSpec.length == 0) {
        throw new Exception("Invalid function specification string");
      }
      String funcName = funcSpec[0];
      funcSpec[0] = "";
      setKDTree((KDTree) Utils.forName(KDTree.class, funcName, funcSpec));
      }
      else {
      setKDTree(new KDTree());
      }
    }
    else {
      setKDTree(new KDTree());
    }

    optionString = Utils.getOption('C', options);
    if (optionString.length() != 0)
      setCutOffFactor(Double.parseDouble(optionString));
    else
      setCutOffFactor(0.0);
    
    funcString = Utils.getOption('D', options);
    if (funcString.length() != 0) {
      String[] funcSpec = Utils.splitOptions(funcString);
      if (funcSpec.length == 0) {
      throw new Exception("Invalid function specification string");
      }
      String funcName = funcSpec[0];
      funcSpec[0] = "";
      setDistanceF((DistanceFunction) Utils.forName(DistanceFunction.class,
                                        funcName, funcSpec));
    }
    else {
      setDistanceF(new EuclideanDistance());
    }

    optionString  = Utils.getOption('N', options);
    if (optionString.length() != 0) {
      setInputCenterFile(new File(optionString));
      m_CenterInput = 
      new BufferedReader(new FileReader(optionString));
    }
    else {
      setInputCenterFile(new File(System.getProperty("user.dir")));
      m_CenterInput = null;
    }

    optionString  = Utils.getOption('O', options);
    if (optionString.length() != 0) {
      setOutputCenterFile(new File(optionString));
      m_CenterOutput = new PrintWriter(new FileOutputStream(optionString));
    }
    else {
      setOutputCenterFile(new File(System.getProperty("user.dir")));
      m_CenterOutput = null;
    }

    optionString = Utils.getOption('U', options);
    int debugLevel = 0;
    if (optionString.length() != 0) {
      try {
      debugLevel = Integer.parseInt(optionString);
      } catch (NumberFormatException e) {
      throw new Exception(optionString +
                            "is an illegal value for option -U"); 
      }
    }
    setDebugLevel(debugLevel);

    optionString  = Utils.getOption('Y', options);
    if (optionString.length() != 0) {
      setDebugVectorsFile(new File(optionString));
    }
    else {
      setDebugVectorsFile(new File(System.getProperty("user.dir")));
      m_DebugVectorsInput = null;
      m_DebugVectors      = null;
    }
    
    super.setOptions(options);
  }
  
  /**
   * Gets the current settings of SimpleKMeans.
   * @return an array of strings suitable for passing to setOptions
   */
02232   public String[] getOptions() {
    int           i;
    Vector        result;
    String[]      options;

    result = new Vector();

    result.add("-I");
    result.add("" + getMaxIterations());
    
    result.add("-M");
    result.add("" + getMaxKMeans());
    
    result.add("-J");
    result.add("" + getMaxKMeansForChildren());
    
    result.add("-L");
    result.add("" + getMinNumClusters());
    
    result.add("-H");
    result.add("" + getMaxNumClusters());
    
    result.add("-B");
    result.add("" + getBinValue());
    
    if (getUseKDTree()) {
      result.add("-use-kdtree");
      result.add("-K");
      result.add("" + getKDTreeSpec());
    }
    
    result.add("-C");
    result.add("" + getCutOffFactor());
    
    if (getDistanceF() != null) {
      result.add("-D");
      result.add("" + getDistanceFSpec());
    }
    
    if (getInputCenterFile().exists() && getInputCenterFile().isFile()) {
      result.add("-N");
      result.add("" + getInputCenterFile());
    }
    
    if (getOutputCenterFile().exists() && getOutputCenterFile().isFile()) {
      result.add("-O");
      result.add("" + getOutputCenterFile());
    }
    
    int dL = getDebugLevel();
    if (dL > 0) {
      result.add("-U");
      result.add("" + getDebugLevel());
    }

    if (getDebugVectorsFile().exists() && getDebugVectorsFile().isFile()) {
      result.add("-Y");
      result.add("" + getDebugVectorsFile());
    }
    
    options = super.getOptions();
    for (i = 0; i < options.length; i++)
      result.add(options[i]);

    return (String[]) result.toArray(new String[result.size()]);    
  }

  /**
   * Return a string describing this clusterer.
   * @return a description of the clusterer as a string
   */
02303   public String toString() {
    StringBuffer temp = new StringBuffer();

    temp.append("\nXMeans\n======\n");

    temp.append("Requested iterations            : " + m_MaxIterations + "\n");
    temp.append("Iterations performed            : " + m_IterationCount+ "\n");
    if (m_KMeansStopped > 0) {
      temp.append("kMeans did not converge\n");
      temp.append("  but was stopped by max-loops " 
        + m_KMeansStopped + " times (max kMeans-iter)\n");
    }
    temp.append("Splits prepared                 : " + m_NumSplits + "\n");
    temp.append("Splits performed                : " + m_NumSplitsDone + "\n");
    temp.append("Cutoff factor                   : " + m_CutOffFactor + "\n");
    double perc;
    if (m_NumSplitsDone > 0)
      perc = (((double)m_NumSplitsStillDone)/((double) m_NumSplitsDone))
             * 100.0;
    else
      perc = 0.0;
    temp.append("Percentage of splits accepted \n" +
                "by cutoff factor                : " 
            + Utils.doubleToString(perc,2) + " %\n");
    temp.append("------\n");

    temp.append("Cutoff factor                   : " + m_CutOffFactor + "\n");
    temp.append("------\n");
    temp.append("\nCluster centers                 : " + m_NumClusters + " centers\n");
    for (int i = 0; i < m_NumClusters; i++) {
      temp.append("\nCluster "+i+"\n           ");
      for (int j = 0; j < m_ClusterCenters.numAttributes(); j++) {
      if (m_ClusterCenters.attribute(j).isNominal()) {
        temp.append(" "+m_ClusterCenters.attribute(j).
                  value((int)m_ClusterCenters.instance(i).value(j)));
      } else {
        temp.append(" "+m_ClusterCenters.instance(i).value(j));
      }
      }
    }
    if (m_Mle != null)
      temp.append("\n\nDistortion: " + 
              Utils.doubleToString(Utils.sum(m_Mle),6) + "\n");
    temp.append("BIC-Value : " + Utils.doubleToString(m_Bic,6) + "\n");
    return temp.toString();
  }

  /**
   * Print centers for debug.
   * @param debugLevel level that gives according messages
   */
02354   protected void PrCentersFD(int debugLevel) {
    if (debugLevel == m_DebugLevel) {
      for (int i = 0; i < m_ClusterCenters.numInstances(); i++) {
      System.out.println(m_ClusterCenters.instance(i));
      }
    }
  }

  /**
   * Tests on debug status.
   * @param debugLevel level that gives according messages
   * @return true if debug level is set
   */
02367   protected boolean TFD(int debugLevel) {
    return (debugLevel == m_DebugLevel);
  }

  /**
   * Does debug printouts.
   * @param debugLevel level that gives according messages
   * @param output string that is printed
   */
02376   protected void PFD(int debugLevel, String output) {
    if (debugLevel == m_DebugLevel)
      System.out.println(output);
  }
  /**
   * Does debug printouts.
   * @param output string that is printed
   */
02384   protected void PFD_CURR(String output) {
    if (m_CurrDebugFlag)
      System.out.println(output);
  }
  
  /**
   * Returns the revision string.
   * 
   * @return            the revision
   */
02394   public String getRevision() {
    return RevisionUtils.extract("$Revision: 1.24 $");
  }

  /**
   * Main method for testing this class.
   * @param argv should contain options 
   */
02402   public static void main(String[] argv) {
    runClusterer(new XMeans(), argv);
  }
}

Generated by  Doxygen 1.6.0   Back to index