Logo Search packages:      
Sourcecode: weka version File versions

Instances.java

/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 *    Instances.java
 *    Copyright (C) 1999 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.core;

import weka.core.converters.ArffLoader.ArffReader;
import weka.core.converters.ConverterUtils.DataSource;

import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.io.Serializable;
import java.util.Enumeration;
import java.util.Random;

/**
 * Class for handling an ordered set of weighted instances. <p>
 *
 * Typical usage: <p>
 * <pre>
 * import weka.core.converters.ConverterUtils.DataSource;
 * ...
 * 
 * // Read all the instances in the file (ARFF, CSV, XRFF, ...)
 * DataSource source = new DataSource(filename);
 * Instances instances = source.getDataSet();
 *
 * // Make the last attribute be the class
 * instances.setClassIndex(instances.numAttributes() - 1);
 * 
 * // Print header and instances.
 * System.out.println("\nDataset:\n");
 * System.out.println(instances);
 * 
 * ...
 * </pre><p>
 *
 * All methods that change a set of instances are safe, ie. a change
 * of a set of instances does not affect any other sets of
 * instances. All methods that change a datasets's attribute
 * information clone the dataset before it is changed.
 *
 * @author Eibe Frank (eibe@cs.waikato.ac.nz)
 * @author Len Trigg (trigg@cs.waikato.ac.nz)
 * @author FracPete (fracpete at waikato dot ac dot nz)
 * @version $Revision: 1.76 $ 
 */
00067 public class Instances 
  implements Serializable, RevisionHandler {
  
  /** for serialization */
00071   static final long serialVersionUID = -19412345060742748L;
  
  /** The filename extension that should be used for arff files */
00074   public final static String FILE_EXTENSION = ".arff";

  /** The filename extension that should be used for bin. serialized instances files */
00077   public final static String SERIALIZED_OBJ_FILE_EXTENSION = ".bsi";

  /** The keyword used to denote the start of an arff header */
00080   public final static String ARFF_RELATION = "@relation";

  /** The keyword used to denote the start of the arff data section */
00083   public final static String ARFF_DATA = "@data";

  /** The dataset's name. */
00086   protected /*@spec_public non_null@*/ String m_RelationName;         

  /** The attribute information. */
00089   protected /*@spec_public non_null@*/ FastVector m_Attributes;
  /*  public invariant (\forall int i; 0 <= i && i < m_Attributes.size(); 
                    m_Attributes.elementAt(i) != null);
  */

  /** The instances. */
00095   protected /*@spec_public non_null@*/ FastVector m_Instances;

  /** The class attribute's index */
00098   protected int m_ClassIndex;
  //@ protected invariant classIndex() == m_ClassIndex;

  /** The lines read so far in case of incremental loading. Since the 
   * StreamTokenizer will be re-initialized with every instance that is read,
   * we have to keep track of the number of lines read so far. 
   * @see #readInstance(Reader) */
00105   protected int m_Lines = 0;
  
  /** used in randomizeAttribute and undoRandomizeAttribute to store/restore
   * the index of attribute that was last shuffled, and it's original values 
   */
00110   private int attIdx4Randomization = -1;
  private double[] attIdxOrigValues;
  
  /**
   * Reads an ARFF file from a reader, and assigns a weight of
   * one to each instance. Lets the index of the class 
   * attribute be undefined (negative).
   *
   * @param reader the reader
   * @throws IOException if the ARFF file is not read 
   * successfully
   */
00122   public Instances(/*@non_null@*/Reader reader) throws IOException {
    ArffReader arff = new ArffReader(reader);
    Instances dataset = arff.getData();
    initialize(dataset, dataset.numInstances());
    dataset.copyInstances(0, this, dataset.numInstances());
    compactify();
  }
 
  /**
   * Reads the header of an ARFF file from a reader and 
   * reserves space for the given number of instances. Lets
   * the class index be undefined (negative).
   *
   * @param reader the reader
   * @param capacity the capacity
   * @throws IllegalArgumentException if the header is not read successfully
   * or the capacity is negative.
   * @throws IOException if there is a problem with the reader.
   * @deprecated instead of using this method in conjunction with the
   * <code>readInstance(Reader)</code> method, one should use the 
   * <code>ArffLoader</code> or <code>DataSource</code> class instead.
   * @see weka.core.converters.ArffLoader
   * @see weka.core.converters.ConverterUtils.DataSource
   */
  //@ requires capacity >= 0;
  //@ ensures classIndex() == -1;
00148   @Deprecated public Instances(/*@non_null@*/Reader reader, int capacity)
    throws IOException {

    ArffReader arff = new ArffReader(reader, 0);
    Instances header = arff.getStructure();
    initialize(header, capacity);
    m_Lines = arff.getLineNo();
  }

  /**
   * Constructor copying all instances and references to
   * the header information from the given set of instances.
   *
   * @param dataset the set to be copied
   */
00163   public Instances(/*@non_null@*/Instances dataset) {

    this(dataset, dataset.numInstances());

    dataset.copyInstances(0, this, dataset.numInstances());
  }

  /**
   * Constructor creating an empty set of instances. Copies references
   * to the header information from the given set of instances. Sets
   * the capacity of the set of instances to 0 if its negative.
   *
   * @param dataset the instances from which the header 
   * information is to be taken
   * @param capacity the capacity of the new dataset 
   */
00179   public Instances(/*@non_null@*/Instances dataset, int capacity) {
    initialize(dataset, capacity);
  }

  /**
   * initializes with the header information of the given dataset and sets
   * the capacity of the set of instances.
   * 
   * @param dataset the dataset to use as template
   * @param capacity the number of rows to reserve
   */
00190   protected void initialize(Instances dataset, int capacity) {
    if (capacity < 0)
      capacity = 0;
    
    // Strings only have to be "shallow" copied because
    // they can't be modified.
    m_ClassIndex   = dataset.m_ClassIndex;
    m_RelationName = dataset.m_RelationName;
    m_Attributes   = dataset.m_Attributes;
    m_Instances    = new FastVector(capacity);
  }
  
  /**
   * Creates a new set of instances by copying a 
   * subset of another set.
   *
   * @param source the set of instances from which a subset 
   * is to be created
   * @param first the index of the first instance to be copied
   * @param toCopy the number of instances to be copied
   * @throws IllegalArgumentException if first and toCopy are out of range
   */
  //@ requires 0 <= first;
  //@ requires 0 <= toCopy;
  //@ requires first + toCopy <= source.numInstances();
00215   public Instances(/*@non_null@*/Instances source, int first, int toCopy) {
    
    this(source, toCopy);

    if ((first < 0) || ((first + toCopy) > source.numInstances())) {
      throw new IllegalArgumentException("Parameters first and/or toCopy out "+
                                         "of range");
    }
    source.copyInstances(first, this, toCopy);
  }

  /**
   * Creates an empty set of instances. Uses the given
   * attribute information. Sets the capacity of the set of 
   * instances to 0 if its negative. Given attribute information
   * must not be changed after this constructor has been used.
   *
   * @param name the name of the relation
   * @param attInfo the attribute information
   * @param capacity the capacity of the set
   */
00236   public Instances(/*@non_null@*/String name, 
               /*@non_null@*/FastVector attInfo, int capacity) {

    m_RelationName = name;
    m_ClassIndex = -1;
    m_Attributes = attInfo;
    for (int i = 0; i < numAttributes(); i++) {
      attribute(i).setIndex(i);
    }
    m_Instances = new FastVector(capacity);
  }
 

  /**
   * Create a copy of the structure if the data has string or
   * relational attributes, "cleanses" string types (i.e. doesn't
   * contain references to the strings seen in the past) and all
   * relational attributes.
   *
   * @return a copy of the instance structure.
   */
00257   public Instances stringFreeStructure() {

    FastVector newAtts = new FastVector();
    for (int i = 0 ; i < m_Attributes.size(); i++) {
      Attribute att = (Attribute)m_Attributes.elementAt(i);
      if (att.type() == Attribute.STRING) {
        newAtts.addElement(new Attribute(att.name(), (FastVector)null, i));
      } else if (att.type() == Attribute.RELATIONAL) {
        newAtts.addElement(new Attribute(att.name(), new Instances(att.relation(), 0), i));
      }
    }
    if (newAtts.size() == 0) {
      return new Instances(this, 0);
    }
    FastVector atts = (FastVector)m_Attributes.copy();
    for (int i = 0; i < newAtts.size(); i++) {
      atts.setElementAt(newAtts.elementAt(i), ((Attribute)newAtts.elementAt(i)).index());
    }
    Instances result = new Instances(this, 0);
    result.m_Attributes = atts;
    return result;
  }

  /**
   * Adds one instance to the end of the set. 
   * Shallow copies instance before it is added. Increases the
   * size of the dataset if it is not large enough. Does not
   * check if the instance is compatible with the dataset.
   * Note: String or relational values are not transferred.
   *
   * @param instance the instance to be added
   */
00289   public void add(/*@non_null@*/ Instance instance) {

    Instance newInstance = (Instance)instance.copy();

    newInstance.setDataset(this);
    m_Instances.addElement(newInstance);
  }

  /**
   * Returns an attribute.
   *
   * @param index the attribute's index (index starts with 0)
   * @return the attribute at the given position
   */
  //@ requires 0 <= index;
  //@ requires index < m_Attributes.size();
  //@ ensures \result != null;
00306   public /*@pure@*/ Attribute attribute(int index) {
    
    return (Attribute) m_Attributes.elementAt(index);
  }

  /**
   * Returns an attribute given its name. If there is more than
   * one attribute with the same name, it returns the first one.
   * Returns null if the attribute can't be found.
   *
   * @param name the attribute's name
   * @return the attribute with the given name, null if the
   * attribute can't be found
   */ 
00320   public /*@pure@*/ Attribute attribute(String name) {
    
    for (int i = 0; i < numAttributes(); i++) {
      if (attribute(i).name().equals(name)) {
      return attribute(i);
      }
    }
    return null;
  }

  /**
   * Checks for attributes of the given type in the dataset
   *
   * @param attType  the attribute type to look for
   * @return         true if attributes of the given type are present
   */
00336   public boolean checkForAttributeType(int attType) {
    
    int i = 0;
    
    while (i < m_Attributes.size()) {
      if (attribute(i++).type() == attType) {
        return true;
      }
    }
    return false;
  }

  /**
   * Checks for string attributes in the dataset
   *
   * @return true if string attributes are present, false otherwise
   */
00353   public /*@pure@*/ boolean checkForStringAttributes() {
    return checkForAttributeType(Attribute.STRING);
  }

  /**
   * Checks if the given instance is compatible
   * with this dataset. Only looks at the size of
   * the instance and the ranges of the values for 
   * nominal and string attributes.
   *
   * @param instance the instance to check
   * @return true if the instance is compatible with the dataset 
   */
00366   public /*@pure@*/ boolean checkInstance(Instance instance) {

    if (instance.numAttributes() != numAttributes()) {
      return false;
    }
    for (int i = 0; i < numAttributes(); i++) {
      if (instance.isMissing(i)) {
      continue;
      } else if (attribute(i).isNominal() ||
             attribute(i).isString()) {
      if (!(Utils.eq(instance.value(i),
                   (double)(int)instance.value(i)))) {
        return false;
      } else if (Utils.sm(instance.value(i), 0) ||
               Utils.gr(instance.value(i),
                      attribute(i).numValues())) {
        return false;
      }
      }
    }
    return true;
  }
      
  /**
   * Returns the class attribute.
   *
   * @return the class attribute
   * @throws UnassignedClassException if the class is not set
   */
  //@ requires classIndex() >= 0;
00396   public /*@pure@*/ Attribute classAttribute() {

    if (m_ClassIndex < 0) {
      throw new UnassignedClassException("Class index is negative (not set)!");
    }
    return attribute(m_ClassIndex);
  }

  /**
   * Returns the class attribute's index. Returns negative number
   * if it's undefined.
   *
   * @return the class index as an integer
   */
  // ensures \result == m_ClassIndex;
00411   public /*@pure@*/ int classIndex() {
    
    return m_ClassIndex;
  }
 
  /**
   * Compactifies the set of instances. Decreases the capacity of
   * the set so that it matches the number of instances in the set.
   */
00420   public void compactify() {

    m_Instances.trimToSize();
  }

  /**
   * Removes all instances from the set.
   */
00428   public void delete() {
    
    m_Instances = new FastVector();
  }

  /**
   * Removes an instance at the given position from the set.
   *
   * @param index the instance's position (index starts with 0)
   */
  //@ requires 0 <= index && index < numInstances();
00439   public void delete(int index) {
    
    m_Instances.removeElementAt(index);
  }

  /**
   * Deletes an attribute at the given position 
   * (0 to numAttributes() - 1). A deep copy of the attribute
   * information is performed before the attribute is deleted.
   *
   * @param position the attribute's position (position starts with 0)
   * @throws IllegalArgumentException if the given index is out of range 
   *            or the class attribute is being deleted
   */
  //@ requires 0 <= position && position < numAttributes();
  //@ requires position != classIndex();
00455   public void deleteAttributeAt(int position) {
       
    if ((position < 0) || (position >= m_Attributes.size())) {
      throw new IllegalArgumentException("Index out of range");
    }
    if (position == m_ClassIndex) {
      throw new IllegalArgumentException("Can't delete class attribute");
    }
    freshAttributeInfo();
    if (m_ClassIndex > position) {
      m_ClassIndex--;
    }
    m_Attributes.removeElementAt(position);
    for (int i = position; i < m_Attributes.size(); i++) {
      Attribute current = (Attribute)m_Attributes.elementAt(i);
      current.setIndex(current.index() - 1);
    }
    for (int i = 0; i < numInstances(); i++) {
      instance(i).forceDeleteAttributeAt(position); 
    }
  }

  /**
   * Deletes all attributes of the given type in the dataset. A deep copy of 
   * the attribute information is performed before an attribute is deleted.
   *
   * @param attType the attribute type to delete
   * @throws IllegalArgumentException if attribute couldn't be 
   * successfully deleted (probably because it is the class attribute).
   */
00485   public void deleteAttributeType(int attType) {
    int i = 0;
    while (i < m_Attributes.size()) {
      if (attribute(i).type() == attType) {
        deleteAttributeAt(i);
      } else {
        i++;
      }
    }
  }

  /**
   * Deletes all string attributes in the dataset. A deep copy of the attribute
   * information is performed before an attribute is deleted.
   *
   * @throws IllegalArgumentException if string attribute couldn't be 
   * successfully deleted (probably because it is the class attribute).
   * @see #deleteAttributeType(int)
   */
00504   public void deleteStringAttributes() {
    deleteAttributeType(Attribute.STRING);
  }

  /**
   * Removes all instances with missing values for a particular
   * attribute from the dataset.
   *
   * @param attIndex the attribute's index (index starts with 0)
   */
  //@ requires 0 <= attIndex && attIndex < numAttributes();
00515   public void deleteWithMissing(int attIndex) {

    FastVector newInstances = new FastVector(numInstances());

    for (int i = 0; i < numInstances(); i++) {
      if (!instance(i).isMissing(attIndex)) {
      newInstances.addElement(instance(i));
      }
    }
    m_Instances = newInstances;
  }

  /**
   * Removes all instances with missing values for a particular
   * attribute from the dataset.
   *
   * @param att the attribute
   */
00533   public void deleteWithMissing(/*@non_null@*/ Attribute att) {

    deleteWithMissing(att.index());
  }

  /**
   * Removes all instances with a missing class value
   * from the dataset.
   *
   * @throws UnassignedClassException if class is not set
   */
00544   public void deleteWithMissingClass() {

    if (m_ClassIndex < 0) {
      throw new UnassignedClassException("Class index is negative (not set)!");
    }
    deleteWithMissing(m_ClassIndex);
  }

  /**
   * Returns an enumeration of all the attributes.
   *
   * @return enumeration of all the attributes.
   */
00557   public /*@non_null pure@*/ Enumeration enumerateAttributes() {

    return m_Attributes.elements(m_ClassIndex);
  }

  /**
   * Returns an enumeration of all instances in the dataset.
   *
   * @return enumeration of all instances in the dataset
   */
00567   public /*@non_null pure@*/ Enumeration enumerateInstances() {

    return m_Instances.elements();
  }

  /**
   * Checks if two headers are equivalent.
   *
   * @param dataset another dataset
   * @return true if the header of the given dataset is equivalent 
   * to this header
   */
00579   public /*@pure@*/ boolean equalHeaders(Instances dataset){

    // Check class and all attributes
    if (m_ClassIndex != dataset.m_ClassIndex) {
      return false;
    }
    if (m_Attributes.size() != dataset.m_Attributes.size()) {
      return false;
    }
    for (int i = 0; i < m_Attributes.size(); i++) {
      if (!(attribute(i).equals(dataset.attribute(i)))) {
      return false;
      }
    }
    return true;
  }
 
  /**
   * Returns the first instance in the set.
   *
   * @return the first instance in the set
   */
  //@ requires numInstances() > 0;
00602   public /*@non_null pure@*/ Instance firstInstance() {
    
    return (Instance)m_Instances.firstElement();
  }

  /**
   * Returns a random number generator. The initial seed of the random
   * number generator depends on the given seed and the hash code of
   * a string representation of a instances chosen based on the given
   * seed. 
   *
   * @param seed the given seed
   * @return the random number generator
   */
00616   public Random getRandomNumberGenerator(long seed) {

    Random r = new Random(seed);
    r.setSeed(instance(r.nextInt(numInstances())).toString().hashCode() + seed);
    return r;
  }
 
  /**
   * Inserts an attribute at the given position (0 to 
   * numAttributes()) and sets all values to be missing.
   * Shallow copies the attribute before it is inserted, and performs
   * a deep copy of the existing attribute information.
   *
   * @param att the attribute to be inserted
   * @param position the attribute's position (position starts with 0)
   * @throws IllegalArgumentException if the given index is out of range
   */
  //@ requires 0 <= position;
  //@ requires position <= numAttributes();
00635   public void insertAttributeAt(/*@non_null@*/ Attribute att, int position) {
       
    if ((position < 0) ||
      (position > m_Attributes.size())) {
      throw new IllegalArgumentException("Index out of range");
    }
    att = (Attribute)att.copy();
    freshAttributeInfo();
    att.setIndex(position);
    m_Attributes.insertElementAt(att, position);
    for (int i = position + 1; i < m_Attributes.size(); i++) {
      Attribute current = (Attribute)m_Attributes.elementAt(i);
      current.setIndex(current.index() + 1);
    }
    for (int i = 0; i < numInstances(); i++) {
      instance(i).forceInsertAttributeAt(position);
    }
    if (m_ClassIndex >= position) {
      m_ClassIndex++;
    }
  }

  /**
   * Returns the instance at the given position.
   *
   * @param index the instance's index (index starts with 0)
   * @return the instance at the given position
   */
  //@ requires 0 <= index;
  //@ requires index < numInstances();
00665   public /*@non_null pure@*/ Instance instance(int index) {

    return (Instance)m_Instances.elementAt(index);
  }

  /**
   * Returns the kth-smallest attribute value of a numeric attribute.
   * Note that calling this method will change the order of the data!
   *
   * @param att the Attribute object
   * @param k the value of k
   * @return the kth-smallest value
   */
00678   public double kthSmallestValue(Attribute att, int k) {

    return kthSmallestValue(att.index(), k);
  }

  /**
   * Returns the kth-smallest attribute value of a numeric attribute.
   * Note that calling this method will change the order of the data!
   * The number of non-missing values in the data must be as least
   * as last as k for this to work.
   *
   * @param attIndex the attribute's index
   * @param k the value of k
   * @return the kth-smallest value
   */
00693   public double kthSmallestValue(int attIndex, int k) {
    
    if (!attribute(attIndex).isNumeric()) {
      throw new IllegalArgumentException("Instances: attribute must be numeric to compute kth-smallest value.");
    }

    int i,j;

    // move all instances with missing values to end
    j = numInstances() - 1;
    i = 0;
    while (i <= j) {
      if (instance(j).isMissing(attIndex)) {
      j--;
      } else {
      if (instance(i).isMissing(attIndex)) {
        swap(i,j);
        j--;
      }
      i++;
      }
    }

    if ((k < 1) || (k > j+1)) {
      throw new IllegalArgumentException("Instances: value for k for computing kth-smallest value too large.");
    }

    return instance(select(attIndex, 0, j, k)).value(attIndex);
  }

  /**
   * Returns the last instance in the set.
   *
   * @return the last instance in the set
   */
  //@ requires numInstances() > 0;
00729   public /*@non_null pure@*/ Instance lastInstance() {
    
    return (Instance)m_Instances.lastElement();
  }

  /**
   * Returns the mean (mode) for a numeric (nominal) attribute as
   * a floating-point value. Returns 0 if the attribute is neither nominal nor 
   * numeric. If all values are missing it returns zero.
   *
   * @param attIndex the attribute's index (index starts with 0)
   * @return the mean or the mode
   */
00742   public /*@pure@*/ double meanOrMode(int attIndex) {

    double result, found;
    int [] counts;

    if (attribute(attIndex).isNumeric()) {
      result = found = 0;
      for (int j = 0; j < numInstances(); j++) {
      if (!instance(j).isMissing(attIndex)) {
        found += instance(j).weight();
        result += instance(j).weight()*instance(j).value(attIndex);
      }
      }
      if (found <= 0) {
      return 0;
      } else {
      return result / found;
      }
    } else if (attribute(attIndex).isNominal()) {
      counts = new int[attribute(attIndex).numValues()];
      for (int j = 0; j < numInstances(); j++) {
      if (!instance(j).isMissing(attIndex)) {
        counts[(int) instance(j).value(attIndex)] += instance(j).weight();
      }
      }
      return (double)Utils.maxIndex(counts);
    } else {
      return 0;
    }
  }

  /**
   * Returns the mean (mode) for a numeric (nominal) attribute as a
   * floating-point value.  Returns 0 if the attribute is neither
   * nominal nor numeric.  If all values are missing it returns zero.
   *
   * @param att the attribute
   * @return the mean or the mode 
   */
00781   public /*@pure@*/ double meanOrMode(Attribute att) {

    return meanOrMode(att.index());
  }

  /**
   * Returns the number of attributes.
   *
   * @return the number of attributes as an integer
   */
  //@ ensures \result == m_Attributes.size();
00792   public /*@pure@*/ int numAttributes() {

    return m_Attributes.size();
  }

  /**
   * Returns the number of class labels.
   *
   * @return the number of class labels as an integer if the class 
   * attribute is nominal, 1 otherwise.
   * @throws UnassignedClassException if the class is not set
   */
  //@ requires classIndex() >= 0;
00805   public /*@pure@*/ int numClasses() {
    
    if (m_ClassIndex < 0) {
      throw new UnassignedClassException("Class index is negative (not set)!");
    }
    if (!classAttribute().isNominal()) {
      return 1;
    } else {
      return classAttribute().numValues();
    }
  }

  /**
   * Returns the number of distinct values of a given attribute.
   * Returns the number of instances if the attribute is a
   * string attribute. The value 'missing' is not counted.
   *
   * @param attIndex the attribute (index starts with 0)
   * @return the number of distinct values of a given attribute
   */
  //@ requires 0 <= attIndex;
  //@ requires attIndex < numAttributes();
00827   public /*@pure@*/ int numDistinctValues(int attIndex) {

    if (attribute(attIndex).isNumeric()) {
      double [] attVals = attributeToDoubleArray(attIndex);
      int [] sorted = Utils.sort(attVals);
      double prev = 0;
      int counter = 0;
      for (int i = 0; i < sorted.length; i++) {
      Instance current = instance(sorted[i]);
      if (current.isMissing(attIndex)) {
        break;
      }
      if ((i == 0) || 
          (current.value(attIndex) > prev)) {
        prev = current.value(attIndex);
        counter++;
      }
      }
      return counter;
    } else {
      return attribute(attIndex).numValues();
    }
  }

  /**
   * Returns the number of distinct values of a given attribute.
   * Returns the number of instances if the attribute is a
   * string attribute. The value 'missing' is not counted.
   *
   * @param att the attribute
   * @return the number of distinct values of a given attribute
   */
00859   public /*@pure@*/ int numDistinctValues(/*@non_null@*/Attribute att) {

    return numDistinctValues(att.index());
  }
  
  /**
   * Returns the number of instances in the dataset.
   *
   * @return the number of instances in the dataset as an integer
   */
  //@ ensures \result == m_Instances.size();
00870   public /*@pure@*/ int numInstances() {

    return m_Instances.size();
  }

  /**
   * Shuffles the instances in the set so that they are ordered 
   * randomly.
   *
   * @param random a random number generator
   */
00881   public void randomize(Random random) {

    for (int j = numInstances() - 1; j > 0; j--)
      swap(j, random.nextInt(j+1));
  }

  /**
   * Does an undo of a previous call to randomizeAttribute, so that the
   * original values of the attribute are restored. Only the original values
   * before the last call to randomizeAttribute can be restored. Note, the
   * original Instances object is modified. 
   * 
   * @throws Exception if there was no call to randomizeAttribute or if
   * attributes were added or removed since the last call to
   * <code>randomizeAttribute</code>
   * @see randomizeAttribute
   * @author Arne Muller (arne.muller@gmail.com)
   */
00899   public void undoRandomizeAttribute() throws Exception {
    
    if ( attIdx4Randomization < 0 )
      throw new Exception("no randomization to undo!");

    if ( attIdxOrigValues.length !=  this.numInstances() )
      throw new Exception(
        "meanwhile number of attributes has changed, can't undo!");

    for ( int i=0; i<attIdxOrigValues.length; i++ ) {
      Instance instance = (Instance)(m_Instances.elementAt(i));
      instance.modifyValue(attIdx4Randomization, attIdxOrigValues[i]);
    }
  
    return;
  }
  
  /**
   * Shuffles the values of a given attribute in all instances. Note,
   * the original Instances object is modified (i.e. no copying), and the
   * method is not thread save. To avoid undefined behavior of an Instances
   * object you should not perform other Instances modifying operations between
   * a call to <code>randomizeAttribute</code> and
   * <code>undoRandomizeAttribute</code>.
   * 
   * @param attIdx the index of the attribute to shuffle
   * @param random a random number generator
   * @param rounds how many rounds of shuffling, minimum must be 1. As more
   * rounds of shuffling the more random your attribute value distribution
   * (e.g. choose 3, but note that the time needed for shuffling is proportional
   * to the number of rounds).
   * @see undoRandomizeAttribute
   * @author Arne Muller (arne.muller@gmail.com)
   */
00933   public void randomizeAttribute(int attIdx, Random random, int rounds) {
    
    attIdx4Randomization = attIdx;
    attIdxOrigValues = this.attributeToDoubleArray(attIdx);
    int n = numInstances();
    for ( int j=0; j<rounds; j++ ) {
      for ( int i=0; i<n; i++ ) {
      int r = random.nextInt(n);
      Instance iOne = (Instance)(m_Instances.elementAt(i));
      Instance iTwo = (Instance)(m_Instances.elementAt(r));
      double helper = iOne.value(attIdx);
      iOne.modifyValue(attIdx, iTwo.value(attIdx));
      iTwo.modifyValue(attIdx, helper);
      }
    }
    
    return;
  }
  
  /**
   * Reads a single instance from the reader and appends it
   * to the dataset.  Automatically expands the dataset if it
   * is not large enough to hold the instance. This method does
   * not check for carriage return at the end of the line.
   *
   * @param reader the reader 
   * @return false if end of file has been reached
   * @throws IOException if the information is not read 
   * successfully
   * @deprecated instead of using this method in conjunction with the
   * <code>readInstance(Reader)</code> method, one should use the 
   * <code>ArffLoader</code> or <code>DataSource</code> class instead.
   * @see weka.core.converters.ArffLoader
   * @see weka.core.converters.ConverterUtils.DataSource
   */ 
00968   @Deprecated public boolean readInstance(Reader reader) throws IOException {

    ArffReader arff = new ArffReader(reader, this, m_Lines, 1);
    Instance inst = arff.readInstance(arff.getData(), false);
    m_Lines = arff.getLineNo();
    if (inst != null) {
      add(inst);
      return true;
    }
    else {
      return false;
    }
  }    

  /**
   * Returns the relation's name.
   *
   * @return the relation's name as a string
   */
  //@ ensures \result == m_RelationName;
00988   public /*@pure@*/ String relationName() {

    return m_RelationName;
  }

  /**
   * Renames an attribute. This change only affects this
   * dataset.
   *
   * @param att the attribute's index (index starts with 0)
   * @param name the new name
   */
01000   public void renameAttribute(int att, String name) {

    Attribute newAtt = attribute(att).copy(name);
    FastVector newVec = new FastVector(numAttributes());

    for (int i = 0; i < numAttributes(); i++) {
      if (i == att) {
      newVec.addElement(newAtt);
      } else {
      newVec.addElement(attribute(i));
      }
    }
    m_Attributes = newVec;
  }

  /**
   * Renames an attribute. This change only affects this
   * dataset.
   *
   * @param att the attribute
   * @param name the new name
   */
01022   public void renameAttribute(Attribute att, String name) {

    renameAttribute(att.index(), name);
  }

  /**
   * Renames the value of a nominal (or string) attribute value. This
   * change only affects this dataset.
   *
   * @param att the attribute's index (index starts with 0)
   * @param val the value's index (index starts with 0)
   * @param name the new name 
   */
01035   public void renameAttributeValue(int att, int val, String name) {

    Attribute newAtt = (Attribute)attribute(att).copy();
    FastVector newVec = new FastVector(numAttributes());

    newAtt.setValue(val, name);
    for (int i = 0; i < numAttributes(); i++) {
      if (i == att) {
      newVec.addElement(newAtt);
      } else {
      newVec.addElement(attribute(i));
      }
    }
    m_Attributes = newVec;
  }

  /**
   * Renames the value of a nominal (or string) attribute value. This
   * change only affects this dataset.
   *
   * @param att the attribute
   * @param val the value
   * @param name the new name
   */
01059   public void renameAttributeValue(Attribute att, String val, 
                                         String name) {

    int v = att.indexOfValue(val);
    if (v == -1) throw new IllegalArgumentException(val + " not found");
    renameAttributeValue(att.index(), v, name);
  }

  /**
   * Creates a new dataset of the same size using random sampling
   * with replacement.
   *
   * @param random a random number generator
   * @return the new dataset
   */
01074   public Instances resample(Random random) {

    Instances newData = new Instances(this, numInstances());
    while (newData.numInstances() < numInstances()) {
      newData.add(instance(random.nextInt(numInstances())));
    }
    return newData;
  }

  /**
   * Creates a new dataset of the same size using random sampling
   * with replacement according to the current instance weights. The
   * weights of the instances in the new dataset are set to one.
   *
   * @param random a random number generator
   * @return the new dataset
   */
01091   public Instances resampleWithWeights(Random random) {

    double [] weights = new double[numInstances()];
    for (int i = 0; i < weights.length; i++) {
      weights[i] = instance(i).weight();
    }
    return resampleWithWeights(random, weights);
  }


  /**
   * Creates a new dataset of the same size using random sampling
   * with replacement according to the given weight vector. The
   * weights of the instances in the new dataset are set to one.
   * The length of the weight vector has to be the same as the
   * number of instances in the dataset, and all weights have to
   * be positive.
   *
   * @param random a random number generator
   * @param weights the weight vector
   * @return the new dataset
   * @throws IllegalArgumentException if the weights array is of the wrong
   * length or contains negative weights.
   */
01115   public Instances resampleWithWeights(Random random, 
                                   double[] weights) {

    if (weights.length != numInstances()) {
      throw new IllegalArgumentException("weights.length != numInstances.");
    }
    Instances newData = new Instances(this, numInstances());
    if (numInstances() == 0) {
      return newData;
    }
    double[] probabilities = new double[numInstances()];
    double sumProbs = 0, sumOfWeights = Utils.sum(weights);
    for (int i = 0; i < numInstances(); i++) {
      sumProbs += random.nextDouble();
      probabilities[i] = sumProbs;
    }
    Utils.normalize(probabilities, sumProbs / sumOfWeights);

    // Make sure that rounding errors don't mess things up
    probabilities[numInstances() - 1] = sumOfWeights;
    int k = 0; int l = 0;
    sumProbs = 0;
    while ((k < numInstances() && (l < numInstances()))) {
      if (weights[l] < 0) {
      throw new IllegalArgumentException("Weights have to be positive.");
      }
      sumProbs += weights[l];
      while ((k < numInstances()) &&
           (probabilities[k] <= sumProbs)) { 
      newData.add(instance(l));
      newData.instance(k).setWeight(1);
      k++;
      }
      l++;
    }
    return newData;
  }

  /** 
   * Sets the class attribute.
   *
   * @param att attribute to be the class
   */
01158   public void setClass(Attribute att) {

    m_ClassIndex = att.index();
  }

  /** 
   * Sets the class index of the set.
   * If the class index is negative there is assumed to be no class.
   * (ie. it is undefined)
   *
   * @param classIndex the new class index (index starts with 0)
   * @throws IllegalArgumentException if the class index is too big or < 0
   */
01171   public void setClassIndex(int classIndex) {

    if (classIndex >= numAttributes()) {
      throw new IllegalArgumentException("Invalid class index: " + classIndex);
    }
    m_ClassIndex = classIndex;
  }

  /**
   * Sets the relation's name.
   *
   * @param newName the new relation name.
   */
01184   public void setRelationName(/*@non_null@*/String newName) {
    
    m_RelationName = newName;
  }

  /**
   * Sorts the instances based on an attribute. For numeric attributes, 
   * instances are sorted in ascending order. For nominal attributes, 
   * instances are sorted based on the attribute label ordering 
   * specified in the header. Instances with missing values for the 
   * attribute are placed at the end of the dataset.
   *
   * @param attIndex the attribute's index (index starts with 0)
   */
01198   public void sort(int attIndex) {

    int i,j;

    // move all instances with missing values to end
    j = numInstances() - 1;
    i = 0;
    while (i <= j) {
      if (instance(j).isMissing(attIndex)) {
      j--;
      } else {
      if (instance(i).isMissing(attIndex)) {
        swap(i,j);
        j--;
      }
      i++;
      }
    }
    quickSort(attIndex, 0, j);
  }

  /**
   * Sorts the instances based on an attribute. For numeric attributes, 
   * instances are sorted into ascending order. For nominal attributes, 
   * instances are sorted based on the attribute label ordering 
   * specified in the header. Instances with missing values for the 
   * attribute are placed at the end of the dataset.
   *
   * @param att the attribute
   */
01228   public void sort(Attribute att) {

    sort(att.index());
  }

  /**
   * Stratifies a set of instances according to its class values 
   * if the class attribute is nominal (so that afterwards a 
   * stratified cross-validation can be performed).
   *
   * @param numFolds the number of folds in the cross-validation
   * @throws UnassignedClassException if the class is not set
   */
01241   public void stratify(int numFolds) {
    
    if (numFolds <= 0) {
      throw new IllegalArgumentException("Number of folds must be greater than 1");
    }
    if (m_ClassIndex < 0) {
      throw new UnassignedClassException("Class index is negative (not set)!");
    }
    if (classAttribute().isNominal()) {

      // sort by class
      int index = 1;
      while (index < numInstances()) {
      Instance instance1 = instance(index - 1);
      for (int j = index; j < numInstances(); j++) {
        Instance instance2 = instance(j);
        if ((instance1.classValue() == instance2.classValue()) ||
            (instance1.classIsMissing() && 
             instance2.classIsMissing())) {
          swap(index,j);
          index++;
        }
      }
      index++;
      }
      stratStep(numFolds);
    }
  }
 
  /**
   * Computes the sum of all the instances' weights.
   *
   * @return the sum of all the instances' weights as a double
   */
01275   public /*@pure@*/ double sumOfWeights() {
    
    double sum = 0;

    for (int i = 0; i < numInstances(); i++) {
      sum += instance(i).weight();
    }
    return sum;
  }

  /**
   * Creates the test set for one fold of a cross-validation on 
   * the dataset.
   *
   * @param numFolds the number of folds in the cross-validation. Must
   * be greater than 1.
   * @param numFold 0 for the first fold, 1 for the second, ...
   * @return the test set as a set of weighted instances
   * @throws IllegalArgumentException if the number of folds is less than 2
   * or greater than the number of instances.
   */
  //@ requires 2 <= numFolds && numFolds < numInstances();
  //@ requires 0 <= numFold && numFold < numFolds;
01298   public Instances testCV(int numFolds, int numFold) {

    int numInstForFold, first, offset;
    Instances test;
    
    if (numFolds < 2) {
      throw new IllegalArgumentException("Number of folds must be at least 2!");
    }
    if (numFolds > numInstances()) {
      throw new IllegalArgumentException("Can't have more folds than instances!");
    }
    numInstForFold = numInstances() / numFolds;
    if (numFold < numInstances() % numFolds){
      numInstForFold++;
      offset = numFold;
    }else
      offset = numInstances() % numFolds;
    test = new Instances(this, numInstForFold);
    first = numFold * (numInstances() / numFolds) + offset;
    copyInstances(first, test, numInstForFold);
    return test;
  }
 
  /**
   * Returns the dataset as a string in ARFF format. Strings
   * are quoted if they contain whitespace characters, or if they
   * are a question mark.
   *
   * @return the dataset in ARFF format as a string
   */
01328   public String toString() {
    
    StringBuffer text = new StringBuffer();
    
    text.append(ARFF_RELATION).append(" ").
      append(Utils.quote(m_RelationName)).append("\n\n");
    for (int i = 0; i < numAttributes(); i++) {
      text.append(attribute(i)).append("\n");
    }
    text.append("\n").append(ARFF_DATA).append("\n");

    text.append(stringWithoutHeader());
    return text.toString();
  }

  /**
   * Returns the instances in the dataset as a string in ARFF format. Strings
   * are quoted if they contain whitespace characters, or if they
   * are a question mark.
   *
   * @return the dataset in ARFF format as a string
   */
01350   protected String stringWithoutHeader() {
    
    StringBuffer text = new StringBuffer();

    for (int i = 0; i < numInstances(); i++) {
      text.append(instance(i));
      if (i < numInstances() - 1) {
      text.append('\n');
      }
    }
    return text.toString();
  }

  /**
   * Creates the training set for one fold of a cross-validation 
   * on the dataset. 
   *
   * @param numFolds the number of folds in the cross-validation. Must
   * be greater than 1.
   * @param numFold 0 for the first fold, 1 for the second, ...
   * @return the training set 
   * @throws IllegalArgumentException if the number of folds is less than 2
   * or greater than the number of instances.
   */
  //@ requires 2 <= numFolds && numFolds < numInstances();
  //@ requires 0 <= numFold && numFold < numFolds;
01376   public Instances trainCV(int numFolds, int numFold) {

    int numInstForFold, first, offset;
    Instances train;
 
    if (numFolds < 2) {
      throw new IllegalArgumentException("Number of folds must be at least 2!");
    }
    if (numFolds > numInstances()) {
      throw new IllegalArgumentException("Can't have more folds than instances!");
    }
    numInstForFold = numInstances() / numFolds;
    if (numFold < numInstances() % numFolds) {
      numInstForFold++;
      offset = numFold;
    }else
      offset = numInstances() % numFolds;
    train = new Instances(this, numInstances() - numInstForFold);
    first = numFold * (numInstances() / numFolds) + offset;
    copyInstances(0, train, first);
    copyInstances(first + numInstForFold, train,
              numInstances() - first - numInstForFold);

    return train;
  }

  /**
   * Creates the training set for one fold of a cross-validation 
   * on the dataset. The data is subsequently randomized based
   * on the given random number generator.
   *
   * @param numFolds the number of folds in the cross-validation. Must
   * be greater than 1.
   * @param numFold 0 for the first fold, 1 for the second, ...
   * @param random the random number generator
   * @return the training set 
   * @throws IllegalArgumentException if the number of folds is less than 2
   * or greater than the number of instances.
   */
  //@ requires 2 <= numFolds && numFolds < numInstances();
  //@ requires 0 <= numFold && numFold < numFolds;
01417   public Instances trainCV(int numFolds, int numFold, Random random) {

    Instances train = trainCV(numFolds, numFold);
    train.randomize(random);
    return train;
  }

  /**
   * Computes the variance for a numeric attribute.
   *
   * @param attIndex the numeric attribute (index starts with 0)
   * @return the variance if the attribute is numeric
   * @throws IllegalArgumentException if the attribute is not numeric
   */
01431   public /*@pure@*/ double variance(int attIndex) {
  
    double sum = 0, sumSquared = 0, sumOfWeights = 0;

    if (!attribute(attIndex).isNumeric()) {
      throw new IllegalArgumentException("Can't compute variance because attribute is " +
                    "not numeric!");
    }
    for (int i = 0; i < numInstances(); i++) {
      if (!instance(i).isMissing(attIndex)) {
      sum += instance(i).weight() * 
        instance(i).value(attIndex);
      sumSquared += instance(i).weight() * 
        instance(i).value(attIndex) *
        instance(i).value(attIndex);
      sumOfWeights += instance(i).weight();
      }
    }
    if (sumOfWeights <= 1) {
      return 0;
    }
    double result = (sumSquared - (sum * sum / sumOfWeights)) / 
      (sumOfWeights - 1);

    // We don't like negative variance
    if (result < 0) {
      return 0;
    } else {
      return result;
    }
  }

  /**
   * Computes the variance for a numeric attribute.
   *
   * @param att the numeric attribute
   * @return the variance if the attribute is numeric
   * @throws IllegalArgumentException if the attribute is not numeric
   */
01470   public /*@pure@*/ double variance(Attribute att) {
    
    return variance(att.index());
  }
  
  /**
   * Calculates summary statistics on the values that appear in this
   * set of instances for a specified attribute.
   *
   * @param index the index of the attribute to summarize (index starts with 0)
   * @return an AttributeStats object with it's fields calculated.
   */
  //@ requires 0 <= index && index < numAttributes();
01483   public AttributeStats attributeStats(int index) {

    AttributeStats result = new AttributeStats();
    if (attribute(index).isNominal()) {
      result.nominalCounts = new int [attribute(index).numValues()];
    }
    if (attribute(index).isNumeric()) {
      result.numericStats = new weka.experiment.Stats();
    }
    result.totalCount = numInstances();

    double [] attVals = attributeToDoubleArray(index);
    int [] sorted = Utils.sort(attVals);
    int currentCount = 0;
    double prev = Instance.missingValue();
    for (int j = 0; j < numInstances(); j++) {
      Instance current = instance(sorted[j]);
      if (current.isMissing(index)) {
      result.missingCount = numInstances() - j;
      break;
      }
      if (current.value(index) == prev) {
      currentCount++;
      } else {
      result.addDistinct(prev, currentCount);
      currentCount = 1;
      prev = current.value(index);
      }
    }
    result.addDistinct(prev, currentCount);
    result.distinctCount--; // So we don't count "missing" as a value 
    return result;
  }
  
  /**
   * Gets the value of all instances in this dataset for a particular
   * attribute. Useful in conjunction with Utils.sort to allow iterating
   * through the dataset in sorted order for some attribute.
   *
   * @param index the index of the attribute.
   * @return an array containing the value of the desired attribute for
   * each instance in the dataset. 
   */
  //@ requires 0 <= index && index < numAttributes();
01527   public /*@pure@*/ double [] attributeToDoubleArray(int index) {

    double [] result = new double[numInstances()];
    for (int i = 0; i < result.length; i++) {
      result[i] = instance(i).value(index);
    }
    return result;
  }

  /**
   * Generates a string summarizing the set of instances. Gives a breakdown
   * for each attribute indicating the number of missing/discrete/unique
   * values and other information.
   *
   * @return a string summarizing the dataset
   */
01543   public String toSummaryString() {

    StringBuffer result = new StringBuffer();
    result.append("Relation Name:  ").append(relationName()).append('\n');
    result.append("Num Instances:  ").append(numInstances()).append('\n');
    result.append("Num Attributes: ").append(numAttributes()).append('\n');
    result.append('\n');

    result.append(Utils.padLeft("", 5)).append(Utils.padRight("Name", 25));
    result.append(Utils.padLeft("Type", 5)).append(Utils.padLeft("Nom", 5));
    result.append(Utils.padLeft("Int", 5)).append(Utils.padLeft("Real", 5));
    result.append(Utils.padLeft("Missing", 12));
    result.append(Utils.padLeft("Unique", 12));
    result.append(Utils.padLeft("Dist", 6)).append('\n');
    for (int i = 0; i < numAttributes(); i++) {
      Attribute a = attribute(i);
      AttributeStats as = attributeStats(i);
      result.append(Utils.padLeft("" + (i + 1), 4)).append(' ');
      result.append(Utils.padRight(a.name(), 25)).append(' ');
      long percent;
      switch (a.type()) {
      case Attribute.NOMINAL:
      result.append(Utils.padLeft("Nom", 4)).append(' ');
      percent = Math.round(100.0 * as.intCount / as.totalCount);
      result.append(Utils.padLeft("" + percent, 3)).append("% ");
      result.append(Utils.padLeft("" + 0, 3)).append("% ");
      percent = Math.round(100.0 * as.realCount / as.totalCount);
      result.append(Utils.padLeft("" + percent, 3)).append("% ");
      break;
      case Attribute.NUMERIC:
      result.append(Utils.padLeft("Num", 4)).append(' ');
      result.append(Utils.padLeft("" + 0, 3)).append("% ");
      percent = Math.round(100.0 * as.intCount / as.totalCount);
      result.append(Utils.padLeft("" + percent, 3)).append("% ");
      percent = Math.round(100.0 * as.realCount / as.totalCount);
      result.append(Utils.padLeft("" + percent, 3)).append("% ");
      break;
      case Attribute.DATE:
      result.append(Utils.padLeft("Dat", 4)).append(' ');
      result.append(Utils.padLeft("" + 0, 3)).append("% ");
      percent = Math.round(100.0 * as.intCount / as.totalCount);
      result.append(Utils.padLeft("" + percent, 3)).append("% ");
      percent = Math.round(100.0 * as.realCount / as.totalCount);
      result.append(Utils.padLeft("" + percent, 3)).append("% ");
      break;
      case Attribute.STRING:
      result.append(Utils.padLeft("Str", 4)).append(' ');
      percent = Math.round(100.0 * as.intCount / as.totalCount);
      result.append(Utils.padLeft("" + percent, 3)).append("% ");
      result.append(Utils.padLeft("" + 0, 3)).append("% ");
      percent = Math.round(100.0 * as.realCount / as.totalCount);
      result.append(Utils.padLeft("" + percent, 3)).append("% ");
      break;
      case Attribute.RELATIONAL:
      result.append(Utils.padLeft("Rel", 4)).append(' ');
      percent = Math.round(100.0 * as.intCount / as.totalCount);
      result.append(Utils.padLeft("" + percent, 3)).append("% ");
      result.append(Utils.padLeft("" + 0, 3)).append("% ");
      percent = Math.round(100.0 * as.realCount / as.totalCount);
      result.append(Utils.padLeft("" + percent, 3)).append("% ");
      break;
      default:
      result.append(Utils.padLeft("???", 4)).append(' ');
      result.append(Utils.padLeft("" + 0, 3)).append("% ");
      percent = Math.round(100.0 * as.intCount / as.totalCount);
      result.append(Utils.padLeft("" + percent, 3)).append("% ");
      percent = Math.round(100.0 * as.realCount / as.totalCount);
      result.append(Utils.padLeft("" + percent, 3)).append("% ");
      break;
      }
      result.append(Utils.padLeft("" + as.missingCount, 5)).append(" /");
      percent = Math.round(100.0 * as.missingCount / as.totalCount);
      result.append(Utils.padLeft("" + percent, 3)).append("% ");
      result.append(Utils.padLeft("" + as.uniqueCount, 5)).append(" /");
      percent = Math.round(100.0 * as.uniqueCount / as.totalCount);
      result.append(Utils.padLeft("" + percent, 3)).append("% ");
      result.append(Utils.padLeft("" + as.distinctCount, 5)).append(' ');
      result.append('\n');
    }
    return result.toString();
  }

  /**
   * Copies instances from one set to the end of another 
   * one.
   *
   * @param from the position of the first instance to be copied
   * @param dest the destination for the instances
   * @param num the number of instances to be copied
   */
  //@ requires 0 <= from && from <= numInstances() - num;
  //@ requires 0 <= num;
01635   protected void copyInstances(int from, /*@non_null@*/ Instances dest, int num) {
    
    for (int i = 0; i < num; i++) {
      dest.add(instance(from + i));
    }
  }
  
  /**
   * Replaces the attribute information by a clone of
   * itself.
   */
01646   protected void freshAttributeInfo() {

    m_Attributes = (FastVector) m_Attributes.copyElements();
  }
 
  /**
   * Returns string including all instances, their weights and
   * their indices in the original dataset.
   *
   * @return description of instance and its weight as a string
   */
01657   protected /*@pure@*/ String instancesAndWeights(){

    StringBuffer text = new StringBuffer();

    for (int i = 0; i < numInstances(); i++) {
      text.append(instance(i) + " " + instance(i).weight());
      if (i < numInstances() - 1) {
      text.append("\n");
      }
    }
    return text.toString();
  }
  
  /**
   * Partitions the instances around a pivot. Used by quicksort and
   * kthSmallestValue.
   *
   * @param attIndex the attribute's index (index starts with 0)
   * @param l the first index of the subset (index starts with 0)
   * @param r the last index of the subset (index starts with 0)
   *
   * @return the index of the middle element
   */
  //@ requires 0 <= attIndex && attIndex < numAttributes();
  //@ requires 0 <= left && left <= right && right < numInstances();
01682   protected int partition(int attIndex, int l, int r) {
    
    double pivot = instance((l + r) / 2).value(attIndex);

    while (l < r) {
      while ((instance(l).value(attIndex) < pivot) && (l < r)) {
        l++;
      }
      while ((instance(r).value(attIndex) > pivot) && (l < r)) {
        r--;
      }
      if (l < r) {
        swap(l, r);
        l++;
        r--;
      }
    }
    if ((l == r) && (instance(r).value(attIndex) > pivot)) {
      r--;
    } 

    return r;
  }
  
  /**
   * Implements quicksort according to Manber's "Introduction to
   * Algorithms".
   *
   * @param attIndex the attribute's index (index starts with 0)
   * @param left the first index of the subset to be sorted (index starts with 0)
   * @param right the last index of the subset to be sorted (index starts with 0)
   */
  //@ requires 0 <= attIndex && attIndex < numAttributes();
  //@ requires 0 <= first && first <= right && right < numInstances();
01716   protected void quickSort(int attIndex, int left, int right) {

    if (left < right) {
      int middle = partition(attIndex, left, right);
      quickSort(attIndex, left, middle);
      quickSort(attIndex, middle + 1, right);
    }
  }
  
  /**
   * Implements computation of the kth-smallest element according
   * to Manber's "Introduction to Algorithms".
   *
   * @param attIndex the attribute's index (index starts with 0)
   * @param left the first index of the subset (index starts with 0)
   * @param right the last index of the subset (index starts with 0)
   * @param k the value of k
   *
   * @return the index of the kth-smallest element
   */
  //@ requires 0 <= attIndex && attIndex < numAttributes();
  //@ requires 0 <= first && first <= right && right < numInstances();
01738   protected int select(int attIndex, int left, int right, int k) {
    
    if (left == right) {
      return left;
    } else {
      int middle = partition(attIndex, left, right);
      if ((middle - left + 1) >= k) {
        return select(attIndex, left, middle, k);
      } else {
        return select(attIndex, middle + 1, right, k - (middle - left + 1));
      }
    }
  }

  /**
   * Help function needed for stratification of set.
   *
   * @param numFolds the number of folds for the stratification
   */
01757   protected void stratStep (int numFolds){
    
    FastVector newVec = new FastVector(m_Instances.capacity());
    int start = 0, j;

    // create stratified batch
    while (newVec.size() < numInstances()) {
      j = start;
      while (j < numInstances()) {
      newVec.addElement(instance(j));
      j = j + numFolds;
      }
      start++;
    }
    m_Instances = newVec;
  }
  
  /**
   * Swaps two instances in the set.
   *
   * @param i the first instance's index (index starts with 0)
   * @param j the second instance's index (index starts with 0)
   */
  //@ requires 0 <= i && i < numInstances();
  //@ requires 0 <= j && j < numInstances();
01782   public void swap(int i, int j){
    
    m_Instances.swap(i, j);
  }

  /**
   * Merges two sets of Instances together. The resulting set will have
   * all the attributes of the first set plus all the attributes of the 
   * second set. The number of instances in both sets must be the same.
   *
   * @param first the first set of Instances
   * @param second the second set of Instances
   * @return the merged set of Instances
   * @throws IllegalArgumentException if the datasets are not the same size
   */
01797   public static Instances mergeInstances(Instances first, Instances second) {

    if (first.numInstances() != second.numInstances()) {
      throw new IllegalArgumentException("Instance sets must be of the same size");
    }

    // Create the vector of merged attributes
    FastVector newAttributes = new FastVector();
    for (int i = 0; i < first.numAttributes(); i++) {
      newAttributes.addElement(first.attribute(i));
    }
    for (int i = 0; i < second.numAttributes(); i++) {
      newAttributes.addElement(second.attribute(i));
    }
    
    // Create the set of Instances
    Instances merged = new Instances(first.relationName() + '_'
                             + second.relationName(), 
                             newAttributes, 
                             first.numInstances());
    // Merge each instance
    for (int i = 0; i < first.numInstances(); i++) {
      merged.add(first.instance(i).mergeInstance(second.instance(i)));
    }
    return merged;
  }

  /**
   * Method for testing this class.
   *
   * @param argv should contain one element: the name of an ARFF file
   */
  //@ requires argv != null;
  //@ requires argv.length == 1;
  //@ requires argv[0] != null;
01832   public static void test(String [] argv) {

    Instances instances, secondInstances, train, test, empty;
    Random random = new Random(2);
    Reader reader;
    int start, num;
    FastVector testAtts, testVals;
    int i,j;
    
    try{
      if (argv.length > 1) {
      throw (new Exception("Usage: Instances [<filename>]"));
      }
      
      // Creating set of instances from scratch
      testVals = new FastVector(2);
      testVals.addElement("first_value");
      testVals.addElement("second_value");
      testAtts = new FastVector(2);
      testAtts.addElement(new Attribute("nominal_attribute", testVals));
      testAtts.addElement(new Attribute("numeric_attribute"));
      instances = new Instances("test_set", testAtts, 10);
      instances.add(new Instance(instances.numAttributes()));
      instances.add(new Instance(instances.numAttributes()));
      instances.add(new Instance(instances.numAttributes()));
      instances.setClassIndex(0);
      System.out.println("\nSet of instances created from scratch:\n");
      System.out.println(instances);
      
      if (argv.length == 1) {
      String filename = argv[0];
      reader = new FileReader(filename);
      
      // Read first five instances and print them
      System.out.println("\nFirst five instances from file:\n");
      instances = new Instances(reader, 1);
      instances.setClassIndex(instances.numAttributes() - 1);
      i = 0;
      while ((i < 5) && (instances.readInstance(reader))) {
        i++;
      }
      System.out.println(instances);

      // Read all the instances in the file
      reader = new FileReader(filename);
      instances = new Instances(reader);

      // Make the last attribute be the class 
      instances.setClassIndex(instances.numAttributes() - 1);
      
      // Print header and instances.
      System.out.println("\nDataset:\n");
      System.out.println(instances);
      System.out.println("\nClass index: "+instances.classIndex());
      }
      
      // Test basic methods based on class index.
      System.out.println("\nClass name: "+instances.classAttribute().name());
      System.out.println("\nClass index: "+instances.classIndex());
      System.out.println("\nClass is nominal: " +
                   instances.classAttribute().isNominal());
      System.out.println("\nClass is numeric: " +
                   instances.classAttribute().isNumeric());
      System.out.println("\nClasses:\n");
      for (i = 0; i < instances.numClasses(); i++) {
      System.out.println(instances.classAttribute().value(i));
      }
      System.out.println("\nClass values and labels of instances:\n");
      for (i = 0; i < instances.numInstances(); i++) {
      Instance inst = instances.instance(i);
      System.out.print(inst.classValue() + "\t");
      System.out.print(inst.toString(inst.classIndex()));
      if (instances.instance(i).classIsMissing()) {
        System.out.println("\tis missing");
      } else {
        System.out.println();
      }
      }
      
      // Create random weights.
      System.out.println("\nCreating random weights for instances.");
      for (i = 0; i < instances.numInstances(); i++) {
      instances.instance(i).setWeight(random.nextDouble()); 
      }
      
      // Print all instances and their weights (and the sum of weights).
      System.out.println("\nInstances and their weights:\n");
      System.out.println(instances.instancesAndWeights());
      System.out.print("\nSum of weights: ");
      System.out.println(instances.sumOfWeights());
      
      // Insert an attribute
      secondInstances = new Instances(instances);
      Attribute testAtt = new Attribute("Inserted");
      secondInstances.insertAttributeAt(testAtt, 0);
      System.out.println("\nSet with inserted attribute:\n");
      System.out.println(secondInstances);
      System.out.println("\nClass name: "
                   + secondInstances.classAttribute().name());
      
      // Delete the attribute
      secondInstances.deleteAttributeAt(0);
      System.out.println("\nSet with attribute deleted:\n");
      System.out.println(secondInstances);
      System.out.println("\nClass name: "
                   + secondInstances.classAttribute().name());
      
      // Test if headers are equal
      System.out.println("\nHeaders equal: "+
                   instances.equalHeaders(secondInstances) + "\n");
      
      // Print data in internal format.
      System.out.println("\nData (internal values):\n");
      for (i = 0; i < instances.numInstances(); i++) {
      for (j = 0; j < instances.numAttributes(); j++) {
        if (instances.instance(i).isMissing(j)) {
          System.out.print("? ");
        } else {
          System.out.print(instances.instance(i).value(j) + " ");
        }
      }
      System.out.println();
      }
      
      // Just print header
      System.out.println("\nEmpty dataset:\n");
      empty = new Instances(instances, 0);
      System.out.println(empty);
      System.out.println("\nClass name: "+empty.classAttribute().name());

      // Create copy and rename an attribute and a value (if possible)
      if (empty.classAttribute().isNominal()) {
      Instances copy = new Instances(empty, 0);
      copy.renameAttribute(copy.classAttribute(), "new_name");
      copy.renameAttributeValue(copy.classAttribute(), 
                          copy.classAttribute().value(0), 
                          "new_val_name");
      System.out.println("\nDataset with names changed:\n" + copy);
      System.out.println("\nOriginal dataset:\n" + empty);
      }

      // Create and prints subset of instances.
      start = instances.numInstances() / 4;
      num = instances.numInstances() / 2;
      System.out.print("\nSubset of dataset: ");
      System.out.println(num + " instances from " + (start + 1) 
                   + ". instance");
      secondInstances = new Instances(instances, start, num);
      System.out.println("\nClass name: "
                   + secondInstances.classAttribute().name());

      // Print all instances and their weights (and the sum of weights).
      System.out.println("\nInstances and their weights:\n");
      System.out.println(secondInstances.instancesAndWeights());
      System.out.print("\nSum of weights: ");
      System.out.println(secondInstances.sumOfWeights());
      
      // Create and print training and test sets for 3-fold
      // cross-validation.
      System.out.println("\nTrain and test folds for 3-fold CV:");
      if (instances.classAttribute().isNominal()) {
      instances.stratify(3);
      }
      for (j = 0; j < 3; j++) {
        train = instances.trainCV(3,j, new Random(1));
      test = instances.testCV(3,j);
                      
      // Print all instances and their weights (and the sum of weights).
      System.out.println("\nTrain: ");
      System.out.println("\nInstances and their weights:\n");
      System.out.println(train.instancesAndWeights());
      System.out.print("\nSum of weights: ");
      System.out.println(train.sumOfWeights());
      System.out.println("\nClass name: "+train.classAttribute().name());
      System.out.println("\nTest: ");
      System.out.println("\nInstances and their weights:\n");
      System.out.println(test.instancesAndWeights());
      System.out.print("\nSum of weights: ");
      System.out.println(test.sumOfWeights());
      System.out.println("\nClass name: "+test.classAttribute().name());
      }

      // Randomize instances and print them.
      System.out.println("\nRandomized dataset:");
      instances.randomize(random);
      
      // Print all instances and their weights (and the sum of weights).
      System.out.println("\nInstances and their weights:\n");
      System.out.println(instances.instancesAndWeights());
      System.out.print("\nSum of weights: ");
      System.out.println(instances.sumOfWeights());

      // Sort instances according to first attribute and
      // print them.
      System.out.print("\nInstances sorted according to first attribute:\n ");
      instances.sort(0);
        
      // Print all instances and their weights (and the sum of weights).
      System.out.println("\nInstances and their weights:\n");
      System.out.println(instances.instancesAndWeights());
      System.out.print("\nSum of weights: ");
      System.out.println(instances.sumOfWeights());
    } catch (Exception e) {
      e.printStackTrace(); 
    }
  }

  /**
   * Main method for this class. The following calls are possible:
   * <ul>
   *   <li>
   *     <code>weka.core.Instances</code> help<br/>
   *     prints a short list of possible commands.
   *   </li>
   *   <li>
   *     <code>weka.core.Instances</code> &lt;filename&gt;<br/>
   *     prints a summary of a set of instances.
   *   </li>
   *   <li>
   *     <code>weka.core.Instances</code> merge &lt;filename1&gt; &lt;filename2&gt;<br/>
   *     merges the two datasets (must have same number of instances) and
   *     outputs the results on stdout.
   *   </li>
   *   <li>
   *     <code>weka.core.Instances</code> append &lt;filename1&gt; &lt;filename2&gt;<br/>
   *     appends the second dataset to the first one (must have same headers) and
   *     outputs the results on stdout.
   *   </li>
   *   <li>
   *     <code>weka.core.Instances</code> randomize &lt;seed&gt; &lt;filename&gt;<br/>
   *     randomizes the dataset with the given seed and outputs the result on stdout.
   *   </li>
   * </ul>
   *
   * @param args  the commandline parameters
   */
02068   public static void main(String[] args) {

    try {
      Instances i;
      // read from stdin and print statistics
      if (args.length == 0) {
      DataSource source = new DataSource(System.in);
      i = source.getDataSet();
      System.out.println(i.toSummaryString());
      }
      // read file and print statistics
      else if ((args.length == 1) && (!args[0].equals("-h")) && (!args[0].equals("help"))) {
      DataSource source = new DataSource(args[0]);
      i = source.getDataSet();
      System.out.println(i.toSummaryString());
      }
      // read two files, merge them and print result to stdout
      else if ((args.length == 3) && (args[0].toLowerCase().equals("merge"))) {
      DataSource source1 = new DataSource(args[1]);
      DataSource source2 = new DataSource(args[2]);
      i = Instances.mergeInstances(source1.getDataSet(), source2.getDataSet());
      System.out.println(i);
      }
      // read two files, append them and print result to stdout
      else if ((args.length == 3) && (args[0].toLowerCase().equals("append"))) {
      DataSource source1 = new DataSource(args[1]);
      DataSource source2 = new DataSource(args[2]);
      if (!source1.getStructure().equalHeaders(source2.getStructure()))
        throw new Exception("The two datasets have different headers!");
      Instances structure = source1.getStructure();
      System.out.println(source1.getStructure());
      while (source1.hasMoreElements(structure))
        System.out.println(source1.nextElement(structure));
      structure = source2.getStructure();
      while (source2.hasMoreElements(structure))
        System.out.println(source2.nextElement(structure));
      }
      // read file and seed value, randomize data and print result to stdout
      else if ((args.length == 3) && (args[0].toLowerCase().equals("randomize"))) {
      DataSource source = new DataSource(args[2]);
      i = source.getDataSet();
      i.randomize(new Random(Integer.parseInt(args[1])));
      System.out.println(i);
      }
      // wrong parameters
      else {
      System.err.println(
          "\nUsage:\n"
          + "\tweka.core.Instances help\n"
          + "\tweka.core.Instances <filename>\n"
          + "\tweka.core.Instances merge <filename1> <filename2>\n"
          + "\tweka.core.Instances append <filename1> <filename2>\n"
          + "\tweka.core.Instances randomize <seed> <filename>\n"
      );
      System.exit(1);
      }
    }
    catch (Exception ex) {
      ex.printStackTrace();
      System.err.println(ex.getMessage());
    }
  }
  
  /**
   * Returns the revision string.
   * 
   * @return            the revision
   */
02136   public String getRevision() {
    return RevisionUtils.extract("$Revision: 1.76 $");
  }
}

Generated by  Doxygen 1.6.0   Back to index