Logo Search packages:      
Sourcecode: madlib version File versions  Download package

kd_split.cpp

//----------------------------------------------------------------------
// File:                kd_split.cpp
// Programmer:          Sunil Arya and David Mount
// Description:         Methods for splitting kd-trees
// Last modified: 01/04/05 (Version 1.0)
//----------------------------------------------------------------------
// Copyright (c) 1997-2005 University of Maryland and Sunil Arya and
// David Mount.  All Rights Reserved.
// 
// This software and related documentation is part of the Approximate
// Nearest Neighbor Library (ANN).  This software is provided under
// the provisions of the Lesser GNU Public License (LGPL).  See the
// file ../ReadMe.txt for further information.
// 
// The University of Maryland (U.M.) and the authors make no
// representations about the suitability or fitness of this software for
// any purpose.  It is provided "as is" without express or implied
// warranty.
//----------------------------------------------------------------------
// History:
//    Revision 0.1  03/04/98
//          Initial release
//    Revision 1.0  04/01/05
//----------------------------------------------------------------------

#include "kd_tree.h"                            // kd-tree definitions
#include "kd_util.h"                            // kd-tree utilities
#include "kd_split.h"                           // splitting functions

//----------------------------------------------------------------------
//    Constants
//----------------------------------------------------------------------

const double ERR = 0.001;                       // a small value
const double FS_ASPECT_RATIO = 3.0;       // maximum allowed aspect ratio
                                                            // in fair split. Must be >= 2.

//----------------------------------------------------------------------
//    kd_split - Bentley's standard splitting routine for kd-trees
//          Find the dimension of the greatest spread, and split
//          just before the median point along this dimension.
//----------------------------------------------------------------------

void kd_split(
      ANNpointArray           pa,                     // point array (permuted on return)
      ANNidxArray             pidx,             // point indices
      const ANNorthRect &bnds,                  // bounding rectangle for cell
      int                           n,                      // number of points
      int                           dim,              // dimension of space
      int                           &cut_dim,         // cutting dimension (returned)
      ANNcoord                &cut_val,         // cutting value (returned)
      int                           &n_lo)                  // num of points on low side (returned)
{
                                                            // find dimension of maximum spread
      cut_dim = annMaxSpread(pa, pidx, n, dim);
      n_lo = n/2;                                     // median rank
                                                            // split about median
      annMedianSplit(pa, pidx, n, cut_dim, cut_val, n_lo);
}

//----------------------------------------------------------------------
//    midpt_split - midpoint splitting rule for box-decomposition trees
//
//          This is the simplest splitting rule that guarantees boxes
//          of bounded aspect ratio.  It simply cuts the box with the
//          longest side through its midpoint.  If there are ties, it
//          selects the dimension with the maximum point spread.
//
//          WARNING: This routine (while simple) doesn't seem to work
//          well in practice in high dimensions, because it tends to
//          generate a large number of trivial and/or unbalanced splits.
//          Either kd_split(), sl_midpt_split(), or fair_split() are
//          recommended, instead.
//----------------------------------------------------------------------

void midpt_split(
      ANNpointArray           pa,                     // point array
      ANNidxArray             pidx,             // point indices (permuted on return)
      const ANNorthRect &bnds,                  // bounding rectangle for cell
      int                           n,                      // number of points
      int                           dim,              // dimension of space
      int                           &cut_dim,         // cutting dimension (returned)
      ANNcoord                &cut_val,         // cutting value (returned)
      int                           &n_lo)                  // num of points on low side (returned)
{
      int d;

      ANNcoord max_length = bnds.hi[0] - bnds.lo[0];
      for (d = 1; d < dim; d++) {               // find length of longest box side
            ANNcoord length = bnds.hi[d] - bnds.lo[d];
            if (length > max_length) {
                  max_length = length;
            }
      }
      ANNcoord max_spread = -1;                 // find long side with most spread
      for (d = 0; d < dim; d++) {
                                                            // is it among longest?
            if (double(bnds.hi[d] - bnds.lo[d]) >= (1-ERR)*max_length) {
                                                            // compute its spread
                  ANNcoord spr = annSpread(pa, pidx, n, d);
                  if (spr > max_spread) {       // is it max so far?
                        max_spread = spr;
                        cut_dim = d;
                  }
            }
      }
                                                            // split along cut_dim at midpoint
      cut_val = (bnds.lo[cut_dim] + bnds.hi[cut_dim]) / 2;
                                                            // permute points accordingly
      int br1, br2;
      annPlaneSplit(pa, pidx, n, cut_dim, cut_val, br1, br2);
      //------------------------------------------------------------------
      //    On return:        pa[0..br1-1] < cut_val
      //                            pa[br1..br2-1] == cut_val
      //                            pa[br2..n-1] > cut_val
      //
      //    We can set n_lo to any value in the range [br1..br2].
      //    We choose split so that points are most evenly divided.
      //------------------------------------------------------------------
      if (br1 > n/2) n_lo = br1;
      else if (br2 < n/2) n_lo = br2;
      else n_lo = n/2;
}

//----------------------------------------------------------------------
//    sl_midpt_split - sliding midpoint splitting rule
//
//          This is a modification of midpt_split, which has the nonsensical
//          name "sliding midpoint".  The idea is that we try to use the
//          midpoint rule, by bisecting the longest side.  If there are
//          ties, the dimension with the maximum spread is selected.  If,
//          however, the midpoint split produces a trivial split (no points
//          on one side of the splitting plane) then we slide the splitting
//          (maintaining its orientation) until it produces a nontrivial
//          split. For example, if the splitting plane is along the x-axis,
//          and all the data points have x-coordinate less than the x-bisector,
//          then the split is taken along the maximum x-coordinate of the
//          data points.
//
//          Intuitively, this rule cannot generate trivial splits, and
//          hence avoids midpt_split's tendency to produce trees with
//          a very large number of nodes.
//
//----------------------------------------------------------------------

void sl_midpt_split(
      ANNpointArray           pa,                     // point array
      ANNidxArray             pidx,             // point indices (permuted on return)
      const ANNorthRect &bnds,                  // bounding rectangle for cell
      int                           n,                      // number of points
      int                           dim,              // dimension of space
      int                           &cut_dim,         // cutting dimension (returned)
      ANNcoord                &cut_val,         // cutting value (returned)
      int                           &n_lo)                  // num of points on low side (returned)
{
      int d;

      ANNcoord max_length = bnds.hi[0] - bnds.lo[0];
      for (d = 1; d < dim; d++) {               // find length of longest box side
            ANNcoord length = bnds.hi[d] - bnds.lo[d];
            if (length > max_length) {
                  max_length = length;
            }
      }
      ANNcoord max_spread = -1;                 // find long side with most spread
      for (d = 0; d < dim; d++) {
                                                            // is it among longest?
            if ((bnds.hi[d] - bnds.lo[d]) >= (1-ERR)*max_length) {
                                                            // compute its spread
                  ANNcoord spr = annSpread(pa, pidx, n, d);
                  if (spr > max_spread) {       // is it max so far?
                        max_spread = spr;
                        cut_dim = d;
                  }
            }
      }
                                                            // ideal split at midpoint
      ANNcoord ideal_cut_val = (bnds.lo[cut_dim] + bnds.hi[cut_dim])/2;

      ANNcoord min, max;
      annMinMax(pa, pidx, n, cut_dim, min, max);      // find min/max coordinates

      if (ideal_cut_val < min)                  // slide to min or max as needed
            cut_val = min;
      else if (ideal_cut_val > max)
            cut_val = max;
      else
            cut_val = ideal_cut_val;

                                                            // permute points accordingly
      int br1, br2;
      annPlaneSplit(pa, pidx, n, cut_dim, cut_val, br1, br2);
      //------------------------------------------------------------------
      //    On return:        pa[0..br1-1] < cut_val
      //                            pa[br1..br2-1] == cut_val
      //                            pa[br2..n-1] > cut_val
      //
      //    We can set n_lo to any value in the range [br1..br2] to satisfy
      //    the exit conditions of the procedure.
      //
      //    if ideal_cut_val < min (implying br2 >= 1),
      //                then we select n_lo = 1 (so there is one point on left) and
      //    if ideal_cut_val > max (implying br1 <= n-1),
      //                then we select n_lo = n-1 (so there is one point on right).
      //    Otherwise, we select n_lo as close to n/2 as possible within
      //                [br1..br2].
      //------------------------------------------------------------------
      if (ideal_cut_val < min) n_lo = 1;
      else if (ideal_cut_val > max) n_lo = n-1;
      else if (br1 > n/2) n_lo = br1;
      else if (br2 < n/2) n_lo = br2;
      else n_lo = n/2;
}

//----------------------------------------------------------------------
//    fair_split - fair-split splitting rule
//
//          This is a compromise between the kd-tree splitting rule (which
//          always splits data points at their median) and the midpoint
//          splitting rule (which always splits a box through its center.
//          The goal of this procedure is to achieve both nicely balanced
//          splits, and boxes of bounded aspect ratio.
//
//          A constant FS_ASPECT_RATIO is defined. Given a box, those sides
//          which can be split so that the ratio of the longest to shortest
//          side does not exceed ASPECT_RATIO are identified.  Among these
//          sides, we select the one in which the points have the largest
//          spread. We then split the points in a manner which most evenly
//          distributes the points on either side of the splitting plane,
//          subject to maintaining the bound on the ratio of long to short
//          sides. To determine that the aspect ratio will be preserved,
//          we determine the longest side (other than this side), and
//          determine how narrowly we can cut this side, without causing the
//          aspect ratio bound to be exceeded (small_piece).
//
//          This procedure is more robust than either kd_split or midpt_split,
//          but is more complicated as well.  When point distribution is
//          extremely skewed, this degenerates to midpt_split (actually
//          1/3 point split), and when the points are most evenly distributed,
//          this degenerates to kd-split.
//----------------------------------------------------------------------

void fair_split(
      ANNpointArray           pa,                     // point array
      ANNidxArray             pidx,             // point indices (permuted on return)
      const ANNorthRect &bnds,                  // bounding rectangle for cell
      int                           n,                      // number of points
      int                           dim,              // dimension of space
      int                           &cut_dim,         // cutting dimension (returned)
      ANNcoord                &cut_val,         // cutting value (returned)
      int                           &n_lo)                  // num of points on low side (returned)
{
      int d;
      ANNcoord max_length = bnds.hi[0] - bnds.lo[0];
      cut_dim = 0;
      for (d = 1; d < dim; d++) {               // find length of longest box side
            ANNcoord length = bnds.hi[d] - bnds.lo[d];
            if (length > max_length) {
                  max_length = length;
                  cut_dim = d;
            }
      }

      ANNcoord max_spread = 0;                  // find legal cut with max spread
      cut_dim = 0;
      for (d = 0; d < dim; d++) {
            ANNcoord length = bnds.hi[d] - bnds.lo[d];
                                                            // is this side midpoint splitable
                                                            // without violating aspect ratio?
            if (((double) max_length)*2.0/((double) length) <= FS_ASPECT_RATIO) {
                                                            // compute spread along this dim
                  ANNcoord spr = annSpread(pa, pidx, n, d);
                  if (spr > max_spread) {       // best spread so far
                        max_spread = spr;
                        cut_dim = d;                  // this is dimension to cut
                  }
            }
      }

      max_length = 0;                                 // find longest side other than cut_dim
      for (d = 0; d < dim; d++) {
            ANNcoord length = bnds.hi[d] - bnds.lo[d];
            if (d != cut_dim && length > max_length)
                  max_length = length;
      }
                                                            // consider most extreme splits
      ANNcoord small_piece = max_length / FS_ASPECT_RATIO;
      ANNcoord lo_cut = bnds.lo[cut_dim] + small_piece;// lowest legal cut
      ANNcoord hi_cut = bnds.hi[cut_dim] - small_piece;// highest legal cut

      int br1, br2;
                                                            // is median below lo_cut ?
      if (annSplitBalance(pa, pidx, n, cut_dim, lo_cut) >= 0) {
            cut_val = lo_cut;                   // cut at lo_cut
            annPlaneSplit(pa, pidx, n, cut_dim, cut_val, br1, br2);
            n_lo = br1;
      }
                                                            // is median above hi_cut?
      else if (annSplitBalance(pa, pidx, n, cut_dim, hi_cut) <= 0) {
            cut_val = hi_cut;                   // cut at hi_cut
            annPlaneSplit(pa, pidx, n, cut_dim, cut_val, br1, br2);
            n_lo = br2;
      }
      else {                                                // median cut preserves asp ratio
            n_lo = n/2;                               // split about median
            annMedianSplit(pa, pidx, n, cut_dim, cut_val, n_lo);
      }
}

//----------------------------------------------------------------------
//    sl_fair_split - sliding fair split splitting rule
//
//          Sliding fair split is a splitting rule that combines the
//          strengths of both fair split with sliding midpoint split.
//          Fair split tends to produce balanced splits when the points
//          are roughly uniformly distributed, but it can produce many
//          trivial splits when points are highly clustered.  Sliding
//          midpoint never produces trivial splits, and shrinks boxes
//          nicely if points are highly clustered, but it may produce
//          rather unbalanced splits when points are unclustered but not
//          quite uniform.
//
//          Sliding fair split is based on the theory that there are two
//          types of splits that are "good": balanced splits that produce
//          fat boxes, and unbalanced splits provided the cell with fewer
//          points is fat.
//
//          This splitting rule operates by first computing the longest
//          side of the current bounding box.  Then it asks which sides
//          could be split (at the midpoint) and still satisfy the aspect
//          ratio bound with respect to this side.    Among these, it selects
//          the side with the largest spread (as fair split would).      It
//          then considers the most extreme cuts that would be allowed by
//          the aspect ratio bound.  This is done by dividing the longest
//          side of the box by the aspect ratio bound.      If the median cut
//          lies between these extreme cuts, then we use the median cut.
//          If not, then consider the extreme cut that is closer to the
//          median.      If all the points lie to one side of this cut, then
//          we slide the cut until it hits the first point.  This may
//          violate the aspect ratio bound, but will never generate empty
//          cells.      However the sibling of every such skinny cell is fat,
//          and hence packing arguments still apply.
//
//----------------------------------------------------------------------

void sl_fair_split(
      ANNpointArray           pa,                     // point array
      ANNidxArray             pidx,             // point indices (permuted on return)
      const ANNorthRect &bnds,                  // bounding rectangle for cell
      int                           n,                      // number of points
      int                           dim,              // dimension of space
      int                           &cut_dim,         // cutting dimension (returned)
      ANNcoord                &cut_val,         // cutting value (returned)
      int                           &n_lo)                  // num of points on low side (returned)
{
      int d;
      ANNcoord min, max;                              // min/max coordinates
      int br1, br2;                                   // split break points

      ANNcoord max_length = bnds.hi[0] - bnds.lo[0];
      cut_dim = 0;
      for (d = 1; d < dim; d++) {               // find length of longest box side
            ANNcoord length = bnds.hi[d] - bnds.lo[d];
            if (length  > max_length) {
                  max_length = length;
                  cut_dim = d;
            }
      }

      ANNcoord max_spread = 0;                  // find legal cut with max spread
      cut_dim = 0;
      for (d = 0; d < dim; d++) {
            ANNcoord length = bnds.hi[d] - bnds.lo[d];
                                                            // is this side midpoint splitable
                                                            // without violating aspect ratio?
            if (((double) max_length)*2.0/((double) length) <= FS_ASPECT_RATIO) {
                                                            // compute spread along this dim
                  ANNcoord spr = annSpread(pa, pidx, n, d);
                  if (spr > max_spread) {       // best spread so far
                        max_spread = spr;
                        cut_dim = d;                  // this is dimension to cut
                  }
            }
      }

      max_length = 0;                                 // find longest side other than cut_dim
      for (d = 0; d < dim; d++) {
            ANNcoord length = bnds.hi[d] - bnds.lo[d];
            if (d != cut_dim && length > max_length)
                  max_length = length;
      }
                                                            // consider most extreme splits
      ANNcoord small_piece = max_length / FS_ASPECT_RATIO;
      ANNcoord lo_cut = bnds.lo[cut_dim] + small_piece;// lowest legal cut
      ANNcoord hi_cut = bnds.hi[cut_dim] - small_piece;// highest legal cut
                                                            // find min and max along cut_dim
      annMinMax(pa, pidx, n, cut_dim, min, max);
                                                            // is median below lo_cut?
      if (annSplitBalance(pa, pidx, n, cut_dim, lo_cut) >= 0) {
            if (max > lo_cut) {                       // are any points above lo_cut?
                  cut_val = lo_cut;             // cut at lo_cut
                  annPlaneSplit(pa, pidx, n, cut_dim, cut_val, br1, br2);
                  n_lo = br1;                         // balance if there are ties
            }
            else {                                          // all points below lo_cut
                  cut_val = max;                      // cut at max value
                  annPlaneSplit(pa, pidx, n, cut_dim, cut_val, br1, br2);
                  n_lo = n-1;
            }
      }
                                                            // is median above hi_cut?
      else if (annSplitBalance(pa, pidx, n, cut_dim, hi_cut) <= 0) {
            if (min < hi_cut) {                       // are any points below hi_cut?
                  cut_val = hi_cut;             // cut at hi_cut
                  annPlaneSplit(pa, pidx, n, cut_dim, cut_val, br1, br2);
                  n_lo = br2;                         // balance if there are ties
            }
            else {                                          // all points above hi_cut
                  cut_val = min;                      // cut at min value
                  annPlaneSplit(pa, pidx, n, cut_dim, cut_val, br1, br2);
                  n_lo = 1;
            }
      }
      else {                                                // median cut is good enough
            n_lo = n/2;                               // split about median
            annMedianSplit(pa, pidx, n, cut_dim, cut_val, n_lo);
      }
}

Generated by  Doxygen 1.6.0   Back to index