Skip to content

Commit

Permalink
Merge pull request #467 from iglesias/repath-jlcovertree
Browse files Browse the repository at this point in the history
John Langford's Cover Tree Integration
  • Loading branch information
Soeren Sonnenburg committed Apr 18, 2012
2 parents a8231f2 + 44aa7bc commit 3d4d21f
Show file tree
Hide file tree
Showing 8 changed files with 1,383 additions and 222 deletions.
437 changes: 263 additions & 174 deletions src/shogun/classifier/KNN.cpp

Large diffs are not rendered by default.

65 changes: 17 additions & 48 deletions src/shogun/classifier/KNN.h
Expand Up @@ -19,14 +19,11 @@
#include <shogun/io/SGIO.h>
#include <shogun/features/Features.h>
#include <shogun/distance/Distance.h>
#include <shogun/lib/CoverTree.h>
#include <shogun/machine/DistanceMachine.h>

namespace shogun
{

class KNN_COVERTREE_POINT;

class CDistanceMachine;

/** @brief Class KNN, an implementation of the standard k-nearest neigbor
Expand Down Expand Up @@ -192,18 +189,23 @@ class CKNN : public CDistanceMachine
*/
virtual bool train_machine(CFeatures* data=NULL);

/** get the m_k nearest neighbors to the test vector indexed by "idx"
* these neighbors are found within the set of training vectors. This
* function makes use of cover tree support
*
* @param idx index of the test vector for which to find NN
* @return out vector with indices to the NN
*/
virtual void get_neighbors(int32_t* out, int32_t idx);

private:
void init();

/** compute the histogram of class outputs of the first k nearest
* neighbors to a test vector and return the index of the most
* frequent class
*
* @param classes vector used to store the histogram
* @param train_lab class indices of the training data. If the cover
* tree is not used, the elements are ordered by increasing distance
* and there are elements for each of the training vectors. If the cover
* tree is used, it contains just m_k elements not necessary ordered.
*
* @return index of the most frequent class, class detected by KNN
*/
int32_t choose_class(float64_t* classes, int32_t* train_lab);

protected:
/// the k parameter in KNN
int32_t m_k;
Expand All @@ -214,47 +216,14 @@ class CKNN : public CDistanceMachine
/// parameter to enable cover tree support
bool m_use_covertree;

/// parameter to tell if the cover tree has been built during training
bool m_built_covertree;

/// class member cover tree
CoverTree<KNN_COVERTREE_POINT>* m_covertree;

/// number of classes (i.e. number of values labels can take)
int32_t num_classes;
int32_t m_num_classes;

/// smallest label, i.e. -1
int32_t min_label;
int32_t m_min_label;

/** the actual trainlabels */
SGVector<int32_t> train_labels;
};

class KNN_COVERTREE_POINT
{
public:

KNN_COVERTREE_POINT(int32_t index, CDistance* knncp_distance)
{
m_point_index = index;
m_distance = knncp_distance;
}

inline double distance(const KNN_COVERTREE_POINT& p) const
{
return m_distance->distance(p.m_point_index, m_point_index);
}

inline bool operator==(const KNN_COVERTREE_POINT& p) const
{
return (p.m_point_index == m_point_index);
}

public:

int32_t m_point_index;

CDistance* m_distance;
SGVector<int32_t> m_train_labels;
};

}
Expand Down
21 changes: 21 additions & 0 deletions src/shogun/distance/Distance.cpp
Expand Up @@ -154,6 +154,27 @@ CFeatures* CDistance::replace_rhs(CFeatures* r)
return tmp;
}

CFeatures* CDistance::replace_lhs(CFeatures* l)
{
//make sure features were indeed supplied
ASSERT(l);

//make sure features are compatible
ASSERT(rhs->get_feature_class()==l->get_feature_class());
ASSERT(rhs->get_feature_type()==l->get_feature_type());

//remove references to previous rhs features
CFeatures* tmp=lhs;

lhs=l;

SG_FREE(precomputed_matrix);
precomputed_matrix=NULL ;

// return old features including reference count
return tmp;
}

void CDistance::do_precompute_matrix()
{
int32_t num_left=lhs->get_num_vectors();
Expand Down
29 changes: 29 additions & 0 deletions src/shogun/distance/Distance.h
Expand Up @@ -133,6 +133,25 @@ class CDistance : public CSGObject
return compute(idx_a, idx_b);
}

/** get distance function for lhs feature vector a
* and rhs feature vector b. The computation of the
* distance stops if the intermediate result is
* larger than upper_bound. This is useful to use
* with John Langford's Cover Tree and it is ONLY
* implemented for Euclidian distance
*
* @param idx_a feature vector a at idx_a
* @param idx_b feature vector b at idx_b
* @param upper_bound value above which the computation
* halts
* @return distance value or upper_bound
*/
virtual float64_t distance_upper_bounded(int32_t idx_a, int32_t idx_b, float64_t upper_bound)
{
return distance(idx_a, idx_b);
}


/** get distance matrix
*
* @return computed distance matrix (needs to be cleaned up)
Expand Down Expand Up @@ -210,6 +229,16 @@ class CDistance : public CSGObject
*/
CFeatures* replace_rhs(CFeatures* rhs);

/** replace left-hand side features used in distance matrix
*
* make sure to check that your distance can deal with the
* supplied features (!)
*
* @param lhs features of right-hand side
* @return replaced left-hand side features
*/
CFeatures* replace_lhs(CFeatures* rhs);

/** remove lhs and rhs from distance */
virtual void remove_lhs_and_rhs();

Expand Down
41 changes: 41 additions & 0 deletions src/shogun/distance/EuclidianDistance.cpp
Expand Up @@ -74,3 +74,44 @@ void CEuclidianDistance::init()

m_parameters->add(&disable_sqrt, "disable_sqrt", "If sqrt shall not be applied.");
}

float64_t CEuclidianDistance::distance_upper_bounded(int32_t idx_a, int32_t idx_b, float64_t upper_bound)
{
int32_t alen, blen;
bool afree, bfree;
float64_t result=0;

upper_bound *= upper_bound;

float64_t* avec=((CSimpleFeatures<float64_t>*) lhs)->
get_feature_vector(idx_a, alen, afree);
float64_t* bvec=((CSimpleFeatures<float64_t>*) rhs)->
get_feature_vector(idx_b, blen, bfree);
ASSERT(alen==blen);

for (int32_t i=0; i<alen; i++)
{
result+=CMath::sq(avec[i] - bvec[i]);

if (result > upper_bound)
{
((CSimpleFeatures<float64_t>*) lhs)->
free_feature_vector(avec, idx_a, afree);
((CSimpleFeatures<float64_t>*) rhs)->
free_feature_vector(bvec, idx_b, bfree);

if (disable_sqrt)
return result;
else
return CMath::sqrt(result);
}
}

((CSimpleFeatures<float64_t>*) lhs)->free_feature_vector(avec, idx_a, afree);
((CSimpleFeatures<float64_t>*) rhs)->free_feature_vector(bvec, idx_b, bfree);

if (disable_sqrt)
return result;
else
return CMath::sqrt(result);
}
14 changes: 14 additions & 0 deletions src/shogun/distance/EuclidianDistance.h
Expand Up @@ -97,6 +97,20 @@ class CEuclidianDistance: public CRealDistance
*/
virtual void set_disable_sqrt(bool state) { disable_sqrt=state; };

/** compute the distance between lhs feature vector a
* and rhs feature vector b. The computation of the
* distance stops if the intermediate result is
* larger than upper_bound. This is useful to use
* with John Langford's Cover Tree
*
* @param idx_a feature vector a at idx_a
* @param idx_b feature vector b at idx_b
* @param upper_bound value above which the computation
* halts
* @return distance value or upper_bound
*/
virtual float64_t distance_upper_bounded(int32_t idx_a, int32_t idx_b, float64_t upper_bound);

protected:
/// compute kernel function for features a and b
/// idx_{a,b} denote the index of the feature vectors
Expand Down

0 comments on commit 3d4d21f

Please sign in to comment.