Merge pull request #426 from iglesias/knn-covertree
CoverTree integration for KNN
Soeren Sonnenburg committed Apr 6, 2012
2 parents aa97c39 + fbc2f8f commit 8d1fd5d
Showing 4 changed files with 220 additions and 41 deletions.
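
The user-facing change is a single boolean toggle on CKNN. As a minimal usage sketch (hedged: only KNN.cpp is shown on this page, so the setter name is assumed from the KNN.h half of the PR):

    // Hedged sketch -- set_use_covertree() is an assumed accessor from KNN.h.
    CKNN* knn = new CKNN(3, distance, train_labels);
    knn->set_use_covertree(true); // train() then builds the tree, apply() queries it
    knn->train();
    CLabels* predictions = knn->apply();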
185 changes: 150 additions & 35 deletions src/shogun/classifier/KNN.cpp
@@ -7,6 +7,7 @@
* Written (W) 2006 Christian Gehl
* Written (W) 2006-2009 Soeren Sonnenburg
* Written (W) 2011 Sergey Lisitsyn
+ * Written (W) 2012 Fernando José Iglesias García, cover tree support
* Copyright (C) 2011 Berlin Institute of Technology and Max-Planck-Society
*/

@@ -34,9 +35,9 @@ CKNN::CKNN(int32_t k, CDistance* d, CLabels* trainlab)
ASSERT(d);
ASSERT(trainlab);

-set_distance(d);
-set_labels(trainlab);
-train_labels.vlen=trainlab->get_num_labels();
+set_distance(d);
+set_labels(trainlab);
+train_labels.vlen=trainlab->get_num_labels();
}

void CKNN::init()
@@ -47,16 +48,27 @@ void CKNN::init()

m_k=3;
m_q=1.0;
+m_use_covertree=false;
num_classes=0;

-m_parameters->add(&m_k, "k", "Parameter k");
-m_parameters->add(&m_q, "q", "Parameter q");
-m_parameters->add(&num_classes, "num_classes", "Number of classes");
+m_covertree=NULL;
+m_built_covertree=false;
+
+/** TODO not really sure here if these two first guys should be MS_AVAILABLE or
+ * MS_NOT_AVAILABLE
+ */
+SG_ADD(&m_k, "m_k", "Parameter k", MS_AVAILABLE);
+SG_ADD(&m_q, "m_q", "Parameter q", MS_AVAILABLE);
+SG_ADD(&m_use_covertree, "m_use_covertree", "Parameter use_covertree", MS_NOT_AVAILABLE);
+SG_ADD(&m_built_covertree, "m_built_covertree", "Parameter built_covertree", MS_NOT_AVAILABLE);
+SG_ADD(&num_classes, "num_classes", "Number of classes", MS_NOT_AVAILABLE);
+SG_ADD((CSGObject**) &m_covertree, "m_covertree", "Member cover tree", MS_NOT_AVAILABLE);
}
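
A note on the registration above: SG_ADD registers a member with shogun's parameter framework for serialization, and the MS_* flag only controls whether model selection may tune it, so the TODO is really just about the two genuine hyperparameters.

    // Reading of the flags (shogun parameter framework of this era):
    // MS_AVAILABLE     -> candidate for model-selection tuning (m_k, m_q)
    // MS_NOT_AVAILABLE -> serialized only (flags, counts, the tree itself)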

CKNN::~CKNN()
{
SG_FREE(train_labels.vector);
+if ( m_use_covertree )
+	delete m_covertree;
}

bool CKNN::train_machine(CFeatures* data)
@@ -80,21 +92,54 @@ bool CKNN::train_machine(CFeatures* data)
int32_t max_class=train_labels.vector[0];
int32_t min_class=train_labels.vector[0];

-int32_t i;
-for (i=1; i<train_labels.vlen; i++)
+for (int32_t i=1; i<train_labels.vlen; i++)
{
max_class=CMath::max(max_class, train_labels.vector[i]);
min_class=CMath::min(min_class, train_labels.vector[i]);
}

-for (i=0; i<train_labels.vlen; i++)
+for (int32_t i=0; i<train_labels.vlen; i++)
train_labels.vector[i]-=min_class;

min_label=min_class;
num_classes=max_class-min_class+1;

SG_INFO( "num_classes: %d (%+d to %+d) num_train: %d\n", num_classes,
min_class, max_class, train_labels.vlen);

+// If cover tree is to be used, populate it with training vectors
+// assuming that distance(train_vectors, train_vectors)
+if ( m_use_covertree )
+{
+	// Ensure that distance has the same features lhs and rhs
+	if ( ! distance->lhs_equals_rhs() )
+	{
+		SG_ERROR("Features lhs and rhs must be equal to train KNN "
+				"with covertree support\n");
+	}
+
+	// Look for the max distance among training vectors
+	float64_t max_dist = 0.0;
+	for (int32_t i=0; i<train_labels.vlen; i++)
+	{
+		for (int32_t j=i+1; j<train_labels.vlen; j++)
+			max_dist = CMath::max(max_dist, distance->distance(i, j));
+	}
+
+	// Create cover tree
+	m_covertree = new CoverTree<KNN_COVERTREE_POINT>(max_dist);
+
+	// Insert training vectors
+	for (int32_t i=0; i<train_labels.vlen; i++)
+		m_covertree->insert(KNN_COVERTREE_POINT(i, distance));
+
+	m_built_covertree = true;
+}
+else
+{
+	m_built_covertree = false;
+}

return true;
}
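
Two details in the training path: seeding the tree needs the maximum pairwise distance, an O(n^2) scan that precedes the insertions, and the tree stores only indices, delegating the metric to the shared CDistance (hence the lhs_equals_rhs() check). KNN_COVERTREE_POINT is defined in the KNN.h part of this PR; a plausible sketch of the wrapper, assuming the cover-tree template needs no more than a pairwise distance and equality (m_point_index is the one member visible in this diff, the rest is assumed):

    // Hedged sketch; the real definition lives in KNN.h, not shown here.
    struct KNN_COVERTREE_POINT
    {
        KNN_COVERTREE_POINT(int32_t index, CDistance* d)
        : m_point_index(index), m_distance(d) {}

        // Both points index into the same lhs feature matrix, which is why
        // train_machine() insists on distance->lhs_equals_rhs().
        float64_t distance(const KNN_COVERTREE_POINT& p) const
        { return m_distance->distance(m_point_index, p.m_point_index); }

        bool operator==(const KNN_COVERTREE_POINT& p) const
        { return p.m_point_index == m_point_index; }

        int32_t m_point_index;
        CDistance* m_distance;
    };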

@@ -104,14 +149,30 @@ CLabels* CKNN::apply()
ASSERT(distance);
ASSERT(distance->get_num_vec_rhs());

+if ( m_use_covertree && ! m_built_covertree )
+	SG_ERROR("The cover tree must have been built during training to use "
+			"it for classification\n");

int32_t num_lab=distance->get_num_vec_rhs();
ASSERT(m_k<=num_lab);

CLabels* output=new CLabels(num_lab);

-float64_t* dists=SG_MALLOC(float64_t, train_labels.vlen);
-int32_t* train_lab=SG_MALLOC(int32_t, train_labels.vlen);
+//vector of neighbors used for the cover tree support
+int32_t* nearest_neighbors;
+//distances to train data and working buffer of train_labels
+float64_t* dists;
+int32_t* train_lab;
+if ( ! m_use_covertree )
+{
+	dists=SG_MALLOC(float64_t, train_labels.vlen);
+	train_lab=SG_MALLOC(int32_t, train_labels.vlen);
+}
+else
+{
+	train_lab=SG_MALLOC(int32_t, m_k);
+	nearest_neighbors=SG_MALLOC(int32_t, m_k);
+}

SG_INFO( "%d test examples\n", num_lab);
CSignal::clear_cancel();
@@ -123,23 +184,34 @@
{
SG_PROGRESS(i, 0, num_lab);

-// lhs idx 1..n and rhs idx i
-distances_lhs(dists,0,train_labels.vlen-1,i);
-int32_t j;
-
-for (j=0; j<train_labels.vlen; j++)
-	train_lab[j]=train_labels.vector[j];
-
-//sort the distance vector for test example j to all train examples
-//classes[1..k] then holds the classes for minimum distance
-CMath::qsort_index(dists, train_lab, train_labels.vlen);
+if ( ! m_use_covertree )
+{
+	//lhs idx 1..n and rhs idx i
+	distances_lhs(dists,0,train_labels.vlen-1,i);
+
+	for (int32_t j=0; j<train_labels.vlen; j++)
+		train_lab[j]=train_labels.vector[j];
+
+	//sort the distance vector for test example j to all train examples
+	//classes[1..k] then holds the classes for minimum distance
+	CMath::qsort_index(dists, train_lab, train_labels.vlen);
+}
+else
+{
+	//get the k nearest neighbors to test vector i using the cover tree
+	get_neighbors(nearest_neighbors, i);
+
+	//translate from indices to labels of the nearest neighbors
+	for (int32_t j=0; j<m_k; j++)
+		train_lab[j]=train_labels.vector[ nearest_neighbors[j] ];
+}

//compute histogram of class outputs of the first k nearest neighbours
-for (j=0; j<num_classes; j++)
+for (int32_t j=0; j<num_classes; j++)
classes[j]=0.0;

float64_t multiplier = m_q;
-for (j=0; j<m_k; j++)
+for (int32_t j=0; j<m_k; j++)
{
classes[train_lab[j]]+= multiplier;
multiplier*= multiplier;
@@ -149,7 +221,7 @@
int32_t out_idx=0;
float64_t out_max=0;

-for (j=0; j<num_classes; j++)
+for (int32_t j=0; j<num_classes; j++)
{
if (out_max< classes[j])
{
@@ -161,8 +233,11 @@
}

SG_FREE(classes);
-SG_FREE(dists);
SG_FREE(train_lab);
+if ( ! m_use_covertree )
+	SG_FREE(dists);
+else
+	SG_FREE(nearest_neighbors);

return output;
}
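
One subtlety in the (unchanged) voting loop above: multiplier is squared on every iteration, so the j-th nearest neighbour contributes weight q^(2^j) rather than q^j. The weights therefore decay very quickly with rank, which is also why the cover-tree branch must fill train_lab nearest-first.

    // Worked example of the vote weights for q = 0.5, k = 3:
    // j = 0: classes[train_lab[0]] += 0.5     (q)
    // j = 1: classes[train_lab[1]] += 0.25    (q^2)
    // j = 2: classes[train_lab[2]] += 0.0625  (q^4)
    // With the default q = 1.0 all k neighbours vote equally.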
@@ -229,14 +304,30 @@ SGMatrix<int32_t> CKNN::classify_for_multiple_k()
ASSERT(distance);
ASSERT(distance->get_num_vec_rhs());

+if ( m_use_covertree && ! m_built_covertree )
+	SG_ERROR("The cover tree must have been built during training to use "
+			"it for classification\n");

int32_t num_lab=distance->get_num_vec_rhs();
ASSERT(m_k<=num_lab);

int32_t* output=SG_MALLOC(int32_t, m_k*num_lab);

-float64_t* dists=SG_MALLOC(float64_t, train_labels.vlen);
-int32_t* train_lab=SG_MALLOC(int32_t, train_labels.vlen);
+//vector of neighbors used for the cover tree support
+int32_t* nearest_neighbors;
+//distances to train data and working buffer of train_labels
+float64_t* dists;
+int32_t* train_lab;
+if ( ! m_use_covertree )
+{
+	dists=SG_MALLOC(float64_t, train_labels.vlen);
+	train_lab=SG_MALLOC(int32_t, train_labels.vlen);
+}
+else
+{
+	train_lab=SG_MALLOC(int32_t, m_k);
+	nearest_neighbors=SG_MALLOC(int32_t, m_k);
+}

///histogram of classes and returned output
int32_t* classes=SG_MALLOC(int32_t, num_classes);
@@ -248,14 +339,26 @@ SGMatrix<int32_t> CKNN::classify_for_multiple_k()
{
SG_PROGRESS(i, 0, num_lab);

-// lhs idx 1..n and rhs idx i
-distances_lhs(dists,0,train_labels.vlen-1,i);
-for (int32_t j=0; j<train_labels.vlen; j++)
-	train_lab[j]=train_labels.vector[j];
-
-//sort the distance vector for test example j to all train examples
-//classes[1..k] then holds the classes for minimum distance
-CMath::qsort_index(dists, train_lab, train_labels.vlen);
+if ( ! m_use_covertree )
+{
+	// lhs idx 1..n and rhs idx i
+	distances_lhs(dists,0,train_labels.vlen-1,i);
+	for (int32_t j=0; j<train_labels.vlen; j++)
+		train_lab[j]=train_labels.vector[j];
+
+	//sort the distance vector for test example j to all train examples
+	//classes[1..k] then holds the classes for minimum distance
+	CMath::qsort_index(dists, train_lab, train_labels.vlen);
+}
+else
+{
+	//get the k nearest neighbors to test vector i using the cover tree
+	get_neighbors(nearest_neighbors, i);
+
+	//translate from indices to labels of the nearest neighbors
+	for (int32_t j=0; j<m_k; j++)
+		train_lab[j]=train_labels.vector[ nearest_neighbors[j] ];
+}

//compute histogram of class outputs of the first k nearest neighbours
for (int32_t j=0; j<num_classes; j++)
Expand All @@ -281,11 +384,14 @@ SGMatrix<int32_t> CKNN::classify_for_multiple_k()
}
}

-SG_FREE(dists);
SG_FREE(train_lab);
SG_FREE(classes);
+if ( ! m_use_covertree )
+	SG_FREE(dists);
+else
+	SG_FREE(nearest_neighbors);

-return SGMatrix<int32_t>(output,num_lab,m_k);
+return SGMatrix<int32_t>(output,num_lab,m_k,true);
}
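
The trailing true added to the SGMatrix constructor presumably sets the struct's do_free flag so the matrix takes ownership of output and releases it on destruction (an assumption; SGMatrix is not part of this diff). A hedged caller sketch, assuming the column-major layout suggested by the (num_lab, m_k) dimensions, with entry (i, j) the prediction for example i using j+1 neighbours; true_labels is hypothetical and assumed to live in the same min_label-shifted space:

    SGMatrix<int32_t> preds = knn->classify_for_multiple_k();
    for (int32_t j = 0; j < preds.num_cols; j++)   // j+1 = neighbours used
    {
        int32_t correct = 0;
        for (int32_t i = 0; i < preds.num_rows; i++)
            correct += (preds.matrix[j*preds.num_rows + i] == true_labels[i]);
        // accuracy for k = j+1: correct / (float64_t) preds.num_rows
    }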

void CKNN::init_distance(CFeatures* data)
@@ -316,6 +422,15 @@ bool CKNN::save(FILE* dstfile)
return false;
}

+void CKNN::get_neighbors(int32_t* out, int32_t idx)
+{
+	std::vector<KNN_COVERTREE_POINT> neighbors =
+		m_covertree->kNearestNeighbors(KNN_COVERTREE_POINT(idx, distance), m_k);
+
+	for (std::size_t m=0; m<unsigned(m_k); m++)
+		out[m] = neighbors[m].m_point_index;
+}
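
get_neighbors leans on an ordering contract: kNearestNeighbors is assumed to return exactly m_k points sorted nearest-first, since the q-weighted voting in apply() keys each weight to the position in train_lab. A defensive variant under that assumption (hypothetical, not part of the commit):

    void CKNN::get_neighbors(int32_t* out, int32_t idx)
    {
        std::vector<KNN_COVERTREE_POINT> neighbors =
            m_covertree->kNearestNeighbors(KNN_COVERTREE_POINT(idx, distance), m_k);

        // Guard the assumed contract; otherwise the loop below would read
        // past the end of neighbors.
        ASSERT(neighbors.size() == size_t(m_k));

        for (std::size_t m = 0; m < neighbors.size(); m++)
            out[m] = neighbors[m].m_point_index;
    }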

void CKNN::store_model_features()
{
CFeatures* d_lhs=distance->get_lhs();
