Skip to content

Commit

Permalink
Merge pull request #658 from karlnapf/master
Browse files Browse the repository at this point in the history
cross-validation for combined features aka MKL
  • Loading branch information
karlnapf committed Jul 18, 2012
2 parents 46b4d2c + 4f86bd8 commit 01d6292
Show file tree
Hide file tree
Showing 14 changed files with 221 additions and 20 deletions.
1 change: 1 addition & 0 deletions src/interfaces/modular/Features.i
Expand Up @@ -20,6 +20,7 @@
/* These functions return new Objects */
%newobject get_transposed();
%newobject create_merged_copy(CFeatures* other);
%newobject copy_subset(SGVector<index_t> indices);

#ifdef USE_SWIG_DIRECTORS
%feature("director") shogun::CDirectorDotFeatures;
Expand Down
6 changes: 5 additions & 1 deletion src/shogun/classifier/mkl/MKL.cpp
Expand Up @@ -196,7 +196,11 @@ bool CMKL::train_machine(CFeatures* data)
if (data)
{
if (m_labels->get_num_labels() != data->get_num_vectors())
SG_ERROR("Number of training vectors does not match number of labels\n");
{
SG_ERROR("%s::train_machine(): Number of training vectors (%d) does"
" not match number of labels (%d)\n", get_name(),
data->get_num_vectors(), m_labels->get_num_labels());
}
kernel->init(data, data);
}

Expand Down
7 changes: 5 additions & 2 deletions src/shogun/classifier/mkl/MKLMulticlass.cpp
Expand Up @@ -329,8 +329,11 @@ bool CMKLMulticlass::train_machine(CFeatures* data)
if (data)
{
if (m_labels->get_num_labels() != data->get_num_vectors())
SG_ERROR("Number of training vectors does not match number of "
"labels\n");
{
SG_ERROR("%s::train_machine(): Number of training vectors (%d) does"
" not match number of labels (%d)\n", get_name(),
data->get_num_vectors(), m_labels->get_num_labels());
}
m_kernel->init(data, data);
}

Expand Down
6 changes: 5 additions & 1 deletion src/shogun/classifier/svm/CPLEXSVM.cpp
Expand Up @@ -39,7 +39,11 @@ bool CCPLEXSVM::train_machine(CFeatures* data)
if (data)
{
if (m_labels->get_num_labels() != data->get_num_vectors())
SG_ERROR("Number of training vectors does not match number of labels\n");
{
SG_ERROR("%s::train_machine(): Number of training vectors (%d) does"
" not match number of labels (%d)\n", get_name(),
data->get_num_vectors(), m_labels->get_num_labels());
}
kernel->init(data, data);
}

Expand Down
6 changes: 5 additions & 1 deletion src/shogun/classifier/svm/LibSVM.cpp
Expand Up @@ -40,7 +40,11 @@ bool CLibSVM::train_machine(CFeatures* data)
if (data)
{
if (m_labels->get_num_labels() != data->get_num_vectors())
SG_ERROR("Number of training vectors does not match number of labels\n");
{
SG_ERROR("%s::train_machine(): Number of training vectors (%d) does"
" not match number of labels (%d)\n", get_name(),
data->get_num_vectors(), m_labels->get_num_labels());
}
kernel->init(data, data);
}

Expand Down
6 changes: 5 additions & 1 deletion src/shogun/classifier/svm/SVMLight.cpp
Expand Up @@ -208,7 +208,11 @@ bool CSVMLight::train_machine(CFeatures* data)
if (data)
{
if (m_labels->get_num_labels() != data->get_num_vectors())
SG_ERROR("Number of training vectors does not match number of labels\n");
{
SG_ERROR("%s::train_machine(): Number of training vectors (%d) does"
" not match number of labels (%d)\n", get_name(),
data->get_num_vectors(), m_labels->get_num_labels());
}
kernel->init(data, data);
}

Expand Down
139 changes: 135 additions & 4 deletions src/shogun/features/CombinedFeatures.cpp
Expand Up @@ -6,11 +6,14 @@
*
* Written (W) 1999-2009 Soeren Sonnenburg
* Written (W) 1999-2008 Gunnar Raetsch
* Written (W) 2012 Heiko Strathmann
* Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
*/

#include <shogun/features/CombinedFeatures.h>
#include <shogun/io/SGIO.h>
#include <shogun/lib/Set.h>
#include <shogun/lib/Map.h>

using namespace shogun;

Expand Down Expand Up @@ -153,8 +156,11 @@ bool CCombinedFeatures::insert_feature_obj(CFeatures* obj)
ASSERT(obj);
int32_t n=obj->get_num_vectors();

if (num_vec>0 && n!=num_vec)
SG_ERROR("Number of feature vectors does not match (expected %d, obj has %d)\n", num_vec, n);
if (get_num_vectors()>0 && n!=get_num_vectors())
{
SG_ERROR("Number of feature vectors does not match (expected %d, "
"obj has %d)\n", get_num_vectors(), n);
}

num_vec=n;
return feature_list->insert_element(obj);
Expand All @@ -165,8 +171,11 @@ bool CCombinedFeatures::append_feature_obj(CFeatures* obj)
ASSERT(obj);
int32_t n=obj->get_num_vectors();

if (num_vec>0 && n!=num_vec)
SG_ERROR("Number of feature vectors does not match (expected %d, obj has %d)\n", num_vec, n);
if (get_num_vectors()>0 && n!=get_num_vectors())
{
SG_ERROR("Number of feature vectors does not match (expected %d, "
"obj has %d)\n", get_num_vectors(), n);
}

num_vec=n;
return feature_list->append_element(obj);
Expand Down Expand Up @@ -199,6 +208,10 @@ void CCombinedFeatures::init()

CFeatures* CCombinedFeatures::create_merged_copy(CFeatures* other)
{
/* TODO, if all features are the same, only one copy should be created
* in memory */
SG_WARNING("Heiko Strathmann: FIXME, unefficient!\n");

SG_DEBUG("entering %s::create_merged_copy()\n", get_name());
if (get_feature_type()!=other->get_feature_type() ||
get_feature_class()!=other->get_feature_class() ||
Expand Down Expand Up @@ -238,3 +251,121 @@ CFeatures* CCombinedFeatures::create_merged_copy(CFeatures* other)
SG_DEBUG("leaving %s::create_merged_copy()\n", get_name());
return result;
}

/** Pushes the given index subset onto every distinct sub-feature object and
 * onto this object's own subset stack, then calls subset_changed_post().
 *
 * A sub-feature instance that appears more than once in the feature list
 * receives the subset only once.
 *
 * @param subset subset of indices to add
 */
void CCombinedFeatures::add_subset(SGVector<index_t> subset)
{
	SG_DEBUG("entering %s::add_subset()\n", get_name());

	/* instances which already received the subset */
	CSet<CFeatures*>* visited=new CSet<CFeatures*>();

	for (CFeatures* feat=get_first_feature_obj(); feat!=NULL;
			feat=get_next_feature_obj())
	{
		const bool first_visit=!visited->contains(feat);
		if (first_visit)
		{
			/* forward the subset and remember this instance */
			feat->add_subset(subset);
			visited->add(feat);
			SG_DEBUG("adding subset to %s at %p\n",
					feat->get_name(), feat);
		}

		/* release the reference obtained from the list getter */
		SG_UNREF(feat);
	}

	/* mirror the subset on the local stack for easy access */
	m_subset_stack->add_subset(subset);

	subset_changed_post();
	SG_UNREF(visited);
	SG_DEBUG("leaving %s::add_subset()\n", get_name());
}

void CCombinedFeatures::remove_subset()
{
SG_DEBUG("entering %s::remove_subset()\n", get_name());
CSet<CFeatures*>* processed=new CSet<CFeatures*>();

CFeatures* current=get_first_feature_obj();
while (current)
{
if (!processed->contains(current))
{
/* remember that subset was added here */
current->remove_subset();
processed->add(current);
SG_DEBUG("removing subset from %s at %p\n",
current->get_name(), current);
}
SG_UNREF(current);
current=get_next_feature_obj();
}

/* also remove subset from local stack to have it for easy access */
m_subset_stack->remove_subset();

subset_changed_post();
SG_UNREF(processed);
SG_DEBUG("leaving %s::remove_subset()\n", get_name());
}

void CCombinedFeatures::remove_all_subsets()
{
SG_DEBUG("entering %s::remove_all_subsets()\n", get_name());
CSet<CFeatures*>* processed=new CSet<CFeatures*>();

CFeatures* current=get_first_feature_obj();
while (current)
{
if (!processed->contains(current))
{
/* remember that subset was added here */
current->remove_all_subsets();
processed->add(current);
SG_DEBUG("removing all subsets from %s at %p\n",
current->get_name(), current);
}
SG_UNREF(current);
current=get_next_feature_obj();
}

/* also remove subsets from local stack to have it for easy access */
m_subset_stack->remove_all_subsets();

subset_changed_post();
SG_UNREF(processed);
SG_DEBUG("leaving %s::remove_all_subsets()\n", get_name());
}

/** Creates a new combined features instance whose sub-features are the
 * results of calling copy_subset() with the given indices on each
 * sub-feature of this instance.
 *
 * A sub-feature instance that appears more than once in the feature list is
 * copied only once; the same copy is appended for every occurrence.
 *
 * @param indices indices of feature elements to copy
 * @return new, already SG_REF'ed CCombinedFeatures instance
 */
CFeatures* CCombinedFeatures::copy_subset(SGVector<index_t> indices)
{
	/* this is returned with the results of copy_subset of sub-features */
	CCombinedFeatures* result=new CCombinedFeatures();

	/* maps original sub-feature -> its copy, to copy each instance once.
	 * NOTE(review): the map stores plain pointers; the copies are presumably
	 * kept alive by the references held by result's feature list - confirm */
	CMap<CFeatures*, CFeatures*>* processed=new CMap<CFeatures*, CFeatures*>();

	CFeatures* current=get_first_feature_obj();
	while (current)
	{
		CFeatures* new_element=NULL;

		/* only copy if not done yet, otherwise, use old copy */
		if (!processed->contains(current))
		{
			/* copy_subset() hands back an already SG_REF'ed object */
			new_element=current->copy_subset(indices);
			processed->add(current, new_element);
		}
		else
		{
			new_element=processed->get_element(current);

			/* balance the SG_UNREF below for a re-used copy */
			SG_REF(new_element);
		}

		/* add to result; assumes the feature list takes its own reference
		 * on append (callers have to SG_UNREF what the list getters
		 * return) - TODO confirm against CList */
		result->append_feature_obj(new_element);

		/* drop the local reference so that only the list owns the copy;
		 * without this every copied sub-feature is leaked */
		SG_UNREF(new_element);

		SG_UNREF(current);
		current=get_next_feature_obj();
	}

	SG_UNREF(processed);

	SG_REF(result);
	return result;
}
41 changes: 40 additions & 1 deletion src/shogun/features/CombinedFeatures.h
Expand Up @@ -6,6 +6,7 @@
*
* Written (W) 1999-2009 Soeren Sonnenburg
* Written (W) 1999-2008 Gunnar Raetsch
* Written (W) 2012 Heiko Strathmann
* Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
*/

Expand All @@ -27,6 +28,11 @@ class CListElement;
* It keeps pointers to the added sub-features and is especially useful to
* combine kernels working on different domains (c.f. CCombinedKernel) and to
* combine kernels looking at independent features.
*
* Subsets are supported: all subset actions are passed through to all
* sub-features, but only once per sub-feature instance, i.e. if two
* sub-features are the same instance, the subset action is performed on
* that instance only once.
*/
class CCombinedFeatures : public CFeatures
{
Expand Down Expand Up @@ -69,7 +75,8 @@ class CCombinedFeatures : public CFeatures
*/
inline virtual int32_t get_num_vectors() const
{
return num_vec;
return m_subset_stack->has_subsets()
? m_subset_stack->get_size() : num_vec;
}

/** get memory footprint of one feature
Expand Down Expand Up @@ -158,6 +165,38 @@ class CCombinedFeatures : public CFeatures
*/
CFeatures* create_merged_copy(CFeatures* other);

/** adds a subset of indices on top of the current subsets (possibly
* subset of subset). Calls subset_changed_post() afterwards.
* Adds the subset to all sub-features
*
* @param subset subset of indices to add
* */
virtual void add_subset(SGVector<index_t> subset);

/** removes the last added subset from the subset stack, if existing.
* Calls subset_changed_post() afterwards
*
* Removes the subset from all sub-features
* */
virtual void remove_subset();

/** removes all subsets
* Calls subset_changed_post() afterwards
*
* Removes all subsets of all sub-features
* */
virtual void remove_all_subsets();

/** Creates a new CFeatures instance containing copies of the elements
* which are specified by the provided indices.
* Simply creates a combined features instance where all sub-features
* are the results of their copy_subset calls
*
* @param indices indices of feature elements to copy
* @return new CFeatures instance with copies of feature data
*/
virtual CFeatures* copy_subset(SGVector<index_t> indices);

/** @return object name */
inline virtual const char* get_name() const { return "CombinedFeatures"; }

Expand Down
4 changes: 3 additions & 1 deletion src/shogun/features/DenseFeatures.cpp
Expand Up @@ -557,7 +557,9 @@ template<class ST> CFeatures* CDenseFeatures<ST>::copy_subset(SGVector<index_t>
num_features*sizeof(ST));
}

return new CDenseFeatures(feature_matrix_copy);
CFeatures* result=new CDenseFeatures(feature_matrix_copy);
SG_REF(result);
return result;
}

template<class ST> ST* CDenseFeatures<ST>::compute_feature_vector(int32_t num, int32_t& len,
Expand Down
6 changes: 3 additions & 3 deletions src/shogun/features/Features.cpp
Expand Up @@ -368,8 +368,8 @@ void CFeatures::remove_all_subsets()

CFeatures* CFeatures::copy_subset(SGVector<index_t> indices)
{
SG_ERROR("copy_subset and therefore model storage of CMachine "
"(required for cross-validation and model-selection is ",
"not yet implemented for feature type %s\n", get_name());
SG_ERROR("%s::copy_subset(): copy_subset and therefore model storage of "
"CMachine (required for cross-validation and model-selection is "
"not yet implemented yet. Ask developers!\n", get_name());
return NULL;
}
4 changes: 3 additions & 1 deletion src/shogun/features/SparseFeatures.cpp
Expand Up @@ -1070,7 +1070,9 @@ template<class ST> CFeatures* CSparseFeatures<ST>::copy_subset(SGVector<index_t>
free_sparse_feature_vector(index);
}

return new CSparseFeatures<ST>(matrix_copy);
CFeatures* result=new CSparseFeatures<ST>(matrix_copy);
SG_REF(result);
return result;
}

template<class ST> SGSparseVectorEntry<ST>* CSparseFeatures<ST>::compute_sparse_feature_vector(int32_t num,
Expand Down
2 changes: 2 additions & 0 deletions src/shogun/features/StringFeatures.cpp
Expand Up @@ -1611,6 +1611,8 @@ template<class ST> CFeatures* CStringFeatures<ST>::copy_subset(SGVector<index_t>
/* max string length may have changed */
result->determine_maximum_string_length();

SG_REF(result);

return result;
}

Expand Down
8 changes: 6 additions & 2 deletions src/shogun/features/SubsetStack.cpp
Expand Up @@ -54,15 +54,19 @@ void CSubsetStack::add_subset(SGVector<index_t> subset)
CSubset* latest=(CSubset*)m_active_subsets_stack->get_last_element();
if (subset.vlen>latest->m_subset_idx.vlen)
{
SG_ERROR("Error in %s::add_subset(): Provided index vector is "
subset.display_vector("subset");
latest->m_subset_idx.display_vector("last on stack");
SG_ERROR("%s::add_subset(): Provided index vector is "
"larger than the subsets on the stubset stack!\n", get_name());
}

/* check for range of indices */
index_t max_index=SGVector<index_t>::max(subset.vector, subset.vlen);
if (max_index>=latest->m_subset_idx.vlen)
{
SG_ERROR("Error in %s::add_subset(): Provided index vector contains"
subset.display_vector("subset");
latest->m_subset_idx.display_vector("last on stack");
SG_ERROR("%s::add_subset(): Provided index vector contains"
" indices larger than possible range!\n", get_name());
}

Expand Down
5 changes: 3 additions & 2 deletions src/shogun/multiclass/GMNPSVM.cpp
Expand Up @@ -58,9 +58,10 @@ bool CGMNPSVM::train_machine(CFeatures* data)

if (data)
{
if (data->get_num_vectors() != m_labels->get_num_labels())
if (m_labels->get_num_labels() != data->get_num_vectors())
{
SG_ERROR("Numbert of vectors (%d) does not match number of labels (%d)\n",
SG_ERROR("%s::train_machine(): Number of training vectors (%d) does"
" not match number of labels (%d)\n", get_name(),
data->get_num_vectors(), m_labels->get_num_labels());
}
m_kernel->init(data, data);
Expand Down

0 comments on commit 01d6292

Please sign in to comment.