Skip to content

Commit

Permalink
Merge pull request #428 from karlnapf/master
Browse files Browse the repository at this point in the history
subset stacks
  • Loading branch information
karlnapf committed Apr 8, 2012
2 parents 79a91ad + fdd6fd0 commit b7b8742
Show file tree
Hide file tree
Showing 21 changed files with 350 additions and 122 deletions.
Expand Up @@ -36,7 +36,7 @@ int main(int argc, char **argv)
CMath::display_vector(feature_subset.vector, feature_subset.vlen,
"feature subset");

f->set_subset(new CSubset(feature_subset));
f->add_subset(new CSubset(feature_subset));
SG_SPRINT("feature vectors after setting subset on original data:\n");
for (index_t i=0; i<f->get_num_vectors(); ++i)
{
Expand Down
Expand Up @@ -64,7 +64,7 @@ int main(int argc, char **argv)
"feature subset");

/* set subset and print data */
f->set_subset(new CSubset(feature_subset));
f->add_subset(new CSubset(feature_subset));
SG_SPRINT("feature vectors after setting subset on original data:\n");
for (index_t i=0; i<f->get_num_vectors(); ++i)
{
Expand Down
Expand Up @@ -63,7 +63,7 @@ int main(int argc, char **argv)
CMath::display_vector(feature_subset.vector, feature_subset.vlen,
"feature subset");

f->set_subset(new CSubset(feature_subset));
f->add_subset(new CSubset(feature_subset));
SG_SPRINT("feature vectors after setting subset on original data:\n");
for (index_t i=0; i<f->get_num_vectors(); ++i)
{
Expand Down
Expand Up @@ -80,7 +80,7 @@ int main(int argc, char **argv)
SG_SPRINT("\n\n-------------------\n"
"applying subset to features\n"
"-------------------\n");
features->set_subset(new CSubset(subset_idx));
features->add_subset(new CSubset(subset_idx));

/* do some stuff do check and output */
ASSERT(features->get_num_vectors()==num_subset_idx);
Expand Down Expand Up @@ -110,7 +110,7 @@ int main(int argc, char **argv)
SG_SPRINT("\n\n-------------------\n"
"removing subset from features\n"
"-------------------\n");
features->remove_subset();
features->remove_all_subsets();

/* do some stuff do check and output */
ASSERT(features->get_num_vectors()==num_vectors);
Expand Down Expand Up @@ -143,3 +143,4 @@ int main(int argc, char **argv)

return 0;
}

Expand Up @@ -75,7 +75,7 @@ int main(int argc, char **argv)
SG_SPRINT("\n-------------------\n"
"applying subset to features\n"
"-------------------\n");
features->set_subset(new CSubset(subset_idx));
features->add_subset(new CSubset(subset_idx));

/* do some stuff do check and output */
ASSERT(features->get_num_vectors()==num_subset_idx);
Expand Down Expand Up @@ -114,7 +114,7 @@ int main(int argc, char **argv)
SG_SPRINT("\n-------------------\n"
"removing subset from features\n"
"-------------------\n");
features->remove_subset();
features->remove_all_subsets();

/* do some stuff do check and output */
ASSERT(features->get_num_vectors()==num_vectors);
Expand Down
1 change: 1 addition & 0 deletions src/NEWS
Expand Up @@ -19,6 +19,7 @@
KernelMachines now make use of that by not recomputing kernel matrix in cross-validation.
- Cross-validation for KernelMachines is now parallelized.
- Cross-validation is now possible with custom kernels.
- Features may now have arbitrarily many index subsets (of subsets (of subsets (...)))
* Bugfixes:
- Fix for bug in the Gaussian Naive Bayes classifier, its domain was
changed to log-space.
Expand Down
35 changes: 15 additions & 20 deletions src/shogun/evaluation/CrossValidation.cpp
Expand Up @@ -272,42 +272,37 @@ float64_t CCrossValidation::evaluate_one_run()
for (index_t i=0; i <num_subsets; ++i)
{
/* set feature subset for training */
SGVector<index_t> inverse_subset_indices =
SGVector<index_t> inverse_subset_indices=
m_splitting_strategy->generate_subset_inverse(i);
m_features->set_subset(new CSubset(inverse_subset_indices));
CSubset* training_subset=new CSubset(inverse_subset_indices);
m_features->add_subset(training_subset);

/* set label subset for training (copy data before) */
SGVector<index_t> inverse_subset_indices_copy(
inverse_subset_indices.vlen);
memcpy(inverse_subset_indices_copy.vector,
inverse_subset_indices.vector,
inverse_subset_indices.vlen * sizeof(index_t));
m_labels->set_subset(new CSubset(inverse_subset_indices_copy));
/* set label subset for training */
m_labels->set_subset(training_subset);

/* train machine on training features */
/* train machine on training features and remove subset */
m_machine->train(m_features);
m_features->remove_subset();

/* set feature subset for testing (subset method that stores pointer) */
SGVector<index_t> subset_indices =
m_splitting_strategy->generate_subset_indices(i);
m_features->set_subset(new CSubset(subset_indices));
CSubset* test_subset=new CSubset(subset_indices);
m_features->add_subset(test_subset);

/* set label subset for testing */
m_labels->set_subset(test_subset);

/* apply machine to test features */
/* apply machine to test features and remove subset */
CLabels* result_labels=m_machine->apply(m_features);
m_features->remove_subset();
SG_REF(result_labels);

/* set label subset for testing (copy data before) */
SGVector<index_t> subset_indices_copy(subset_indices.vlen);
memcpy(subset_indices_copy.vector, subset_indices.vector,
subset_indices.vlen * sizeof(index_t));
m_labels->set_subset(new CSubset(subset_indices_copy));

/* evaluate */
results[i]=m_evaluation_criterion->evaluate(result_labels, m_labels);

/* clean up, reset subsets */
/* clean up, remove subsets */
SG_UNREF(result_labels);
m_features->remove_subset();
m_labels->remove_subset();
}
}
Expand Down
2 changes: 1 addition & 1 deletion src/shogun/evaluation/SplittingStrategy.h
Expand Up @@ -80,7 +80,7 @@ class CSplittingStrategy: public CSGObject

/** Abstract method.
* Has to refill the elements of the m_subset_indices variable with concrete
* indices. Note that virtual const char* get_name() const = 0;CDynamicArray<index_t> instances for every subset are
* indices. Note that CDynamicArray<index_t> instances for every subset are
* created in the constructor of this class - they just have to be filled.
*/
virtual void build_subsets()=0;
Expand Down
125 changes: 105 additions & 20 deletions src/shogun/features/Features.cpp
Expand Up @@ -50,27 +50,32 @@ CFeatures::CFeatures(CFile* loader)
CFeatures::~CFeatures()
{
clean_preprocessors();
delete m_subset;
SG_UNREF(m_active_subset);
SG_UNREF(m_subset_stack);
}

void
CFeatures::init()
{
m_parameters->add(&properties, "properties",
"Feature properties.");
m_parameters->add(&cache_size, "cache_size",
"Size of cache in MB.");
SG_ADD(&properties, "properties", "Feature properties", MS_NOT_AVAILABLE);
SG_ADD(&cache_size, "cache_size", "Size of cache in MB", MS_NOT_AVAILABLE);

m_parameters->add_vector((CSGObject***) &preproc,
&num_preproc, "preproc",
"List of preprocessors.");
m_parameters->add_vector(&preprocessed,
&num_preproc, "preprocessed",
"Feature[i] is already preprocessed.");
/* TODO, use SGVector for arrays to be able to use SG_ADD macro */
m_parameters->add_vector((CSGObject***) &preproc, &num_preproc, "preproc",
"List of preprocessors");
m_parameters->add_vector(&preprocessed, &num_preproc, "preprocessed",
"Feature[i] is already preprocessed");

m_parameters->add((CSGObject**)&m_subset, "subset", "Subset object");
SG_ADD((CSGObject**)&m_active_subset, "active_subset", "Subset object",
MS_NOT_AVAILABLE);

m_subset=NULL;
SG_ADD((CSGObject**)&m_subset_stack, "subset_stack",
"Stack of subsets of indices", MS_NOT_AVAILABLE);

m_subset_stack=new CList(true);
SG_REF(m_subset_stack);

m_active_subset=NULL;
properties = FP_NONE;
cache_size = 0;
preproc = NULL;
Expand Down Expand Up @@ -349,22 +354,102 @@ void CFeatures::unset_property(EFeatureProperty p)
properties &= (properties | p) ^ p;
}

void CFeatures::set_subset(CSubset* subset)
void CFeatures::add_subset(CSubset* subset)
{
SG_UNREF(m_subset);
m_subset=subset;
SG_REF(subset);
/* do some basic consistency checks */
if (!subset)
SG_ERROR("CFeatures::add_subset(NULL) is illegal.\n");

/* check for legal size (only possible if there is already a subset) */
if (has_subsets())
{
index_t available=m_active_subset->get_size();
if (subset->get_size()>available)
SG_ERROR("Pushed subset contains more indices than available.\n");

if (subset->get_max_index()>= available)
SG_ERROR("Pushed subset contains index out of bounds (too large).\n");
}

m_subset_stack->push(subset);
update_active_subset();
subset_changed_post();
}

bool CFeatures::has_subset() const
bool CFeatures::has_subsets() const
{
return m_subset!=NULL;
return m_subset_stack->get_num_elements();
}

void CFeatures::remove_subset()
{
set_subset(NULL);
m_subset_stack->pop();
update_active_subset();
subset_changed_post();
}

/** Removes every subset from the subset stack at once.
 *
 * Empties m_subset_stack, then rebuilds the active subset via
 * update_active_subset() and finally calls subset_changed_post().
 */
void CFeatures::remove_all_subsets()
{
/* drop all stacked subsets */
m_subset_stack->delete_all_elements();
/* the active subset is derived from the stack, so it has to be rebuilt */
update_active_subset();
/* hook called after every subset change (presumably lets subclasses react;
 * its implementation is not visible here) */
subset_changed_post();
}

/** Rebuilds m_active_subset from the subsets stored in m_subset_stack.
 *
 * - Empty stack: no subset is active (m_active_subset is NULL).
 * - One subset: that subset itself becomes the active one.
 * - Several subsets: the indices of the most recently pushed subset are
 *   translated, one stack level at a time, through every subset below it
 *   until they refer to the underlying data. The result is stored in a newly
 *   created CSubset which has as many elements as the most recently pushed
 *   subset.
 */
void CFeatures::update_active_subset()
{
	/* delete active subset and rebuild from subset stack */
	SG_UNREF(m_active_subset);

	/* important since the stack might be empty, or this might be the first
	 * subset on the stack */
	m_active_subset=NULL;

	index_t num_subsets=m_subset_stack->get_num_elements();
	if (!num_subsets)
		return;

	/* if there is only one subset, use that as current active */
	if (num_subsets==1)
	{
		/* this automatically SG_REFs */
		m_active_subset=(CSubset*)m_subset_stack->get_first_element();
		return;
	}

	/* current_indices will contain the "real" indices which are translated
	 * iteratively through all stacked subsets. start with last subset */
	CSubset* current_subset=(CSubset*)m_subset_stack->get_last_element();
	SGVector<index_t> current_indices=SGVector<index_t>(
			current_subset->get_size());
	for (index_t i=0; i<current_indices.vlen; ++i)
		current_indices.vector[i]=current_subset->subset_idx_conversion(i);

	SG_UNREF(current_subset);
	current_subset=(CSubset*)m_subset_stack->get_previous_element();

	/* now the remaining subsets, walking from the last towards the first */
	while (current_subset)
	{
		/* exactly one translated index per currently active index; the
		 * subset being translated through may be larger than that */
		SGVector<index_t> new_indices=SGVector<index_t>(
				current_indices.vlen);

		/* translate current real indices through current subset */
		for (index_t i=0; i<current_indices.vlen; ++i)
		{
			new_indices.vector[i]=current_subset->subset_idx_conversion(
					current_indices.vector[i]);
		}

		/* replace current real indices */
		current_indices.destroy_vector();
		current_indices=SGVector<index_t>(new_indices);

		/* keep moving towards the first subset; stepping to the next
		 * element here would move back towards the last one */
		SG_UNREF(current_subset);
		current_subset=(CSubset*)m_subset_stack->get_previous_element();
	}

	m_active_subset=new CSubset(current_indices);
	SG_REF(m_active_subset);
}

CFeatures* CFeatures::copy_subset(SGVector<index_t> indices)
Expand Down

0 comments on commit b7b8742

Please sign in to comment.