Skip to content

Commit

Permalink
Merge pull request #318 from frx/streaming_vw
Browse files Browse the repository at this point in the history
Better documentation and some issues resolved in StreamingSparseFeatures
  • Loading branch information
Soeren Sonnenburg committed Aug 28, 2011
2 parents 04d6769 + 5219f4d commit 0deb8ca
Showing 1 changed file with 40 additions and 20 deletions.
60 changes: 40 additions & 20 deletions src/shogun/features/StreamingSparseFeatures.h
Expand Up @@ -18,10 +18,27 @@

namespace shogun
{
/** @brief This class implements streaming features with dense feature vectors.
/** @brief This class implements streaming features with sparse feature vectors.
* The vector is represented as an SGSparseVector<T>. Each entry is of type
* SGSparseVectorEntry<T> with members `feat_index' and `entry'.
*
* This class expects the input from the StreamingFile object to be zero-based,
* i.e., a feature entered as 1:6.5 would have feat_index=0 and entry=6.5.
*
* The current example is stored as a combination of current_vector
* and current_label.
* current_num_features stores the highest dimensionality of examples encountered
* upto the point of the function call.
* For example, if the first example is '1:6.5 7:10.0', then current_num_features
* would be 7 after the first function call.
*
* Since the dimensionality of the feature space is not immediately known initially,
* current_num_features may increase as more examples are processed and larger
* dimensions are seen.
* For this purpose, `expand_if_required()' is provided which when called with a
* dynamically allocated float or double array and the length, reallocates that
* array to the new dimensionality (if necessary), setting the newer dimensions
* to zero, and updates the length parameter to equal the new length of the array.
*/
template <class T> class CStreamingSparseFeatures : public CStreamingDotFeatures
{
Expand Down Expand Up @@ -208,11 +225,11 @@ template <class T> class CStreamingSparseFeatures : public CStreamingDotFeatures
inline virtual void expand_if_required(float32_t*& vec, int32_t &len)
{
int32_t dim = get_dim_feature_space();
if (dim+1 > len)
if (dim > len)
{
vec = SG_REALLOC(float32_t, vec, dim+1);
memset(&vec[len], 0, (dim+1-len) * sizeof(float32_t));
len = dim+1;
vec = SG_REALLOC(float32_t, vec, dim);
memset(&vec[len], 0, (dim-len) * sizeof(float32_t));
len = dim;
}
}

Expand All @@ -227,11 +244,11 @@ template <class T> class CStreamingSparseFeatures : public CStreamingDotFeatures
inline virtual void expand_if_required(float64_t*& vec, int32_t &len)
{
int32_t dim = get_dim_feature_space();
if (dim+1 > len)
if (dim > len)
{
vec = SG_REALLOC(float64_t, vec, dim+1);
memset(&vec[len], 0, (dim+1-len) * sizeof(float64_t));
len = dim+1;
vec = SG_REALLOC(float64_t, vec, dim);
memset(&vec[len], 0, (dim-len) * sizeof(float64_t));
len = dim;
}
}

Expand Down Expand Up @@ -487,6 +504,8 @@ template <class T> class CStreamingSparseFeatures : public CStreamingDotFeatures

/**
* Ensure features of the current vector are in ascending order.
* It modifies the current_vector in-place, though a temporary
* vector is created and later freed.
*/
void sort_features()
{
Expand All @@ -511,15 +530,17 @@ template <class T> class CStreamingSparseFeatures : public CStreamingDotFeatures
for (int32_t i=0; i<len; i++)
sf_new[i]=sf_orig[orig_idx[i]];

current_vector=sf_new;

// sanity check
for (int32_t i=0; i<len-1; i++)
ASSERT(sf_new[i].feat_index<sf_new[i+1].feat_index);

// Copy new vector back to original
for (int32_t i=0; i<len; i++)
sf_orig[i]=sf_new[i];

SG_FREE(orig_idx);
SG_FREE(feat_idx);
SG_FREE(sf_orig);
SG_FREE(sf_new);
}

/**
Expand Down Expand Up @@ -591,7 +612,7 @@ template <class T> class CStreamingSparseFeatures : public CStreamingDotFeatures
* Initializes members to null values.
* current_length is set to -1.
*/
void init();
virtual void init();

/**
* Calls init, and also initializes the parser with the given args.
Expand All @@ -600,13 +621,9 @@ template <class T> class CStreamingSparseFeatures : public CStreamingDotFeatures
* @param is_labelled whether labelled or not
* @param size number of examples in the parser's ring
*/
void init(CStreamingFile *file, bool is_labelled, int32_t size);
virtual void init(CStreamingFile *file, bool is_labelled, int32_t size);

protected:

/// feature weighting in combined dot features
float32_t combined_weight;

/// The parser object, which reads from input and returns parsed example objects.
CInputParser< SGSparseVectorEntry<T> > parser;

Expand Down Expand Up @@ -707,15 +724,18 @@ bool CStreamingSparseFeatures<T>::get_next_example()
current_length,
current_label);

if (!ret_value)
return false;

// Update number of features based on highest index
for (int32_t i=0; i<current_length; i++)
{
if (current_vector[i].feat_index > current_num_features)
current_num_features = current_vector[i].feat_index+1;
}

current_vec_index++;
return ret_value;

return true;
}

template <class T>
Expand Down

0 comments on commit 0deb8ca

Please sign in to comment.