GNB: replace SGVector with SGMatrix to make the code easier to read.
pluskid committed Apr 9, 2012
1 parent b7b8742 commit 1179258
Showing 3 changed files with 32 additions and 27 deletions.
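The change is mechanical but worth spelling out: the per-class, per-feature parameters were stored in an SGVector and addressed with a hand-computed flat offset (m_means.vector[m_dim*label + j]); the SGMatrix version exposes the same m_dim x m_num_classes storage through a two-index accessor (m_means(j, label)). The substitutions below only preserve behavior if SGMatrix lays its elements out column-major, so that (row, col) resolves to offset num_rows*col + row. A minimal standalone sketch of that equivalence, with a plain array standing in for the Shogun types and hypothetical names dim, num_classes, l_idx, f_idx:

#include <cassert>
#include <cstdio>

int main()
{
	const int dim = 3;         // features per example (num_rows after the change)
	const int num_classes = 2; // classes (num_cols after the change)
	double means[dim * num_classes] = {0};

	int l_idx = 1, f_idx = 2;

	// Old SGVector-style access: the caller computes the flat offset itself.
	means[dim * l_idx + f_idx] = 0.5;

	// New SGMatrix-style access: operator()(row, col) on a column-major
	// matrix with num_rows == dim resolves to the identical offset.
	double read_back = means[l_idx * dim + f_idx]; // what m_means(f_idx, l_idx) computes
	assert(read_back == 0.5);
	printf("%f\n", read_back); // prints 0.500000
	return 0;
}

The refactor therefore changes no arithmetic; it only moves the offset computation out of every call site, which is exactly the readability gain the commit message claims.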
@@ -17,6 +17,13 @@ def classifier_gaussiannaivebayes_modular(fm_train_real=traindat,fm_test_real=te
gnb=GaussianNaiveBayes(feats_train, labels)
gnb_train = gnb.train()
output=gnb.apply(feats_test).get_labels()
+ import numpy as np
+ output_prev = np.loadtxt('/tmp/output.txt')
+ if np.all(output == output_prev):
+     print "The result is the same as before!"
+ else:
+     print "Oops, implementation changed!"
+
return gnb, gnb_train, output

if __name__=='__main__':
46 changes: 22 additions & 24 deletions src/shogun/classifier/GaussianNaiveBayes.cpp
@@ -41,9 +41,9 @@ CGaussianNaiveBayes::~CGaussianNaiveBayes()
{
SG_UNREF(m_features);

- m_means.destroy_vector();
+ m_means.destroy_matrix();
m_rates.destroy_vector();
- m_variances.destroy_vector();
+ m_variances.destroy_matrix();
m_label_prob.destroy_vector();
};

@@ -99,11 +99,13 @@ bool CGaussianNaiveBayes::train(CFeatures* data)
m_dim = m_features->get_dim_feature_space();

// allocate memory for distributions' parameters and a priori probability
- m_means.vector = SG_MALLOC(float64_t, m_num_classes*m_dim);
- m_means.vlen = m_num_classes*m_dim;
+ m_means.matrix = SG_MALLOC(float64_t, m_num_classes*m_dim);
+ m_means.num_rows = m_dim;
+ m_means.num_cols = m_num_classes;

- m_variances.vector = SG_MALLOC(float64_t, m_num_classes*m_dim);
- m_variances.vlen = m_num_classes*m_dim;
+ m_variances.matrix = SG_MALLOC(float64_t, m_num_classes*m_dim);
+ m_variances.num_rows = m_dim;
+ m_variances.num_cols = m_num_classes;

m_label_prob.vector = SG_MALLOC(float64_t, m_num_classes);
m_label_prob.vlen = m_num_classes;
@@ -113,30 +115,24 @@ bool CGaussianNaiveBayes::train(CFeatures* data)
m_rates.vlen = m_num_classes;

// assure that memory is allocated
- ASSERT(m_means.vector);
- ASSERT(m_variances.vector);
+ ASSERT(m_means.matrix);
+ ASSERT(m_variances.matrix);
ASSERT(m_rates.vector);
ASSERT(m_label_prob.vector);

// make arrays filled by zeros before using
- for (i=0;i<m_num_classes*m_dim;i++)
- {
- m_means.vector[i] = 0.0;
- m_variances.vector[i] = 0.0;
- }
- for (i=0;i<m_num_classes;i++)
- {
- m_label_prob.vector[i] = 0.0;
- m_rates.vector[i] = 0.0;
- }
+ m_means.zero();
+ m_variances.zero();
+ m_label_prob.zero();
+ m_rates.zero();

SGMatrix<float64_t> feature_matrix = m_features->get_computed_dot_feature_matrix();

// get sum of features among labels
for (i=0; i<train_labels.vlen; i++)
{
for (j=0; j<m_dim; j++)
- m_means.vector[m_dim*train_labels.vector[i]+j]+=feature_matrix.matrix[i*m_dim+j];
+ m_means(j, train_labels.vector[i]) += feature_matrix.matrix[i*m_dim+j];

m_label_prob.vector[train_labels.vector[i]]+=1.0;
}
@@ -145,22 +141,24 @@ bool CGaussianNaiveBayes::train(CFeatures* data)
for (i=0; i<m_num_classes; i++)
{
for (j=0; j<m_dim; j++)
- m_means.vector[m_dim*i+j] /= m_label_prob.vector[i];
+ m_means(j, i) /= m_label_prob.vector[i];
}

// compute squared residuals with means available
for (i=0; i<train_labels.vlen; i++)
{
for (j=0; j<m_dim; j++)
- m_variances.vector[m_dim*train_labels.vector[i]+j]+=
- CMath::sq(feature_matrix.matrix[i*m_dim+j]-m_means.vector[m_dim*train_labels.vector[i]+j]);
+ {
+ m_variances(j, train_labels.vector[i]) +=
+ CMath::sq(feature_matrix.matrix[i*m_dim+j]-m_means(j, train_labels.vector[i]));
+ }
}

// get variance of features of labels
for (i=0; i<m_num_classes; i++)
{
for (j=0; j<m_dim; j++)
- m_variances.vector[m_dim*i+j] /= m_label_prob.vector[i] > 1 ? m_label_prob.vector[i]-1 : 1;
+ m_variances(j, i) /= m_label_prob.vector[i] > 1 ? m_label_prob.vector[i]-1 : 1;
}

// get a priori probabilities of labels
@@ -225,7 +223,7 @@ float64_t CGaussianNaiveBayes::apply(int32_t idx)

// product all conditional gaussian probabilities
for (k=0; k<m_dim; k++)
- m_rates.vector[i]+= CMath::log(normal_exp(feature_vector.vector[k],i,k)/CMath::sqrt(m_variances.vector[i*m_dim+k]));
+ m_rates.vector[i]+= CMath::log(normal_exp(feature_vector.vector[k],i,k)/CMath::sqrt(m_variances(k, i)));
}

// find label with maximum rate
6 changes: 3 additions & 3 deletions src/shogun/classifier/GaussianNaiveBayes.h
@@ -110,10 +110,10 @@ class CGaussianNaiveBayes : public CMachine
int32_t m_dim;

/// means for normal distributions of features
- SGVector<float64_t> m_means;
+ SGMatrix<float64_t> m_means;

/// variances for normal distributions of features
- SGVector<float64_t> m_variances;
+ SGMatrix<float64_t> m_variances;

/// a priori probabilities of labels
SGVector<float64_t> m_label_prob;
@@ -126,7 +126,7 @@
*/
float64_t inline normal_exp(float64_t x, int32_t l_idx, int32_t f_idx)
{
- return CMath::exp(-CMath::sq(x-m_means.vector[m_dim*l_idx+f_idx])/(2*m_variances.vector[m_dim*l_idx+f_idx]));
+ return CMath::exp(-CMath::sq(x-m_means(f_idx, l_idx))/(2*m_variances(f_idx, l_idx)));
}

/// label rates