Skip to content
Permalink

Comparing changes

Choose two branches to see what’s changed or to start a new pull request. If you need to, you can also or learn more about diff comparisons.

Open a pull request

Create a new pull request by comparing changes across two branches. If you need to, you can also . Learn more about diff comparisons here.
base repository: ngscopeclient/scopehal
Failed to load repositories. Confirm that selected base ref is valid, then try again.
Loading
base: ede70ff08b1f
Choose a base ref
...
head repository: ngscopeclient/scopehal
Failed to load repositories. Confirm that selected head ref is valid, then try again.
Loading
compare: f150b8e8a989
Choose a head ref
  • 2 commits
  • 2 files changed
  • 1 contributor

Commits on Aug 10, 2020

  1. Verified

    This commit was signed with the committer’s verified signature.
    makenowjust Hiroya Fujinami
    Copy the full SHA
    005d422 View commit details
  2. DeEmbedDecoder: AVX2 implementation of inner loop (forward direction …

    …only for now) is 7.8x faster than non-vectorized version
    azonenberg committed Aug 10, 2020
    Copy the full SHA
    f150b8e View commit details
Showing with 199 additions and 88 deletions.
  1. +186 −87 scopeprotocols/DeEmbedDecoder.cpp
  2. +13 −1 scopeprotocols/DeEmbedDecoder.h
273 changes: 186 additions & 87 deletions scopeprotocols/DeEmbedDecoder.cpp
Original file line number Diff line number Diff line change
@@ -29,6 +29,7 @@

#include "../scopehal/scopehal.h"
#include "DeEmbedDecoder.h"
#include <immintrin.h>

using namespace std;

@@ -55,22 +56,37 @@ DeEmbedDecoder::DeEmbedDecoder(string color)

m_forwardPlan = NULL;
m_reversePlan = NULL;

m_cachedNumPoints = 0;
m_cachedRawSize = 0;

m_forwardInBuf = NULL;
m_forwardOutBuf = NULL;
m_reverseOutBuf = NULL;
}

DeEmbedDecoder::~DeEmbedDecoder()
{
if(m_forwardPlan)
{
ffts_free(m_forwardPlan);
m_forwardPlan = NULL;
}

if(m_reversePlan)
{
ffts_free(m_reversePlan);
m_reversePlan = NULL;
}

#ifdef _WIN32
_aligned_free(m_forwardInBuf);
_aligned_free(m_forwardOutBuf);
_aligned_free(m_reverseOutBuf);
#else
free(m_forwardInBuf);
free(m_forwardOutBuf);
free(m_reverseOutBuf);
#endif

m_forwardPlan = NULL;
m_reversePlan = NULL;
m_forwardInBuf = NULL;
m_forwardOutBuf = NULL;
m_reverseOutBuf = NULL;
}

////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -183,7 +199,10 @@ void DeEmbedDecoder::DoRefresh(bool invert)

//Clear out cached S-parameters
m_cachedBinSize = 0;
m_resampledSparams.clear();
m_resampledSparamPhases.clear();
m_resampledSparamCosines.clear();
m_resampledSparamSines.clear();
m_resampledSparamAmplitudes.clear();
}

//Don't die if the file couldn't be loaded
@@ -207,33 +226,11 @@ void DeEmbedDecoder::DoRefresh(bool invert)
//LogTrace("Rounded to %zu\n", npoints);

//Format the input data as raw samples for the FFT
//TODO: handle non-uniform sample rates
float* rdin;
size_t insize = npoints * sizeof(float);

#ifdef _WIN32
rdin = (float*)_aligned_malloc(insize, 32);
#else
posix_memalign((void**)&rdin, 32, insize);
#endif

//Copy the input, then fill any extra space with zeroes
memcpy(rdin, &din->m_samples[0], npoints_raw*sizeof(float));
for(size_t i=npoints_raw; i<npoints; i++)
rdin[i] = 0;
//TODO: handle non-uniform sample rates and resample?
size_t nouts = npoints/2 + 1;

//Set up the FFT
float* rdout;
const size_t nouts = npoints/2 + 1;

#ifdef _WIN32
rdout = (float*)_aligned_malloc(2 * nouts * sizeof(float), 32);
#else
posix_memalign((void**)&rdout, 32, 2 * nouts * sizeof(float));
#endif

//Set up the FFT if we change point count
if(m_cachedNumPoints != npoints)
//Set up the FFT and allocate buffers if we change point count
if( (m_cachedNumPoints != npoints) || (m_cachedRawSize != npoints_raw) )
{
if(m_forwardPlan)
ffts_free(m_forwardPlan);
@@ -243,92 +240,93 @@ void DeEmbedDecoder::DoRefresh(bool invert)
ffts_free(m_reversePlan);
m_reversePlan = ffts_init_1d_real(npoints, FFTS_BACKWARD);

#ifdef _WIN32
m_forwardInBuf = (float*)_aligned_malloc(npoints * sizeof(float), 32);
m_forwardOutBuf = (float*)_aligned_malloc(2 * nouts * sizeof(float), 32);
m_reverseOutBuf = (float*)_aligned_malloc(npoints * sizeof(float), 32);
#else
posix_memalign((void**)&m_forwardInBuf, 32, npoints * sizeof(float));
posix_memalign((void**)&m_forwardOutBuf, 32, 2 * nouts * sizeof(float));
posix_memalign((void**)&m_reverseOutBuf, 32, npoints * sizeof(float));
#endif

m_cachedNumPoints = npoints;
m_cachedRawSize = npoints_raw;
}

//Do the actual FFT
ffts_execute(m_forwardPlan, &rdin[0], &rdout[0]);
//Copy the input, then fill any extra space with zeroes
memcpy(m_forwardInBuf, &din->m_samples[0], npoints_raw*sizeof(float));
for(size_t i=npoints_raw; i<npoints; i++)
m_forwardInBuf[i] = 0;

//Do the forward FFT
ffts_execute(m_forwardPlan, &m_forwardInBuf[0], &m_forwardOutBuf[0]);

//Calculate size of each bin
double ps = din->m_timescale * (din->m_offsets[1] - din->m_offsets[0]);
double sample_ghz = 1000 / ps;
double bin_hz = round((0.5f * sample_ghz * 1e9f) / nouts);

//Resample S21 to our FFT bin size
//Resample S21 to our FFT bin size if needed
if(fabs(m_cachedBinSize - bin_hz) > FLT_EPSILON)
{
m_cachedBinSize = bin_hz;

for(size_t i=0; i<nouts; i++)
m_resampledSparams.push_back( m_sparams.SamplePoint(2, 1, bin_hz * i) );
{
auto point = m_sparams.SamplePoint(2, 1, bin_hz * i);

m_resampledSparamPhases.push_back(point.m_phase);
m_resampledSparamSines.push_back(sin(point.m_phase));
m_resampledSparamCosines.push_back(cos(point.m_phase));
m_resampledSparamAmplitudes.push_back(point.m_amplitude);
}
}

//Do the actual de-embed
if(invert)
{
for(size_t i=0; i<nouts; i++)
{
auto &point = m_resampledSparams[i];
float amplitude = m_resampledSparamAmplitudes[i];
float phase = m_resampledSparamPhases[i];

//Zero channel response = flatten rather than dividing by zero
if(fabs(point.m_amplitude) < FLT_EPSILON)
if(fabs(amplitude) < FLT_EPSILON)
{
rdout[i*2 + 0] = 0;
rdout[i*2 + 1] = 0;
m_forwardOutBuf[i*2 + 0] = 0;
m_forwardOutBuf[i*2 + 1] = 0;
continue;
}

float cosval = cos(-point.m_phase);
float sinval = sin(-point.m_phase);
float cosval = cos(-phase);
float sinval = sin(-phase);

//Uncorrected complex value
float real_orig = rdout[i*2 + 0];
float imag_orig = rdout[i*2 + 1];
float real_orig = m_forwardOutBuf[i*2 + 0];
float imag_orig = m_forwardOutBuf[i*2 + 1];

//Phase correction
float real = real_orig*cosval - imag_orig*sinval;
float imag = real_orig*sinval + imag_orig*cosval;

//Amplitude correction
rdout[i*2 + 0] = real / point.m_amplitude;
rdout[i*2 + 1] = imag / point.m_amplitude;
m_forwardOutBuf[i*2 + 0] = real / amplitude;
m_forwardOutBuf[i*2 + 1] = imag / amplitude;
}
}

//Forward channel emulation
else
{
for(size_t i=0; i<nouts; i++)
{
auto &point = m_resampledSparams[i];

float cosval = cos(point.m_phase);
float sinval = sin(point.m_phase);

//Uncorrected complex value
float real_orig = rdout[i*2 + 0];
float imag_orig = rdout[i*2 + 1];

//Phase correction
float real = real_orig*cosval - imag_orig*sinval;
float imag = real_orig*sinval + imag_orig*cosval;

//Amplitude correction
rdout[i*2 + 0] = real * point.m_amplitude;
rdout[i*2 + 1] = imag * point.m_amplitude;
}
if(g_hasAvx2)
ForwardMainLoopAVX2(nouts);
else
ForwardMainLoop(nouts);
}

//Set up the inverse FFT
float* ddout;
#ifdef _WIN32
ddout = (float*)_aligned_malloc(insize * sizeof(float), 32);
#else
posix_memalign((void**)&ddout, 32, insize * sizeof(float));
#endif

//Calculate the inverse FFT
ffts_execute(m_reversePlan, &rdout[0], &ddout[0]);
ffts_execute(m_reversePlan, &m_forwardOutBuf[0], &m_reverseOutBuf[0]);

//Calculate maximum group delay for the first few S-parameter bins (approx propagation delay of the channel)
auto& s21 = m_sparams[SPair(2,1)];
@@ -360,7 +358,7 @@ void DeEmbedDecoder::DoRefresh(bool invert)
{
cap->m_offsets.push_back(din->m_offsets[i]);
cap->m_durations.push_back(din->m_durations[i]);
float v = ddout[i] * scale;
float v = m_reverseOutBuf[i] * scale;
vmin = min(v, vmin);
vmax = max(v, vmax);
cap->m_samples.push_back(v);
@@ -373,15 +371,116 @@ void DeEmbedDecoder::DoRefresh(bool invert)
m_offset = -( (m_max - m_min)/2 + m_min );

SetData(cap);
}

void DeEmbedDecoder::ForwardMainLoop(size_t nouts)
{
for(size_t i=0; i<nouts; i++)
{
float amplitude = m_resampledSparamAmplitudes[i];
float phase = m_resampledSparamPhases[i];

float cosval = cos(phase);
float sinval = sin(phase);

//Uncorrected complex value
float real_orig = m_forwardOutBuf[i*2 + 0];
float imag_orig = m_forwardOutBuf[i*2 + 1];

//Phase correction
float real = real_orig*cosval - imag_orig*sinval;
float imag = real_orig*sinval + imag_orig*cosval;

//Clean up
#ifdef _WIN32
_aligned_free(rdin);
_aligned_free(rdout);
_aligned_free(ddout);
#else
free(rdin);
free(rdout);
free(ddout);
#endif
//Amplitude correction
m_forwardOutBuf[i*2 + 0] = real * amplitude;
m_forwardOutBuf[i*2 + 1] = imag * amplitude;
}
}

__attribute__((target("avx2")))
void DeEmbedDecoder::ForwardMainLoopAVX2(size_t nouts)
{
unsigned int end = nouts - (nouts % 8);

//Vectorized loop doing 8 elements at once
for(size_t i=0; i<end; i += 8)
{
//Load S-parameters
//Precompute sin/cos since there's no AVX instruction to do this
__m256 amplitude = _mm256_load_ps(&m_resampledSparamAmplitudes[i]);
__m256 sinval = _mm256_load_ps(&m_resampledSparamSines[i]);
__m256 cosval = _mm256_load_ps(&m_resampledSparamCosines[i]);

//Load uncorrected complex values (interleaved real/imag real/imag)
__m256 din0 = _mm256_load_ps(&m_forwardOutBuf[i*2]);
__m256 din1 = _mm256_load_ps(&m_forwardOutBuf[i*2 + 8]);

//Original state of each block is riririri.
//Shuffle them around to get all the reals and imaginaries together.

//Step 1: Shuffle 32-bit values within 128-bit lanes to get rriirrii rriirrii.
din0 = _mm256_permute_ps(din0, 0xd8);
din1 = _mm256_permute_ps(din1, 0xd8);

//Step 2: Shuffle 64-bit values to get rrrriiii rrrriiii.
__m256i block0 = _mm256_permute4x64_epi64(_mm256_castps_si256(din0), 0xd8);
__m256i block1 = _mm256_permute4x64_epi64(_mm256_castps_si256(din1), 0xd8);

//Step 3: Shuffle 128-bit values to get rrrrrrrr iiiiiiii.
__m256 real = _mm256_castsi256_ps(_mm256_permute2x128_si256(block0, block1, 0x20));
__m256 imag = _mm256_castsi256_ps(_mm256_permute2x128_si256(block0, block1, 0x31));

//Create the sin/cos matrix
__m256 real_sin = _mm256_mul_ps(real, sinval);
__m256 real_cos = _mm256_mul_ps(real, cosval);
__m256 imag_sin = _mm256_mul_ps(imag, sinval);
__m256 imag_cos = _mm256_mul_ps(imag, cosval);

//Do the phase correction
real = _mm256_sub_ps(real_cos, imag_sin);
imag = _mm256_add_ps(real_sin, imag_cos);

//Amplitude correction
real = _mm256_mul_ps(real, amplitude);
imag = _mm256_mul_ps(imag, amplitude);

//Math is done, now we need to shuffle them back
//Shuffle 128-bit values to get rrrriiii rrrriiii.
block0 = _mm256_permute2x128_si256(_mm256_castps_si256(real), _mm256_castps_si256(imag), 0x20);
block1 = _mm256_permute2x128_si256(_mm256_castps_si256(real), _mm256_castps_si256(imag), 0x31);

//Shuffle 64-bit values to get rriirrii
block0 = _mm256_permute4x64_epi64(block0, 0xd8);
block1 = _mm256_permute4x64_epi64(block1, 0xd8);

//Shuffle 32-bit values to get the final value ready for writeback
din0 =_mm256_permute_ps(_mm256_castsi256_ps(block0), 0xd8);
din1 =_mm256_permute_ps(_mm256_castsi256_ps(block1), 0xd8);

//Write back output
_mm256_store_ps(&m_forwardOutBuf[i*2], din0);
_mm256_store_ps(&m_forwardOutBuf[i*2] + 8, din1);
}

//Do any leftovers
for(size_t i=end; i<nouts; i++)
{
float amplitude = m_resampledSparamAmplitudes[i];
float phase = m_resampledSparamPhases[i];

float cosval = cos(phase);
float sinval = sin(phase);

//Uncorrected complex value
float real_orig = m_forwardOutBuf[i*2 + 0];
float imag_orig = m_forwardOutBuf[i*2 + 1];

//Phase correction
float real = real_orig*cosval - imag_orig*sinval;
float imag = real_orig*sinval + imag_orig*cosval;

//Amplitude correction
m_forwardOutBuf[i*2 + 0] = real * amplitude;
m_forwardOutBuf[i*2 + 1] = imag * amplitude;
}
}
Loading