Skip to content
Permalink

Comparing changes

Choose two branches to see what’s changed or to start a new pull request. If you need to, you can also or learn more about diff comparisons.

Open a pull request

Create a new pull request by comparing changes across two branches. If you need to, you can also . Learn more about diff comparisons here.
base repository: ngscopeclient/scopehal
Failed to load repositories. Confirm that selected base ref is valid, then try again.
Loading
base: ede70ff08b1f
Choose a base ref
...
head repository: ngscopeclient/scopehal
Failed to load repositories. Confirm that selected head ref is valid, then try again.
Loading
compare: f150b8e8a989
Choose a head ref
  • 2 commits
  • 2 files changed
  • 1 contributor

Commits on Aug 10, 2020

  1. Partially verified

    This commit is signed with the committer’s verified signature.
    domenkozar’s contribution has been verified via GPG key.
    We cannot verify signatures from co-authors, and some of the co-authors attributed to this commit require their commits to be signed.
    Copy the full SHA
    005d422 View commit details
  2. DeEmbedDecoder: AVX2 implementation of inner loop (forward direction …

    …only for now) is 7.8x faster than non-vectorized version
    azonenberg committed Aug 10, 2020

    Verified

    This commit was signed with the committer’s verified signature.
    domenkozar Domen Kožar
    Copy the full SHA
    f150b8e View commit details
Showing with 199 additions and 88 deletions.
  1. +186 −87 scopeprotocols/DeEmbedDecoder.cpp
  2. +13 −1 scopeprotocols/DeEmbedDecoder.h
273 changes: 186 additions & 87 deletions scopeprotocols/DeEmbedDecoder.cpp
Original file line number Diff line number Diff line change
@@ -29,6 +29,7 @@

#include "../scopehal/scopehal.h"
#include "DeEmbedDecoder.h"
#include <immintrin.h>

using namespace std;

@@ -55,22 +56,37 @@ DeEmbedDecoder::DeEmbedDecoder(string color)

m_forwardPlan = NULL;
m_reversePlan = NULL;

m_cachedNumPoints = 0;
m_cachedRawSize = 0;

m_forwardInBuf = NULL;
m_forwardOutBuf = NULL;
m_reverseOutBuf = NULL;
}

DeEmbedDecoder::~DeEmbedDecoder()
{
if(m_forwardPlan)
{
ffts_free(m_forwardPlan);
m_forwardPlan = NULL;
}

if(m_reversePlan)
{
ffts_free(m_reversePlan);
m_reversePlan = NULL;
}

#ifdef _WIN32
_aligned_free(m_forwardInBuf);
_aligned_free(m_forwardOutBuf);
_aligned_free(m_reverseOutBuf);
#else
free(m_forwardInBuf);
free(m_forwardOutBuf);
free(m_reverseOutBuf);
#endif

m_forwardPlan = NULL;
m_reversePlan = NULL;
m_forwardInBuf = NULL;
m_forwardOutBuf = NULL;
m_reverseOutBuf = NULL;
}

////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -183,7 +199,10 @@ void DeEmbedDecoder::DoRefresh(bool invert)

//Clear out cached S-parameters
m_cachedBinSize = 0;
m_resampledSparams.clear();
m_resampledSparamPhases.clear();
m_resampledSparamCosines.clear();
m_resampledSparamSines.clear();
m_resampledSparamAmplitudes.clear();
}

//Don't die if the file couldn't be loaded
@@ -207,33 +226,11 @@ void DeEmbedDecoder::DoRefresh(bool invert)
//LogTrace("Rounded to %zu\n", npoints);

//Format the input data as raw samples for the FFT
//TODO: handle non-uniform sample rates
float* rdin;
size_t insize = npoints * sizeof(float);

#ifdef _WIN32
rdin = (float*)_aligned_malloc(insize, 32);
#else
posix_memalign((void**)&rdin, 32, insize);
#endif

//Copy the input, then fill any extra space with zeroes
memcpy(rdin, &din->m_samples[0], npoints_raw*sizeof(float));
for(size_t i=npoints_raw; i<npoints; i++)
rdin[i] = 0;
//TODO: handle non-uniform sample rates and resample?
size_t nouts = npoints/2 + 1;

//Set up the FFT
float* rdout;
const size_t nouts = npoints/2 + 1;

#ifdef _WIN32
rdout = (float*)_aligned_malloc(2 * nouts * sizeof(float), 32);
#else
posix_memalign((void**)&rdout, 32, 2 * nouts * sizeof(float));
#endif

//Set up the FFT if we change point count
if(m_cachedNumPoints != npoints)
//Set up the FFT and allocate buffers if we change point count
if( (m_cachedNumPoints != npoints) || (m_cachedRawSize != npoints_raw) )
{
if(m_forwardPlan)
ffts_free(m_forwardPlan);
@@ -243,92 +240,93 @@ void DeEmbedDecoder::DoRefresh(bool invert)
ffts_free(m_reversePlan);
m_reversePlan = ffts_init_1d_real(npoints, FFTS_BACKWARD);

#ifdef _WIN32
m_forwardInBuf = (float*)_aligned_malloc(npoints * sizeof(float), 32);
m_forwardOutBuf = (float*)_aligned_malloc(2 * nouts * sizeof(float), 32);
m_reverseOutBuf = (float*)_aligned_malloc(npoints * sizeof(float), 32);
#else
posix_memalign((void**)&m_forwardInBuf, 32, npoints * sizeof(float));
posix_memalign((void**)&m_forwardOutBuf, 32, 2 * nouts * sizeof(float));
posix_memalign((void**)&m_reverseOutBuf, 32, npoints * sizeof(float));
#endif

m_cachedNumPoints = npoints;
m_cachedRawSize = npoints_raw;
}

//Do the actual FFT
ffts_execute(m_forwardPlan, &rdin[0], &rdout[0]);
//Copy the input, then fill any extra space with zeroes
memcpy(m_forwardInBuf, &din->m_samples[0], npoints_raw*sizeof(float));
for(size_t i=npoints_raw; i<npoints; i++)
m_forwardInBuf[i] = 0;

//Do the forward FFT
ffts_execute(m_forwardPlan, &m_forwardInBuf[0], &m_forwardOutBuf[0]);

//Calculate size of each bin
double ps = din->m_timescale * (din->m_offsets[1] - din->m_offsets[0]);
double sample_ghz = 1000 / ps;
double bin_hz = round((0.5f * sample_ghz * 1e9f) / nouts);

//Resample S21 to our FFT bin size
//Resample S21 to our FFT bin size if needed
if(fabs(m_cachedBinSize - bin_hz) > FLT_EPSILON)
{
m_cachedBinSize = bin_hz;

for(size_t i=0; i<nouts; i++)
m_resampledSparams.push_back( m_sparams.SamplePoint(2, 1, bin_hz * i) );
{
auto point = m_sparams.SamplePoint(2, 1, bin_hz * i);

m_resampledSparamPhases.push_back(point.m_phase);
m_resampledSparamSines.push_back(sin(point.m_phase));
m_resampledSparamCosines.push_back(cos(point.m_phase));
m_resampledSparamAmplitudes.push_back(point.m_amplitude);
}
}

//Do the actual de-embed
if(invert)
{
for(size_t i=0; i<nouts; i++)
{
auto &point = m_resampledSparams[i];
float amplitude = m_resampledSparamAmplitudes[i];
float phase = m_resampledSparamPhases[i];

//Zero channel response = flatten rather than dividing by zero
if(fabs(point.m_amplitude) < FLT_EPSILON)
if(fabs(amplitude) < FLT_EPSILON)
{
rdout[i*2 + 0] = 0;
rdout[i*2 + 1] = 0;
m_forwardOutBuf[i*2 + 0] = 0;
m_forwardOutBuf[i*2 + 1] = 0;
continue;
}

float cosval = cos(-point.m_phase);
float sinval = sin(-point.m_phase);
float cosval = cos(-phase);
float sinval = sin(-phase);

//Uncorrected complex value
float real_orig = rdout[i*2 + 0];
float imag_orig = rdout[i*2 + 1];
float real_orig = m_forwardOutBuf[i*2 + 0];
float imag_orig = m_forwardOutBuf[i*2 + 1];

//Phase correction
float real = real_orig*cosval - imag_orig*sinval;
float imag = real_orig*sinval + imag_orig*cosval;

//Amplitude correction
rdout[i*2 + 0] = real / point.m_amplitude;
rdout[i*2 + 1] = imag / point.m_amplitude;
m_forwardOutBuf[i*2 + 0] = real / amplitude;
m_forwardOutBuf[i*2 + 1] = imag / amplitude;
}
}

//Forward channel emulation
else
{
for(size_t i=0; i<nouts; i++)
{
auto &point = m_resampledSparams[i];

float cosval = cos(point.m_phase);
float sinval = sin(point.m_phase);

//Uncorrected complex value
float real_orig = rdout[i*2 + 0];
float imag_orig = rdout[i*2 + 1];

//Phase correction
float real = real_orig*cosval - imag_orig*sinval;
float imag = real_orig*sinval + imag_orig*cosval;

//Amplitude correction
rdout[i*2 + 0] = real * point.m_amplitude;
rdout[i*2 + 1] = imag * point.m_amplitude;
}
if(g_hasAvx2)
ForwardMainLoopAVX2(nouts);
else
ForwardMainLoop(nouts);
}

//Set up the inverse FFT
float* ddout;
#ifdef _WIN32
ddout = (float*)_aligned_malloc(insize * sizeof(float), 32);
#else
posix_memalign((void**)&ddout, 32, insize * sizeof(float));
#endif

//Calculate the inverse FFT
ffts_execute(m_reversePlan, &rdout[0], &ddout[0]);
ffts_execute(m_reversePlan, &m_forwardOutBuf[0], &m_reverseOutBuf[0]);

//Calculate maximum group delay for the first few S-parameter bins (approx propagation delay of the channel)
auto& s21 = m_sparams[SPair(2,1)];
@@ -360,7 +358,7 @@ void DeEmbedDecoder::DoRefresh(bool invert)
{
cap->m_offsets.push_back(din->m_offsets[i]);
cap->m_durations.push_back(din->m_durations[i]);
float v = ddout[i] * scale;
float v = m_reverseOutBuf[i] * scale;
vmin = min(v, vmin);
vmax = max(v, vmax);
cap->m_samples.push_back(v);
@@ -373,15 +371,116 @@ void DeEmbedDecoder::DoRefresh(bool invert)
m_offset = -( (m_max - m_min)/2 + m_min );

SetData(cap);
}

void DeEmbedDecoder::ForwardMainLoop(size_t nouts)
{
for(size_t i=0; i<nouts; i++)
{
float amplitude = m_resampledSparamAmplitudes[i];
float phase = m_resampledSparamPhases[i];

float cosval = cos(phase);
float sinval = sin(phase);

//Uncorrected complex value
float real_orig = m_forwardOutBuf[i*2 + 0];
float imag_orig = m_forwardOutBuf[i*2 + 1];

//Phase correction
float real = real_orig*cosval - imag_orig*sinval;
float imag = real_orig*sinval + imag_orig*cosval;

//Clean up
#ifdef _WIN32
_aligned_free(rdin);
_aligned_free(rdout);
_aligned_free(ddout);
#else
free(rdin);
free(rdout);
free(ddout);
#endif
//Amplitude correction
m_forwardOutBuf[i*2 + 0] = real * amplitude;
m_forwardOutBuf[i*2 + 1] = imag * amplitude;
}
}

__attribute__((target("avx2")))
void DeEmbedDecoder::ForwardMainLoopAVX2(size_t nouts)
{
unsigned int end = nouts - (nouts % 8);

//Vectorized loop doing 8 elements at once
for(size_t i=0; i<end; i += 8)
{
//Load S-parameters
//Precompute sin/cos since there's no AVX instruction to do this
__m256 amplitude = _mm256_load_ps(&m_resampledSparamAmplitudes[i]);
__m256 sinval = _mm256_load_ps(&m_resampledSparamSines[i]);
__m256 cosval = _mm256_load_ps(&m_resampledSparamCosines[i]);

//Load uncorrected complex values (interleaved real/imag real/imag)
__m256 din0 = _mm256_load_ps(&m_forwardOutBuf[i*2]);
__m256 din1 = _mm256_load_ps(&m_forwardOutBuf[i*2 + 8]);

//Original state of each block is riririri.
//Shuffle them around to get all the reals and imaginaries together.

//Step 1: Shuffle 32-bit values within 128-bit lanes to get rriirrii rriirrii.
din0 = _mm256_permute_ps(din0, 0xd8);
din1 = _mm256_permute_ps(din1, 0xd8);

//Step 2: Shuffle 64-bit values to get rrrriiii rrrriiii.
__m256i block0 = _mm256_permute4x64_epi64(_mm256_castps_si256(din0), 0xd8);
__m256i block1 = _mm256_permute4x64_epi64(_mm256_castps_si256(din1), 0xd8);

//Step 3: Shuffle 128-bit values to get rrrrrrrr iiiiiiii.
__m256 real = _mm256_castsi256_ps(_mm256_permute2x128_si256(block0, block1, 0x20));
__m256 imag = _mm256_castsi256_ps(_mm256_permute2x128_si256(block0, block1, 0x31));

//Create the sin/cos matrix
__m256 real_sin = _mm256_mul_ps(real, sinval);
__m256 real_cos = _mm256_mul_ps(real, cosval);
__m256 imag_sin = _mm256_mul_ps(imag, sinval);
__m256 imag_cos = _mm256_mul_ps(imag, cosval);

//Do the phase correction
real = _mm256_sub_ps(real_cos, imag_sin);
imag = _mm256_add_ps(real_sin, imag_cos);

//Amplitude correction
real = _mm256_mul_ps(real, amplitude);
imag = _mm256_mul_ps(imag, amplitude);

//Math is done, now we need to shuffle them back
//Shuffle 128-bit values to get rrrriiii rrrriiii.
block0 = _mm256_permute2x128_si256(_mm256_castps_si256(real), _mm256_castps_si256(imag), 0x20);
block1 = _mm256_permute2x128_si256(_mm256_castps_si256(real), _mm256_castps_si256(imag), 0x31);

//Shuffle 64-bit values to get rriirrii
block0 = _mm256_permute4x64_epi64(block0, 0xd8);
block1 = _mm256_permute4x64_epi64(block1, 0xd8);

//Shuffle 32-bit values to get the final value ready for writeback
din0 =_mm256_permute_ps(_mm256_castsi256_ps(block0), 0xd8);
din1 =_mm256_permute_ps(_mm256_castsi256_ps(block1), 0xd8);

//Write back output
_mm256_store_ps(&m_forwardOutBuf[i*2], din0);
_mm256_store_ps(&m_forwardOutBuf[i*2] + 8, din1);
}

//Do any leftovers
for(size_t i=end; i<nouts; i++)
{
float amplitude = m_resampledSparamAmplitudes[i];
float phase = m_resampledSparamPhases[i];

float cosval = cos(phase);
float sinval = sin(phase);

//Uncorrected complex value
float real_orig = m_forwardOutBuf[i*2 + 0];
float imag_orig = m_forwardOutBuf[i*2 + 1];

//Phase correction
float real = real_orig*cosval - imag_orig*sinval;
float imag = real_orig*sinval + imag_orig*cosval;

//Amplitude correction
m_forwardOutBuf[i*2 + 0] = real * amplitude;
m_forwardOutBuf[i*2 + 1] = imag * amplitude;
}
}
Loading