ngscopeclient · Aug 10, 2020 · Aug 10, 2020
Showing with 199 additions and 88 deletions.

+186 −87 scopeprotocols/DeEmbedDecoder.cpp

+13 −1 scopeprotocols/DeEmbedDecoder.h
diff --git a/scopeprotocols/DeEmbedDecoder.cpp b/scopeprotocols/DeEmbedDecoder.cpp
@@ -29,6 +29,7 @@
 
 #include "../scopehal/scopehal.h"
 #include "DeEmbedDecoder.h"
+#include <immintrin.h>
 
 using namespace std;
 
@@ -55,22 +56,37 @@ DeEmbedDecoder::DeEmbedDecoder(string color)
 
 	m_forwardPlan = NULL;
 	m_reversePlan = NULL;
+
 	m_cachedNumPoints = 0;
+	m_cachedRawSize = 0;
+
+	m_forwardInBuf = NULL;
+	m_forwardOutBuf = NULL;
+	m_reverseOutBuf = NULL;
 }
 
 DeEmbedDecoder::~DeEmbedDecoder()
 {
 	if(m_forwardPlan)
-	{
 		ffts_free(m_forwardPlan);
-		m_forwardPlan = NULL;
-	}
-
 	if(m_reversePlan)
-	{
 		ffts_free(m_reversePlan);
-		m_reversePlan = NULL;
-	}
+
+	#ifdef _WIN32
+		_aligned_free(m_forwardInBuf);
+		_aligned_free(m_forwardOutBuf);
+		_aligned_free(m_reverseOutBuf);
+	#else
+		free(m_forwardInBuf);
+		free(m_forwardOutBuf);
+		free(m_reverseOutBuf);
+	#endif
+
+	m_forwardPlan = NULL;
+	m_reversePlan = NULL;
+	m_forwardInBuf = NULL;
+	m_forwardOutBuf = NULL;
+	m_reverseOutBuf = NULL;
 }
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -183,7 +199,10 @@ void DeEmbedDecoder::DoRefresh(bool invert)
 
 		//Clear out cached S-parameters
 		m_cachedBinSize = 0;
-		m_resampledSparams.clear();
+		m_resampledSparamPhases.clear();
+		m_resampledSparamCosines.clear();
+		m_resampledSparamSines.clear();
+		m_resampledSparamAmplitudes.clear();
 	}
 
 	//Don't die if the file couldn't be loaded
@@ -207,33 +226,11 @@ void DeEmbedDecoder::DoRefresh(bool invert)
 	//LogTrace("Rounded to %zu\n", npoints);
 
 	//Format the input data as raw samples for the FFT
-	//TODO: handle non-uniform sample rates
-	float* rdin;
-	size_t insize = npoints * sizeof(float);
-
-#ifdef _WIN32
-	rdin = (float*)_aligned_malloc(insize, 32);
-#else
-	posix_memalign((void**)&rdin, 32, insize);
-#endif
-
-	//Copy the input, then fill any extra space with zeroes
-	memcpy(rdin, &din->m_samples[0], npoints_raw*sizeof(float));
-	for(size_t i=npoints_raw; i<npoints; i++)
-		rdin[i] = 0;
+	//TODO: handle non-uniform sample rates and resample?
+	size_t nouts = npoints/2 + 1;
 
-	//Set up the FFT
-	float* rdout;
-	const size_t nouts = npoints/2 + 1;
-
-#ifdef _WIN32
-	rdout = (float*)_aligned_malloc(2 * nouts * sizeof(float), 32);
-#else
-	posix_memalign((void**)&rdout, 32, 2 * nouts * sizeof(float));
-#endif
-
-	//Set up the FFT if we change point count
-	if(m_cachedNumPoints != npoints)
+	//Set up the FFT and allocate buffers if we change point count
+	if( (m_cachedNumPoints != npoints) || (m_cachedRawSize != npoints_raw) )
 	{
 		if(m_forwardPlan)
 			ffts_free(m_forwardPlan);
@@ -243,92 +240,93 @@ void DeEmbedDecoder::DoRefresh(bool invert)
 			ffts_free(m_reversePlan);
 		m_reversePlan = ffts_init_1d_real(npoints, FFTS_BACKWARD);
 
+		#ifdef _WIN32
+			m_forwardInBuf = (float*)_aligned_malloc(npoints * sizeof(float), 32);
+			m_forwardOutBuf = (float*)_aligned_malloc(2 * nouts * sizeof(float), 32);
+			m_reverseOutBuf = (float*)_aligned_malloc(npoints * sizeof(float), 32);
+		#else
+			posix_memalign((void**)&m_forwardInBuf, 32, npoints * sizeof(float));
+			posix_memalign((void**)&m_forwardOutBuf, 32, 2 * nouts * sizeof(float));
+			posix_memalign((void**)&m_reverseOutBuf, 32, npoints * sizeof(float));
+		#endif
+
 		m_cachedNumPoints = npoints;
+		m_cachedRawSize = npoints_raw;
 	}
 
-	//Do the actual FFT
-	ffts_execute(m_forwardPlan, &rdin[0], &rdout[0]);
+	//Copy the input, then fill any extra space with zeroes
+	memcpy(m_forwardInBuf, &din->m_samples[0], npoints_raw*sizeof(float));
+	for(size_t i=npoints_raw; i<npoints; i++)
+		m_forwardInBuf[i] = 0;
+
+	//Do the forward FFT
+	ffts_execute(m_forwardPlan, &m_forwardInBuf[0], &m_forwardOutBuf[0]);
 
 	//Calculate size of each bin
 	double ps = din->m_timescale * (din->m_offsets[1] - din->m_offsets[0]);
 	double sample_ghz = 1000 / ps;
 	double bin_hz = round((0.5f * sample_ghz * 1e9f) / nouts);
 
-	//Resample S21 to our FFT bin size
+	//Resample S21 to our FFT bin size if needed
 	if(fabs(m_cachedBinSize - bin_hz) > FLT_EPSILON)
 	{
 		m_cachedBinSize = bin_hz;
 
 		for(size_t i=0; i<nouts; i++)
-			m_resampledSparams.push_back( m_sparams.SamplePoint(2, 1, bin_hz * i) );
+		{
+			auto point = m_sparams.SamplePoint(2, 1, bin_hz * i);
+
+			m_resampledSparamPhases.push_back(point.m_phase);
+			m_resampledSparamSines.push_back(sin(point.m_phase));
+			m_resampledSparamCosines.push_back(cos(point.m_phase));
+			m_resampledSparamAmplitudes.push_back(point.m_amplitude);
+		}
 	}
 
 	//Do the actual de-embed
 	if(invert)
 	{
 		for(size_t i=0; i<nouts; i++)
 		{
-			auto &point = m_resampledSparams[i];
+			float amplitude = m_resampledSparamAmplitudes[i];
+			float phase = m_resampledSparamPhases[i];
 
 			//Zero channel response = flatten rather than dividing by zero
-			if(fabs(point.m_amplitude) < FLT_EPSILON)
+			if(fabs(amplitude) < FLT_EPSILON)
 			{
-				rdout[i*2 + 0] = 0;
-				rdout[i*2 + 1] = 0;
+				m_forwardOutBuf[i*2 + 0] = 0;
+				m_forwardOutBuf[i*2 + 1] = 0;
 				continue;
 			}
 
-			float cosval = cos(-point.m_phase);
-			float sinval = sin(-point.m_phase);
+			float cosval = cos(-phase);
+			float sinval = sin(-phase);
 
 			//Uncorrected complex value
-			float real_orig = rdout[i*2 + 0];
-			float imag_orig = rdout[i*2 + 1];
+			float real_orig = m_forwardOutBuf[i*2 + 0];
+			float imag_orig = m_forwardOutBuf[i*2 + 1];
 
 			//Phase correction
 			float real = real_orig*cosval - imag_orig*sinval;
 			float imag = real_orig*sinval + imag_orig*cosval;
 
 			//Amplitude correction
-			rdout[i*2 + 0] = real / point.m_amplitude;
-			rdout[i*2 + 1] = imag / point.m_amplitude;
+			m_forwardOutBuf[i*2 + 0] = real / amplitude;
+			m_forwardOutBuf[i*2 + 1] = imag / amplitude;
 		}
 	}
 
 	//Forward channel emulation
 	else
 	{
-		for(size_t i=0; i<nouts; i++)
-		{
-			auto &point = m_resampledSparams[i];
-
-			float cosval = cos(point.m_phase);
-			float sinval = sin(point.m_phase);
-
-			//Uncorrected complex value
-			float real_orig = rdout[i*2 + 0];
-			float imag_orig = rdout[i*2 + 1];
-
-			//Phase correction
-			float real = real_orig*cosval - imag_orig*sinval;
-			float imag = real_orig*sinval + imag_orig*cosval;
-
-			//Amplitude correction
-			rdout[i*2 + 0] = real * point.m_amplitude;
-			rdout[i*2 + 1] = imag * point.m_amplitude;
-		}
+		if(g_hasAvx2)
+			ForwardMainLoopAVX2(nouts);
+		else
+			ForwardMainLoop(nouts);
 	}
 
-	//Set up the inverse FFT
-	float* ddout;
-#ifdef _WIN32
-	ddout = (float*)_aligned_malloc(insize * sizeof(float), 32);
-#else
-	posix_memalign((void**)&ddout, 32, insize * sizeof(float));
-#endif
-
 	//Calculate the inverse FFT
-	ffts_execute(m_reversePlan, &rdout[0], &ddout[0]);
+	ffts_execute(m_reversePlan, &m_forwardOutBuf[0], &m_reverseOutBuf[0]);
 
 	//Calculate maximum group delay for the first few S-parameter bins (approx propagation delay of the channel)
 	auto& s21 = m_sparams[SPair(2,1)];
@@ -360,7 +358,7 @@ void DeEmbedDecoder::DoRefresh(bool invert)
 	{
 		cap->m_offsets.push_back(din->m_offsets[i]);
 		cap->m_durations.push_back(din->m_durations[i]);
-		float v = ddout[i] * scale;
+		float v = m_reverseOutBuf[i] * scale;
 		vmin = min(v, vmin);
 		vmax = max(v, vmax);
 		cap->m_samples.push_back(v);
@@ -373,15 +371,116 @@ void DeEmbedDecoder::DoRefresh(bool invert)
 	m_offset = -( (m_max - m_min)/2 + m_min );
 
 	SetData(cap);
+}
+
+void DeEmbedDecoder::ForwardMainLoop(size_t nouts)
+{
+	for(size_t i=0; i<nouts; i++)
+	{
+		float amplitude = m_resampledSparamAmplitudes[i];
+		float phase = m_resampledSparamPhases[i];
+
+		float cosval = cos(phase);
+		float sinval = sin(phase);
+
+		//Uncorrected complex value
+		float real_orig = m_forwardOutBuf[i*2 + 0];
+		float imag_orig = m_forwardOutBuf[i*2 + 1];
+
+		//Phase correction
+		float real = real_orig*cosval - imag_orig*sinval;
+		float imag = real_orig*sinval + imag_orig*cosval;
 
-	//Clean up
-#ifdef _WIN32
-	_aligned_free(rdin);
-	_aligned_free(rdout);
-	_aligned_free(ddout);
-#else
-	free(rdin);
-	free(rdout);
-	free(ddout);
-#endif
+		//Amplitude correction
+		m_forwardOutBuf[i*2 + 0] = real * amplitude;
+		m_forwardOutBuf[i*2 + 1] = imag * amplitude;
+	}
+}
+
+__attribute__((target("avx2")))
+void DeEmbedDecoder::ForwardMainLoopAVX2(size_t nouts)
+{
+	unsigned int end = nouts - (nouts % 8);
+
+	//Vectorized loop doing 8 elements at once
+	for(size_t i=0; i<end; i += 8)
+	{
+		//Load S-parameters
+		//Precompute sin/cos since there's no AVX instruction to do this
+		__m256 amplitude = _mm256_load_ps(&m_resampledSparamAmplitudes[i]);
+		__m256 sinval = _mm256_load_ps(&m_resampledSparamSines[i]);
+		__m256 cosval = _mm256_load_ps(&m_resampledSparamCosines[i]);
+
+		//Load uncorrected complex values (interleaved real/imag real/imag)
+		__m256 din0 = _mm256_load_ps(&m_forwardOutBuf[i*2]);
+		__m256 din1 = _mm256_load_ps(&m_forwardOutBuf[i*2 + 8]);
+
+		//Original state of each block is riririri.
+		//Shuffle them around to get all the reals and imaginaries together.
+
+		//Step 1: Shuffle 32-bit values within 128-bit lanes to get rriirrii rriirrii.
+		din0 = _mm256_permute_ps(din0, 0xd8);
+		din1 = _mm256_permute_ps(din1, 0xd8);
+
+		//Step 2: Shuffle 64-bit values to get rrrriiii rrrriiii.
+		__m256i block0 = _mm256_permute4x64_epi64(_mm256_castps_si256(din0), 0xd8);
+		__m256i block1 = _mm256_permute4x64_epi64(_mm256_castps_si256(din1), 0xd8);
+
+		//Step 3: Shuffle 128-bit values to get rrrrrrrr iiiiiiii.
+		__m256 real = _mm256_castsi256_ps(_mm256_permute2x128_si256(block0, block1, 0x20));
+		__m256 imag = _mm256_castsi256_ps(_mm256_permute2x128_si256(block0, block1, 0x31));
+
+		//Create the sin/cos matrix
+		__m256 real_sin = _mm256_mul_ps(real, sinval);
+		__m256 real_cos = _mm256_mul_ps(real, cosval);
+		__m256 imag_sin = _mm256_mul_ps(imag, sinval);
+		__m256 imag_cos = _mm256_mul_ps(imag, cosval);
+
+		//Do the phase correction
+		real = _mm256_sub_ps(real_cos, imag_sin);
+		imag = _mm256_add_ps(real_sin, imag_cos);
+
+		//Amplitude correction
+		real = _mm256_mul_ps(real, amplitude);
+		imag = _mm256_mul_ps(imag, amplitude);
+
+		//Math is done, now we need to shuffle them back
+		//Shuffle 128-bit values to get rrrriiii rrrriiii.
+		block0 = _mm256_permute2x128_si256(_mm256_castps_si256(real), _mm256_castps_si256(imag), 0x20);
+		block1 = _mm256_permute2x128_si256(_mm256_castps_si256(real), _mm256_castps_si256(imag), 0x31);
+
+		//Shuffle 64-bit values to get rriirrii
+		block0 = _mm256_permute4x64_epi64(block0, 0xd8);
+		block1 = _mm256_permute4x64_epi64(block1, 0xd8);
+
+		//Shuffle 32-bit values to get the final value ready for writeback
+		din0 =_mm256_permute_ps(_mm256_castsi256_ps(block0), 0xd8);
+		din1 =_mm256_permute_ps(_mm256_castsi256_ps(block1), 0xd8);
+
+		//Write back output
+		_mm256_store_ps(&m_forwardOutBuf[i*2], din0);
+		_mm256_store_ps(&m_forwardOutBuf[i*2] + 8, din1);
+	}
+
+	//Do any leftovers
+	for(size_t i=end; i<nouts; i++)
+	{
+		float amplitude = m_resampledSparamAmplitudes[i];
+		float phase = m_resampledSparamPhases[i];
+
+		float cosval = cos(phase);
+		float sinval = sin(phase);
+
+		//Uncorrected complex value
+		float real_orig = m_forwardOutBuf[i*2 + 0];
+		float imag_orig = m_forwardOutBuf[i*2 + 1];
+
+		//Phase correction
+		float real = real_orig*cosval - imag_orig*sinval;
+		float imag = real_orig*sinval + imag_orig*cosval;
+
+		//Amplitude correction
+		m_forwardOutBuf[i*2 + 0] = real * amplitude;
+		m_forwardOutBuf[i*2 + 1] = imag * amplitude;
+	}
 }