ngscopeclient · Aug 8, 2020 · Aug 8, 2020
Showing with 167 additions and 15 deletions.

+132 −15 scopehal/LeCroyOscilloscope.cpp

+5 −0 scopehal/LeCroyOscilloscope.h

+24 −0 scopehal/scopehal.cpp

+6 −0 scopehal/scopehal.h
diff --git a/scopehal/LeCroyOscilloscope.cpp b/scopehal/LeCroyOscilloscope.cpp
@@ -32,6 +32,8 @@
 #include "ProtocolDecoder.h"
 #include "base64.h"
 #include <locale>
+#include <xmmintrin.h>
+#include <immintrin.h>
 
 using namespace std;
 
@@ -1292,32 +1294,29 @@ vector<WaveformBase*> LeCroyOscilloscope::ProcessAnalogWaveform(
 		else
 			cap->m_startPicoseconds = static_cast<int64_t>(basetime * 1e12f);
 
-		//Convert raw ADC samples to volts
 		cap->Resize(num_per_segment);
-		int64_t* offs = reinterpret_cast<int64_t*>(&cap->m_offsets[0]);
-		int64_t* durs = reinterpret_cast<int64_t*>(&cap->m_durations[0]);
+
+		//Fill durations and offsets
+		if(g_hasAvx2)
+			FillWaveformHeadersAVX2((int64_t*)&cap->m_offsets[0], (int64_t*)&cap->m_durations[0], num_per_segment);
+		else
+			FillWaveformHeaders((int64_t*)&cap->m_offsets[0], (int64_t*)&cap->m_durations[0], num_per_segment);
+
+		//Convert raw ADC samples to volts
 		float* samps = reinterpret_cast<float*>(&cap->m_samples[0]);
 		if(m_highDefinition)
 		{
 			int16_t* base = wdata + j*num_per_segment;
 
 			for(unsigned int k=0; k<num_per_segment; k++)
-			{
-				offs[k] = k;
-				durs[k] = 1;
 				samps[k] = base[k] * v_gain - v_off;
-			}
 		}
 		else
 		{
-			int8_t* base = bdata + j*num_per_segment;
-
-			for(unsigned int k=0; k<num_per_segment; k++)
-			{
-				offs[k] = k;
-				durs[k] = 1;
-				samps[k] = base[k] * v_gain - v_off;
-			}
+			if(g_hasAvx2)
+				Convert8BitSamplesAVX2(samps, bdata + j*num_per_segment, v_gain, v_off, num_per_segment);
+			else
+				Convert8BitSamples(samps, bdata + j*num_per_segment, v_gain, v_off, num_per_segment);
 		}
 
 		ret.push_back(cap);
@@ -1326,6 +1325,123 @@ vector<WaveformBase*> LeCroyOscilloscope::ProcessAnalogWaveform(
 	return ret;
 }
 
+/**
+	@brief Converts 8-bit ADC samples to floating point
+ */
+void LeCroyOscilloscope::Convert8BitSamples(float* pout, int8_t* pin, float gain, float offset, size_t count)
+{
+	for(unsigned int k=0; k<count; k++)
+		pout[k] = pin[k] * gain - offset;
+}
+
+/**
+	@brief Optimized version of Convert8BitSamples()
+ */
+__attribute__((target("avx2")))
+void LeCroyOscilloscope::Convert8BitSamplesAVX2(float* pout, int8_t* pin, float gain, float offset, size_t count)
+{
+	unsigned int end = count - (count % 32);
+
+	__m256 gains = { gain, gain, gain, gain, gain, gain, gain, gain };
+	__m256 offsets = { offset, offset, offset, offset, offset, offset, offset, offset };
+
+	for(unsigned int k=0; k<count; k += 32)
+	{
+		//This is likely a lot faster, but assumes we have 64 byte alignment on pin which is not guaranteed.
+		//TODO: fix alignment
+		//__m256i raw_samples = _mm256_load_si256(reinterpret_cast<__m256i*>(pin + k));
+
+		//Load all 32 raw ADC samples, without assuming alignment
+		__m256i raw_samples = _mm256_loadu_si256(reinterpret_cast<__m256i*>(pin + k));
+
+		//Extract the low and high 16 samples from the block
+		__m128i block01_x8 = _mm256_extracti128_si256(raw_samples, 0);
+		__m128i block23_x8 = _mm256_extracti128_si256(raw_samples, 1);
+
+		//Swap the low and high halves of these vectors
+		//Ugly casting needed because all permute instrinsics expect float/double datatypes
+		__m128i block10_x8 = _mm_castpd_si128(_mm_permute_pd(_mm_castsi128_pd(block01_x8), 1));
+		__m128i block32_x8 = _mm_castpd_si128(_mm_permute_pd(_mm_castsi128_pd(block23_x8), 1));
+
+		//Divide into blocks of 8 samples and sign extend to 32 bit
+		__m256i block0_int = _mm256_cvtepi8_epi32(block01_x8);
+		__m256i block1_int = _mm256_cvtepi8_epi32(block10_x8);
+		__m256i block2_int = _mm256_cvtepi8_epi32(block23_x8);
+		__m256i block3_int = _mm256_cvtepi8_epi32(block32_x8);
+
+		//Convert the 32-bit int blocks to float.
+		//Apparently there's no direct epi8 to ps conversion instruction.
+		__m256 block0_float = _mm256_cvtepi32_ps(block0_int);
+		__m256 block1_float = _mm256_cvtepi32_ps(block1_int);
+		__m256 block2_float = _mm256_cvtepi32_ps(block2_int);
+		__m256 block3_float = _mm256_cvtepi32_ps(block3_int);
+
+		//Woo! We've finally got floating point data. Now we can do the fun part.
+		block0_float = _mm256_mul_ps(block0_float, gains);
+		block1_float = _mm256_mul_ps(block1_float, gains);
+		block2_float = _mm256_mul_ps(block2_float, gains);
+		block3_float = _mm256_mul_ps(block3_float, gains);
+
+		block0_float = _mm256_sub_ps(block0_float, offsets);
+		block1_float = _mm256_sub_ps(block1_float, offsets);
+		block2_float = _mm256_sub_ps(block2_float, offsets);
+		block3_float = _mm256_sub_ps(block3_float, offsets);
+
+		//All done, store back to the output buffer
+		_mm256_store_ps(pout + k, 		block0_float);
+		_mm256_store_ps(pout + k + 8,	block1_float);
+		_mm256_store_ps(pout + k + 16,	block2_float);
+		_mm256_store_ps(pout + k + 24,	block3_float);
+	}
+
+	//Get any extras we didn't get in the SIMD loop
+	for(unsigned int k=end; k<count; k++)
+		pout[k] = pin[k] * gain - offset;
+}
+
+/**
+	@brief Fills offsets with 0...n and durations to all 1s
+ */
+void LeCroyOscilloscope::FillWaveformHeaders(int64_t* offs, int64_t* durs, size_t count)
+{
+	for(unsigned int k=0; k<count; k++)
+	{
+		offs[k] = k;
+		durs[k] = 1;
+	}
+}
+
+/**
+	@brief Optimized version of FillWaveformHeaders()
+ */
+__attribute__((target("avx2")))
+void LeCroyOscilloscope::FillWaveformHeadersAVX2(int64_t* offs, int64_t* durs, size_t count)
+{
+	int64_t ones_x4[] = {1, 1, 1, 1};
+	int64_t fours_x4[] = {4, 4, 4, 4};
+	int64_t count_x4[] = {0, 1, 2, 3};
+
+	__m256i all_ones = _mm256_load_si256(reinterpret_cast<__m256i*>(ones_x4));
+	__m256i all_fours = _mm256_load_si256(reinterpret_cast<__m256i*>(fours_x4));
+	__m256i counts = _mm256_load_si256(reinterpret_cast<__m256i*>(count_x4));
+
+	//Unroll four stores to each array per loop iteration
+	unsigned int end = count - (count % 4);
+	for(unsigned int k=0; k<end; k+= 4)
+	{
+		_mm256_store_si256(reinterpret_cast<__m256i*>(durs + k), all_ones);
+		_mm256_store_si256(reinterpret_cast<__m256i*>(offs + k), counts);
+		counts = _mm256_add_epi64(counts, all_fours);
+	}
+
+	//Get any extras we didn't get in the SIMD loop
+	for(unsigned int k=end; k<count; k++)
+	{
+		offs[k] = k;
+		durs[k] = 1;
+	}
+}
+
 map<int, DigitalWaveform*> LeCroyOscilloscope::ProcessDigitalWaveform(
 	string& data,
 	time_t ttime,
@@ -1515,6 +1631,7 @@ bool LeCroyOscilloscope::AcquireData(bool toQueue)
 	//Process analog waveforms
 	vector< vector<WaveformBase*> > waveforms;
 	waveforms.resize(m_analogChannelCount);
+	#pragma omp parallel for
 	for(unsigned int i=0; i<m_analogChannelCount; i++)
 	{
 		if(enabled[i])

diff --git a/scopehal/LeCroyOscilloscope.h b/scopehal/LeCroyOscilloscope.h
@@ -197,6 +197,11 @@ class LeCroyOscilloscope
 		time_t ttime,
 		double basetime);
 
+	void FillWaveformHeaders(int64_t* offs, int64_t* durs, size_t count);
+	void FillWaveformHeadersAVX2(int64_t* offs, int64_t* durs, size_t count);
+	void Convert8BitSamples(float* pout, int8_t* pin, float gain, float offset, size_t count);
+	void Convert8BitSamplesAVX2(float* pout, int8_t* pin, float gain, float offset, size_t count);
+
 	//hardware analog channel count, independent of LA option etc
 	unsigned int m_analogChannelCount;
 	unsigned int m_digitalChannelCount;

diff --git a/scopehal/scopehal.cpp b/scopehal/scopehal.cpp
@@ -54,6 +54,10 @@
 
 using namespace std;
 
+bool g_hasAvx512F = false;
+bool g_hasAvx512DQ = false;
+bool g_hasAvx512VL = false;
+bool g_hasAvx2 = false;
 
 /**
 	@brief Static initialization for SCPI transports
@@ -71,11 +75,31 @@ void TransportStaticInit()
 #endif
 }
 
+void DetectCPUFeatures()
+{
+	//Check CPU features
+	g_hasAvx512F = __builtin_cpu_supports("avx512f");
+	g_hasAvx512VL = __builtin_cpu_supports("avx512vl");
+	g_hasAvx512DQ = __builtin_cpu_supports("avx512dq");
+	g_hasAvx2 = __builtin_cpu_supports("avx2");
+
+	if(g_hasAvx2)
+		LogDebug("CPU supports AVX2\n");
+	if(g_hasAvx512F)
+		LogDebug("CPU supports AVX512F\n");
+	if(g_hasAvx512DQ)
+		LogDebug("CPU supports AVX512DQ\n");
+	if(g_hasAvx512VL)
+		LogDebug("CPU supports AVX512VL\n");
+}
+
 /**
 	@brief Static initialization for oscilloscopes
  */
 void DriverStaticInit()
 {
+	DetectCPUFeatures();
+
 	AddDriverClass(AgilentOscilloscope);
 	AddDriverClass(AntikernelLabsOscilloscope);
 	AddDriverClass(AntikernelLogicAnalyzer);

diff --git a/scopehal/scopehal.h b/scopehal/scopehal.h
@@ -85,5 +85,11 @@ void TransportStaticInit();
 void DriverStaticInit();
 
 void InitializePlugins();
+void DetectCPUFeatures();
+
+extern bool g_hasAvx512F;
+extern bool g_hasAvx512VL;
+extern bool g_hasAvx512DQ;
+extern bool g_hasAvx2;
 
 #endif