Skip to content
Permalink

Comparing changes

Choose two branches to see what’s changed or to start a new pull request. If you need to, you can also or learn more about diff comparisons.

Open a pull request

Create a new pull request by comparing changes across two branches. If you need to, you can also . Learn more about diff comparisons here.
base repository: ngscopeclient/scopehal
Failed to load repositories. Confirm that selected base ref is valid, then try again.
Loading
base: 9edea105d887
Choose a base ref
...
head repository: ngscopeclient/scopehal
Failed to load repositories. Confirm that selected head ref is valid, then try again.
Loading
compare: 3b9fc6c45c0a
Choose a head ref
  • 2 commits
  • 4 files changed
  • 1 contributor

Commits on Aug 8, 2020

  1. Moved feature detection to scopehal. Added AVX2 implementation of som…

    …e waveform processing in LeCroyOscilloscope.
    azonenberg committed Aug 8, 2020
    Copy the full SHA
    85f8409 View commit details
  2. Copy the full SHA
    3b9fc6c View commit details
Showing with 167 additions and 15 deletions.
  1. +132 −15 scopehal/LeCroyOscilloscope.cpp
  2. +5 −0 scopehal/LeCroyOscilloscope.h
  3. +24 −0 scopehal/scopehal.cpp
  4. +6 −0 scopehal/scopehal.h
147 changes: 132 additions & 15 deletions scopehal/LeCroyOscilloscope.cpp
Original file line number Diff line number Diff line change
@@ -32,6 +32,8 @@
#include "ProtocolDecoder.h"
#include "base64.h"
#include <locale>
#include <xmmintrin.h>
#include <immintrin.h>

using namespace std;

@@ -1292,32 +1294,29 @@ vector<WaveformBase*> LeCroyOscilloscope::ProcessAnalogWaveform(
else
cap->m_startPicoseconds = static_cast<int64_t>(basetime * 1e12f);

//Convert raw ADC samples to volts
cap->Resize(num_per_segment);
int64_t* offs = reinterpret_cast<int64_t*>(&cap->m_offsets[0]);
int64_t* durs = reinterpret_cast<int64_t*>(&cap->m_durations[0]);

//Fill durations and offsets
if(g_hasAvx2)
FillWaveformHeadersAVX2((int64_t*)&cap->m_offsets[0], (int64_t*)&cap->m_durations[0], num_per_segment);
else
FillWaveformHeaders((int64_t*)&cap->m_offsets[0], (int64_t*)&cap->m_durations[0], num_per_segment);

//Convert raw ADC samples to volts
float* samps = reinterpret_cast<float*>(&cap->m_samples[0]);
if(m_highDefinition)
{
int16_t* base = wdata + j*num_per_segment;

for(unsigned int k=0; k<num_per_segment; k++)
{
offs[k] = k;
durs[k] = 1;
samps[k] = base[k] * v_gain - v_off;
}
}
else
{
int8_t* base = bdata + j*num_per_segment;

for(unsigned int k=0; k<num_per_segment; k++)
{
offs[k] = k;
durs[k] = 1;
samps[k] = base[k] * v_gain - v_off;
}
if(g_hasAvx2)
Convert8BitSamplesAVX2(samps, bdata + j*num_per_segment, v_gain, v_off, num_per_segment);
else
Convert8BitSamples(samps, bdata + j*num_per_segment, v_gain, v_off, num_per_segment);
}

ret.push_back(cap);
@@ -1326,6 +1325,123 @@ vector<WaveformBase*> LeCroyOscilloscope::ProcessAnalogWaveform(
return ret;
}

/**
@brief Converts 8-bit ADC samples to floating point
*/
void LeCroyOscilloscope::Convert8BitSamples(float* pout, int8_t* pin, float gain, float offset, size_t count)
{
for(unsigned int k=0; k<count; k++)
pout[k] = pin[k] * gain - offset;
}

/**
@brief Optimized version of Convert8BitSamples()
*/
__attribute__((target("avx2")))
void LeCroyOscilloscope::Convert8BitSamplesAVX2(float* pout, int8_t* pin, float gain, float offset, size_t count)
{
unsigned int end = count - (count % 32);

__m256 gains = { gain, gain, gain, gain, gain, gain, gain, gain };
__m256 offsets = { offset, offset, offset, offset, offset, offset, offset, offset };

for(unsigned int k=0; k<count; k += 32)
{
//This is likely a lot faster, but assumes we have 64 byte alignment on pin which is not guaranteed.
//TODO: fix alignment
//__m256i raw_samples = _mm256_load_si256(reinterpret_cast<__m256i*>(pin + k));

//Load all 32 raw ADC samples, without assuming alignment
__m256i raw_samples = _mm256_loadu_si256(reinterpret_cast<__m256i*>(pin + k));

//Extract the low and high 16 samples from the block
__m128i block01_x8 = _mm256_extracti128_si256(raw_samples, 0);
__m128i block23_x8 = _mm256_extracti128_si256(raw_samples, 1);

//Swap the low and high halves of these vectors
//Ugly casting needed because all permute instrinsics expect float/double datatypes
__m128i block10_x8 = _mm_castpd_si128(_mm_permute_pd(_mm_castsi128_pd(block01_x8), 1));
__m128i block32_x8 = _mm_castpd_si128(_mm_permute_pd(_mm_castsi128_pd(block23_x8), 1));

//Divide into blocks of 8 samples and sign extend to 32 bit
__m256i block0_int = _mm256_cvtepi8_epi32(block01_x8);
__m256i block1_int = _mm256_cvtepi8_epi32(block10_x8);
__m256i block2_int = _mm256_cvtepi8_epi32(block23_x8);
__m256i block3_int = _mm256_cvtepi8_epi32(block32_x8);

//Convert the 32-bit int blocks to float.
//Apparently there's no direct epi8 to ps conversion instruction.
__m256 block0_float = _mm256_cvtepi32_ps(block0_int);
__m256 block1_float = _mm256_cvtepi32_ps(block1_int);
__m256 block2_float = _mm256_cvtepi32_ps(block2_int);
__m256 block3_float = _mm256_cvtepi32_ps(block3_int);

//Woo! We've finally got floating point data. Now we can do the fun part.
block0_float = _mm256_mul_ps(block0_float, gains);
block1_float = _mm256_mul_ps(block1_float, gains);
block2_float = _mm256_mul_ps(block2_float, gains);
block3_float = _mm256_mul_ps(block3_float, gains);

block0_float = _mm256_sub_ps(block0_float, offsets);
block1_float = _mm256_sub_ps(block1_float, offsets);
block2_float = _mm256_sub_ps(block2_float, offsets);
block3_float = _mm256_sub_ps(block3_float, offsets);

//All done, store back to the output buffer
_mm256_store_ps(pout + k, block0_float);
_mm256_store_ps(pout + k + 8, block1_float);
_mm256_store_ps(pout + k + 16, block2_float);
_mm256_store_ps(pout + k + 24, block3_float);
}

//Get any extras we didn't get in the SIMD loop
for(unsigned int k=end; k<count; k++)
pout[k] = pin[k] * gain - offset;
}

/**
@brief Fills offsets with 0...n and durations to all 1s
*/
void LeCroyOscilloscope::FillWaveformHeaders(int64_t* offs, int64_t* durs, size_t count)
{
for(unsigned int k=0; k<count; k++)
{
offs[k] = k;
durs[k] = 1;
}
}

/**
@brief Optimized version of FillWaveformHeaders()
*/
__attribute__((target("avx2")))
void LeCroyOscilloscope::FillWaveformHeadersAVX2(int64_t* offs, int64_t* durs, size_t count)
{
int64_t ones_x4[] = {1, 1, 1, 1};
int64_t fours_x4[] = {4, 4, 4, 4};
int64_t count_x4[] = {0, 1, 2, 3};

__m256i all_ones = _mm256_load_si256(reinterpret_cast<__m256i*>(ones_x4));
__m256i all_fours = _mm256_load_si256(reinterpret_cast<__m256i*>(fours_x4));
__m256i counts = _mm256_load_si256(reinterpret_cast<__m256i*>(count_x4));

//Unroll four stores to each array per loop iteration
unsigned int end = count - (count % 4);
for(unsigned int k=0; k<end; k+= 4)
{
_mm256_store_si256(reinterpret_cast<__m256i*>(durs + k), all_ones);
_mm256_store_si256(reinterpret_cast<__m256i*>(offs + k), counts);
counts = _mm256_add_epi64(counts, all_fours);
}

//Get any extras we didn't get in the SIMD loop
for(unsigned int k=end; k<count; k++)
{
offs[k] = k;
durs[k] = 1;
}
}

map<int, DigitalWaveform*> LeCroyOscilloscope::ProcessDigitalWaveform(
string& data,
time_t ttime,
@@ -1515,6 +1631,7 @@ bool LeCroyOscilloscope::AcquireData(bool toQueue)
//Process analog waveforms
vector< vector<WaveformBase*> > waveforms;
waveforms.resize(m_analogChannelCount);
#pragma omp parallel for
for(unsigned int i=0; i<m_analogChannelCount; i++)
{
if(enabled[i])
5 changes: 5 additions & 0 deletions scopehal/LeCroyOscilloscope.h
Original file line number Diff line number Diff line change
@@ -197,6 +197,11 @@ class LeCroyOscilloscope
time_t ttime,
double basetime);

void FillWaveformHeaders(int64_t* offs, int64_t* durs, size_t count);
void FillWaveformHeadersAVX2(int64_t* offs, int64_t* durs, size_t count);
void Convert8BitSamples(float* pout, int8_t* pin, float gain, float offset, size_t count);
void Convert8BitSamplesAVX2(float* pout, int8_t* pin, float gain, float offset, size_t count);

//hardware analog channel count, independent of LA option etc
unsigned int m_analogChannelCount;
unsigned int m_digitalChannelCount;
24 changes: 24 additions & 0 deletions scopehal/scopehal.cpp
Original file line number Diff line number Diff line change
@@ -54,6 +54,10 @@

using namespace std;

bool g_hasAvx512F = false;
bool g_hasAvx512DQ = false;
bool g_hasAvx512VL = false;
bool g_hasAvx2 = false;

/**
@brief Static initialization for SCPI transports
@@ -71,11 +75,31 @@ void TransportStaticInit()
#endif
}

void DetectCPUFeatures()
{
//Check CPU features
g_hasAvx512F = __builtin_cpu_supports("avx512f");
g_hasAvx512VL = __builtin_cpu_supports("avx512vl");
g_hasAvx512DQ = __builtin_cpu_supports("avx512dq");
g_hasAvx2 = __builtin_cpu_supports("avx2");

if(g_hasAvx2)
LogDebug("CPU supports AVX2\n");
if(g_hasAvx512F)
LogDebug("CPU supports AVX512F\n");
if(g_hasAvx512DQ)
LogDebug("CPU supports AVX512DQ\n");
if(g_hasAvx512VL)
LogDebug("CPU supports AVX512VL\n");
}

/**
@brief Static initialization for oscilloscopes
*/
void DriverStaticInit()
{
DetectCPUFeatures();

AddDriverClass(AgilentOscilloscope);
AddDriverClass(AntikernelLabsOscilloscope);
AddDriverClass(AntikernelLogicAnalyzer);
6 changes: 6 additions & 0 deletions scopehal/scopehal.h
Original file line number Diff line number Diff line change
@@ -85,5 +85,11 @@ void TransportStaticInit();
void DriverStaticInit();

void InitializePlugins();
void DetectCPUFeatures();

extern bool g_hasAvx512F;
extern bool g_hasAvx512VL;
extern bool g_hasAvx512DQ;
extern bool g_hasAvx2;

#endif