Skip to content
Permalink

Comparing changes

Choose two branches to see what’s changed or to start a new pull request. If you need to, you can also compare across forks or learn more about diff comparisons.

Open a pull request

Create a new pull request by comparing changes across two branches. If you need to, you can also compare across forks. Learn more about diff comparisons here.
base repository: ngscopeclient/scopehal
Failed to load repositories. Confirm that selected base ref is valid, then try again.
Loading
base: 3b9fc6c45c0a
Choose a base ref
...
head repository: ngscopeclient/scopehal
Failed to load repositories. Confirm that selected head ref is valid, then try again.
Loading
compare: 5f083bb11116
Choose a head ref
  • 3 commits
  • 2 files changed
  • 1 contributor

Commits on Aug 8, 2020

  1. Copy the full SHA
    39993e7 View commit details

Commits on Aug 9, 2020

  1. Copy the full SHA
    6c83322 View commit details
  2. Copy the full SHA
    5f083bb View commit details
Showing with 118 additions and 57 deletions.
  1. +114 −53 scopehal/LeCroyOscilloscope.cpp
  2. +4 −4 scopehal/LeCroyOscilloscope.h
167 changes: 114 additions & 53 deletions scopehal/LeCroyOscilloscope.cpp
Original file line number Diff line number Diff line change
@@ -32,8 +32,8 @@
#include "ProtocolDecoder.h"
#include "base64.h"
#include <locale>
#include <xmmintrin.h>
#include <immintrin.h>
#include <omp.h>

using namespace std;

@@ -1296,27 +1296,80 @@ vector<WaveformBase*> LeCroyOscilloscope::ProcessAnalogWaveform(

cap->Resize(num_per_segment);

//Fill durations and offsets
if(g_hasAvx2)
FillWaveformHeadersAVX2((int64_t*)&cap->m_offsets[0], (int64_t*)&cap->m_durations[0], num_per_segment);
else
FillWaveformHeaders((int64_t*)&cap->m_offsets[0], (int64_t*)&cap->m_durations[0], num_per_segment);

//Convert raw ADC samples to volts
//TODO: Optimized AVX conversion for 16-bit samples
float* samps = reinterpret_cast<float*>(&cap->m_samples[0]);
if(m_highDefinition)
{
int16_t* base = wdata + j*num_per_segment;

for(unsigned int k=0; k<num_per_segment; k++)
{
cap->m_offsets[k] = k;
cap->m_durations[k] = 1;
samps[k] = base[k] * v_gain - v_off;
}
}
else
{
if(g_hasAvx2)
Convert8BitSamplesAVX2(samps, bdata + j*num_per_segment, v_gain, v_off, num_per_segment);
{
//Divide large waveforms (>1M points) into blocks and multithread them
//TODO: tune split
if(num_per_segment > 1000000)
{
//Round blocks to multiples of 32 samples for clean vectorization
size_t numblocks = omp_get_max_threads() / 2; //don't run on all hyperthreads
size_t lastblock = numblocks - 1;
size_t blocksize = num_per_segment / numblocks;
blocksize = blocksize - (blocksize % 32);

#pragma omp parallel for
for(size_t i=0; i<numblocks; i++)
{
//Last block gets any extra that didn't divide evenly
size_t nsamp = blocksize;
if(i == lastblock)
nsamp = num_per_segment - i*blocksize;

Convert8BitSamplesAVX2(
(int64_t*)&cap->m_offsets[i*blocksize],
(int64_t*)&cap->m_durations[i*blocksize],
samps + i*blocksize,
bdata + j*num_per_segment + i*blocksize,
v_gain,
v_off,
nsamp,
i*blocksize);
}
}

//Small waveforms get done single threaded to avoid overhead
else
{
Convert8BitSamplesAVX2(
(int64_t*)&cap->m_offsets[0],
(int64_t*)&cap->m_durations[0],
samps,
bdata + j*num_per_segment,
v_gain,
v_off,
num_per_segment,
0);
}
}
else
Convert8BitSamples(samps, bdata + j*num_per_segment, v_gain, v_off, num_per_segment);
{
Convert8BitSamples(
(int64_t*)&cap->m_offsets[0],
(int64_t*)&cap->m_durations[0],
samps,
bdata + j*num_per_segment,
v_gain,
v_off,
num_per_segment,
0);
}
}

ret.push_back(cap);
@@ -1328,20 +1381,40 @@ vector<WaveformBase*> LeCroyOscilloscope::ProcessAnalogWaveform(
/**
@brief Converts 8-bit ADC samples to floating point
*/
void LeCroyOscilloscope::Convert8BitSamples(float* pout, int8_t* pin, float gain, float offset, size_t count)
/**
	@brief Converts 8-bit ADC samples to floating point and fills the matching offset/duration entries

	@param offs		Output: entry k is set to ibase + k
	@param durs		Output: every entry is set to 1
	@param pout		Output: entry k is set to pin[k] * gain - offset
	@param pin		Input samples (signed 8-bit)
	@param gain		Multiplier applied to each input sample
	@param offset	Value subtracted from each scaled sample
	@param count	Number of elements to process in each of the arrays
	@param ibase	Value written to offs[0]; subsequent entries count up from it
 */
void LeCroyOscilloscope::Convert8BitSamples(
	int64_t* offs, int64_t* durs, float* pout, int8_t* pin, float gain, float offset, size_t count, int64_t ibase)
{
	//Index with size_t to match the type of count: a 32-bit counter would wrap
	//before reaching a count above UINT_MAX and the loop would never terminate.
	for(size_t k=0; k<count; k++)
	{
		//Explicit cast keeps the addition in signed 64-bit arithmetic
		offs[k] = ibase + static_cast<int64_t>(k);
		durs[k] = 1;
		pout[k] = pin[k] * gain - offset;
	}
}

/**
@brief Optimized version of Convert8BitSamples()
*/
__attribute__((target("avx2")))
void LeCroyOscilloscope::Convert8BitSamplesAVX2(float* pout, int8_t* pin, float gain, float offset, size_t count)
void LeCroyOscilloscope::Convert8BitSamplesAVX2(
int64_t* offs, int64_t* durs, float* pout, int8_t* pin, float gain, float offset, size_t count, int64_t ibase)
{
unsigned int end = count - (count % 32);

int64_t ones_x4[] = {1, 1, 1, 1};
int64_t fours_x4[] = {4, 4, 4, 4};
int64_t count_x4[] =
{
ibase + 0,
ibase + 1,
ibase + 2,
ibase + 3
};

__m256i all_ones = _mm256_load_si256(reinterpret_cast<__m256i*>(ones_x4));
__m256i all_fours = _mm256_load_si256(reinterpret_cast<__m256i*>(fours_x4));
__m256i counts = _mm256_load_si256(reinterpret_cast<__m256i*>(count_x4));

__m256 gains = { gain, gain, gain, gain, gain, gain, gain, gain };
__m256 offsets = { offset, offset, offset, offset, offset, offset, offset, offset };

@@ -1354,6 +1427,16 @@ void LeCroyOscilloscope::Convert8BitSamplesAVX2(float* pout, int8_t* pin, float
//Load all 32 raw ADC samples, without assuming alignment
__m256i raw_samples = _mm256_loadu_si256(reinterpret_cast<__m256i*>(pin + k));

//Fill duration
_mm256_store_si256(reinterpret_cast<__m256i*>(durs + k), all_ones);
_mm256_store_si256(reinterpret_cast<__m256i*>(durs + k + 4), all_ones);
_mm256_store_si256(reinterpret_cast<__m256i*>(durs + k + 8), all_ones);
_mm256_store_si256(reinterpret_cast<__m256i*>(durs + k + 12), all_ones);
_mm256_store_si256(reinterpret_cast<__m256i*>(durs + k + 16), all_ones);
_mm256_store_si256(reinterpret_cast<__m256i*>(durs + k + 20), all_ones);
_mm256_store_si256(reinterpret_cast<__m256i*>(durs + k + 24), all_ones);
_mm256_store_si256(reinterpret_cast<__m256i*>(durs + k + 28), all_ones);

//Extract the low and high 16 samples from the block
__m128i block01_x8 = _mm256_extracti128_si256(raw_samples, 0);
__m128i block23_x8 = _mm256_extracti128_si256(raw_samples, 1);
@@ -1369,6 +1452,24 @@ void LeCroyOscilloscope::Convert8BitSamplesAVX2(float* pout, int8_t* pin, float
__m256i block2_int = _mm256_cvtepi8_epi32(block23_x8);
__m256i block3_int = _mm256_cvtepi8_epi32(block32_x8);

//Fill offset
_mm256_store_si256(reinterpret_cast<__m256i*>(offs + k), counts);
counts = _mm256_add_epi64(counts, all_fours);
_mm256_store_si256(reinterpret_cast<__m256i*>(offs + k + 4), counts);
counts = _mm256_add_epi64(counts, all_fours);
_mm256_store_si256(reinterpret_cast<__m256i*>(offs + k + 8), counts);
counts = _mm256_add_epi64(counts, all_fours);
_mm256_store_si256(reinterpret_cast<__m256i*>(offs + k + 12), counts);
counts = _mm256_add_epi64(counts, all_fours);
_mm256_store_si256(reinterpret_cast<__m256i*>(offs + k + 16), counts);
counts = _mm256_add_epi64(counts, all_fours);
_mm256_store_si256(reinterpret_cast<__m256i*>(offs + k + 20), counts);
counts = _mm256_add_epi64(counts, all_fours);
_mm256_store_si256(reinterpret_cast<__m256i*>(offs + k + 24), counts);
counts = _mm256_add_epi64(counts, all_fours);
_mm256_store_si256(reinterpret_cast<__m256i*>(offs + k + 28), counts);
counts = _mm256_add_epi64(counts, all_fours);

//Convert the 32-bit int blocks to float.
//Apparently there's no direct epi8 to ps conversion instruction.
__m256 block0_float = _mm256_cvtepi32_ps(block0_int);
@@ -1394,51 +1495,12 @@ void LeCroyOscilloscope::Convert8BitSamplesAVX2(float* pout, int8_t* pin, float
_mm256_store_ps(pout + k + 24, block3_float);
}

//Get any extras we didn't get in the SIMD loop
for(unsigned int k=end; k<count; k++)
pout[k] = pin[k] * gain - offset;
}

/**
	@brief Fills offsets with 0...count-1 and durations with all 1s

	@param offs		Output: entry k is set to k
	@param durs		Output: every entry is set to 1
	@param count	Number of entries to write in each of the arrays
 */
void LeCroyOscilloscope::FillWaveformHeaders(int64_t* offs, int64_t* durs, size_t count)
{
	//Index with size_t to match the type of count: a 32-bit counter would wrap
	//before reaching a count above UINT_MAX and the loop would never terminate.
	for(size_t k=0; k<count; k++)
	{
		offs[k] = static_cast<int64_t>(k);
		durs[k] = 1;
	}
}

/**
@brief Optimized version of FillWaveformHeaders()
*/
__attribute__((target("avx2")))
void LeCroyOscilloscope::FillWaveformHeadersAVX2(int64_t* offs, int64_t* durs, size_t count)
{
int64_t ones_x4[] = {1, 1, 1, 1};
int64_t fours_x4[] = {4, 4, 4, 4};
int64_t count_x4[] = {0, 1, 2, 3};

__m256i all_ones = _mm256_load_si256(reinterpret_cast<__m256i*>(ones_x4));
__m256i all_fours = _mm256_load_si256(reinterpret_cast<__m256i*>(fours_x4));
__m256i counts = _mm256_load_si256(reinterpret_cast<__m256i*>(count_x4));

//Unroll four stores to each array per loop iteration
unsigned int end = count - (count % 4);
for(unsigned int k=0; k<end; k+= 4)
{
_mm256_store_si256(reinterpret_cast<__m256i*>(durs + k), all_ones);
_mm256_store_si256(reinterpret_cast<__m256i*>(offs + k), counts);
counts = _mm256_add_epi64(counts, all_fours);
}

//Get any extras we didn't get in the SIMD loop
for(unsigned int k=end; k<count; k++)
{
offs[k] = k;
offs[k] = ibase + k;
durs[k] = 1;
pout[k] = pin[k] * gain - offset;
}
}

@@ -1631,7 +1693,6 @@ bool LeCroyOscilloscope::AcquireData(bool toQueue)
//Process analog waveforms
vector< vector<WaveformBase*> > waveforms;
waveforms.resize(m_analogChannelCount);
#pragma omp parallel for
for(unsigned int i=0; i<m_analogChannelCount; i++)
{
if(enabled[i])
8 changes: 4 additions & 4 deletions scopehal/LeCroyOscilloscope.h
Original file line number Diff line number Diff line change
@@ -197,10 +197,10 @@ class LeCroyOscilloscope
time_t ttime,
double basetime);

void FillWaveformHeaders(int64_t* offs, int64_t* durs, size_t count);
void FillWaveformHeadersAVX2(int64_t* offs, int64_t* durs, size_t count);
void Convert8BitSamples(float* pout, int8_t* pin, float gain, float offset, size_t count);
void Convert8BitSamplesAVX2(float* pout, int8_t* pin, float gain, float offset, size_t count);
void Convert8BitSamples(
int64_t* offs, int64_t* durs, float* pout, int8_t* pin, float gain, float offset, size_t count, int64_t ibase);
void Convert8BitSamplesAVX2(
int64_t* offs, int64_t* durs, float* pout, int8_t* pin, float gain, float offset, size_t count, int64_t ibase);

//hardware analog channel count, independent of LA option etc
unsigned int m_analogChannelCount;