Skip to content
Permalink

Comparing changes

Choose two branches to see what’s changed or to start a new pull request. If you need to, you can also compare across forks or learn more about diff comparisons.

Open a pull request

Create a new pull request by comparing changes across two branches. If you need to, you can also compare across forks. Learn more about diff comparisons here.
base repository: ngscopeclient/scopehal
Failed to load repositories. Confirm that selected base ref is valid, then try again.
Loading
base: 1e7ba074cfd0
Choose a base ref
...
head repository: ngscopeclient/scopehal
Failed to load repositories. Confirm that selected head ref is valid, then try again.
Loading
compare: 30ed2c393219
Choose a head ref
  • 2 commits
  • 3 files changed
  • 1 contributor

Commits on Nov 30, 2020

  1. Copy the full SHA
    625dc6c View commit details
  2. Copy the full SHA
    30ed2c3 View commit details
Showing with 166 additions and 16 deletions.
  1. +1 −1 scopeprotocols/EmphasisRemovalFilter.cpp
  2. +146 −13 scopeprotocols/TappedDelayLineFilter.cpp
  3. +19 −2 scopeprotocols/TappedDelayLineFilter.h
2 changes: 1 addition & 1 deletion scopeprotocols/EmphasisRemovalFilter.cpp
Original file line number Diff line number Diff line change
@@ -180,7 +180,7 @@ void EmphasisRemovalFilter::Refresh()
//Run the actual filter
float vmin;
float vmax;
TappedDelayLineFilter::DoFilterKernel(tap_count, tap_delay, taps, din, cap, vmin, vmax);
TappedDelayLineFilter::DoFilterKernel(tap_delay, taps, din, cap, vmin, vmax);

//Calculate bounds
m_max = max(m_max, vmax);
159 changes: 146 additions & 13 deletions scopeprotocols/TappedDelayLineFilter.cpp
Original file line number Diff line number Diff line change
@@ -29,6 +29,7 @@

#include "scopeprotocols.h"
#include "TappedDelayLineFilter.h"
#include <immintrin.h>

using namespace std;

@@ -38,7 +39,6 @@ using namespace std;
TappedDelayLineFilter::TappedDelayLineFilter(const string& color)
: Filter(OscilloscopeChannel::CHANNEL_TYPE_ANALOG, color, CAT_MATH)
, m_tapDelayName("Tap Delay")
, m_tapCountName("Tap Count")
, m_tap0Name("Tap Value 0")
, m_tap1Name("Tap Value 1")
, m_tap2Name("Tap Value 2")
@@ -58,9 +58,6 @@ TappedDelayLineFilter::TappedDelayLineFilter(const string& color)
m_parameters[m_tapDelayName] = FilterParameter(FilterParameter::TYPE_INT, Unit(Unit::UNIT_FS));
m_parameters[m_tapDelayName].SetIntVal(200000);

m_parameters[m_tapCountName] = FilterParameter(FilterParameter::TYPE_INT, Unit(Unit::UNIT_COUNTS));
m_parameters[m_tapCountName].SetIntVal(1);

m_parameters[m_tap0Name] = FilterParameter(FilterParameter::TYPE_FLOAT, Unit(Unit::UNIT_COUNTS));
m_parameters[m_tap0Name].SetFloatVal(1);

@@ -176,8 +173,6 @@ void TappedDelayLineFilter::Refresh()

//Get the tap config
int64_t tap_delay = m_parameters[m_tapDelayName].GetIntVal();
int64_t tap_count = m_parameters[m_tapCountName].GetIntVal();
tap_count = min(tap_count, (int64_t)8);

//Extract tap values
float taps[8] =
@@ -195,7 +190,7 @@ void TappedDelayLineFilter::Refresh()
//Run the actual filter
float vmin;
float vmax;
DoFilterKernel(tap_count, tap_delay, taps, din, cap, vmin, vmax);
DoFilterKernel(tap_delay, taps, din, cap, vmin, vmax);

//Calculate bounds
m_max = max(m_max, vmax);
@@ -205,7 +200,20 @@ void TappedDelayLineFilter::Refresh()
}

/**
	@brief Runs the tapped delay line filter, selecting the implementation at run time.

	Calls DoFilterKernelAVX2() when g_hasAvx2 is set and DoFilterKernelGeneric() otherwise.
	tap_delay, taps, din, cap, vmin and vmax are passed through to the selected kernel unchanged.
 */
void TappedDelayLineFilter::DoFilterKernel(
int64_t tap_count,
int64_t tap_delay,
float* taps,
AnalogWaveform* din,
AnalogWaveform* cap,
float& vmin,
float& vmax)
{
//Prefer the vectorized kernel when the global AVX2 flag says it is usable
if(g_hasAvx2)
DoFilterKernelAVX2(tap_delay, taps, din, cap, vmin, vmax);
else
DoFilterKernelGeneric(tap_delay, taps, din, cap, vmin, vmax);
}

void TappedDelayLineFilter::DoFilterKernelGeneric(
int64_t tap_delay,
float* taps,
AnalogWaveform* din,
@@ -223,21 +231,146 @@ void TappedDelayLineFilter::DoFilterKernel(
size_t filterlen = 8*samples_per_tap;
size_t end = len - filterlen;
cap->Resize(end);
int64_t jstart = 8 - tap_count;

//Copy the timestamps
memcpy(&cap->m_offsets[0], &din->m_offsets[filterlen], end*sizeof(int64_t));
memcpy(&cap->m_durations[0], &din->m_durations[filterlen], end*sizeof(int64_t));

//Do the filter
//#pragma omp parallel for
for(size_t i=0; i<end; i++)
{
float v = 0;
for(int64_t j=jstart; j<tap_count; j++)
for(int64_t j=0; j<8; j++)
v += din->m_samples[i + j*samples_per_tap] * taps[7 - j];

vmin = min(vmin, v);
vmax = max(vmax, v);

cap->m_offsets[i] = din->m_offsets[i+filterlen];
cap->m_durations[i] = din->m_durations[i+filterlen];
cap->m_samples[i] = v;
}
}

/**
	@brief AVX2 implementation of the 8-tap delay line filter.

	No resampling is done: the tap spacing in samples is tap_delay / cap->m_timescale (integer division).
	Output sample i is the sum over j = 0..7 of din[i + j*samples_per_tap] * taps[7 - j].
	The output is 8*samples_per_tap samples shorter than the input, and its offsets/durations are copied
	from the input starting at that index.

	If no output sample can be produced (zero timescale, negative tap spacing, or an input no longer than
	the filter span) the output is resized to zero samples and vmin/vmax keep their neutral values.

	@param tap_delay	Delay between adjacent taps; divided by cap->m_timescale to get a sample count
	@param taps			Array of 8 tap coefficients
	@param din			Input waveform
	@param cap			Output waveform (resized by this function; m_timescale must already be set)
	@param vmin			Set to the smallest output sample, or FLT_MAX if there is no output
	@param vmax			Set to the largest output sample, or -FLT_MAX if there is no output
 */
__attribute__((target("avx2")))
void TappedDelayLineFilter::DoFilterKernelAVX2(
	int64_t tap_delay,
	float* taps,
	AnalogWaveform* din,
	AnalogWaveform* cap,
	float& vmin,
	float& vmax)
{
	//Neutral bounds, so a caller folding these into a running min/max is unaffected if we produce nothing
	vmin = FLT_MAX;
	vmax = -FLT_MAX;

	//A zero timescale cannot be turned into a sample count (and would be an integer divide by zero)
	if(cap->m_timescale == 0)
	{
		cap->Resize(0);
		return;
	}

	//For now, no resampling. Assume tap delay is an integer number of samples.
	const int64_t samples_per_tap = tap_delay / cap->m_timescale;
	const size_t len = din->m_samples.size();

	//Reject inputs that are not longer than the filter span (8*samples_per_tap >= len).
	//Without this check "len - filterlen" wraps around to a huge unsigned value.
	//Comparing against ceil(len/8) avoids overflowing the multiplication.
	if( (samples_per_tap < 0) || (static_cast<uint64_t>(samples_per_tap) >= (len + 7) / 8) )
	{
		cap->Resize(0);
		return;
	}

	//Setup (end is guaranteed to be at least 1 from here on)
	const size_t stride = static_cast<size_t>(samples_per_tap);
	const size_t filterlen = 8*stride;
	const size_t end = len - filterlen;
	cap->Resize(end);

	//Copy the timestamps
	memcpy(&cap->m_offsets[0], &din->m_offsets[filterlen], end*sizeof(int64_t));
	memcpy(&cap->m_durations[0], &din->m_durations[filterlen], end*sizeof(int64_t));

	//Reverse the taps
	const float taps_reversed[8] =
	{ taps[7], taps[6], taps[5], taps[4], taps[3], taps[2], taps[1], taps[0] };

	//I/O pointers
	//NOTE(review): the aligned load/store below rely on both sample buffers being 32-byte aligned - confirm
	//against the allocator used by AnalogWaveform.
	float* pin = reinterpret_cast<float*>(&din->m_samples[0]);
	float* pout = reinterpret_cast<float*>(&cap->m_samples[0]);
	const size_t end_rounded = end - (end % 8);
	size_t i = 0;

	__m256 vmin_x8 = _mm256_set1_ps(FLT_MAX);
	__m256 vmax_x8 = _mm256_set1_ps(-FLT_MAX);

	//The coefficients do not change between iterations, so broadcast them once up front
	const __m256 tap0 = _mm256_broadcast_ss(&taps_reversed[0]);
	const __m256 tap1 = _mm256_broadcast_ss(&taps_reversed[1]);
	const __m256 tap2 = _mm256_broadcast_ss(&taps_reversed[2]);
	const __m256 tap3 = _mm256_broadcast_ss(&taps_reversed[3]);
	const __m256 tap4 = _mm256_broadcast_ss(&taps_reversed[4]);
	const __m256 tap5 = _mm256_broadcast_ss(&taps_reversed[5]);
	const __m256 tap6 = _mm256_broadcast_ss(&taps_reversed[6]);
	const __m256 tap7 = _mm256_broadcast_ss(&taps_reversed[7]);

	//Vector loop.
	//The filter is hard to vectorize because of striding.
	//So rather than vectorizing the inner loop, we unroll it and vectorize 8 output samples at a time.
	for(; i<end_rounded; i += 8)
	{
		//Load the first half of the inputs.
		//The first load is at a multiple of 8 floats from the buffer start, so it is aligned if the buffer is.
		//The strided ones may not be, so they use unaligned loads.
		const float* base = pin + i;
		const __m256 vin0 = _mm256_load_ps(base);
		const __m256 vin1 = _mm256_loadu_ps(base + stride);
		const __m256 vin2 = _mm256_loadu_ps(base + 2*stride);
		const __m256 vin3 = _mm256_loadu_ps(base + 3*stride);

		//Calculate the results for the first half
		const __m256 prod0 = _mm256_mul_ps(vin0, tap0);
		const __m256 prod1 = _mm256_mul_ps(vin1, tap1);
		const __m256 prod2 = _mm256_mul_ps(vin2, tap2);
		const __m256 prod3 = _mm256_mul_ps(vin3, tap3);
		const __m256 v01 = _mm256_add_ps(prod0, prod1);
		const __m256 v23 = _mm256_add_ps(prod2, prod3);

		//Now we can load the second half and repeat the process
		const __m256 vin4 = _mm256_loadu_ps(base + 4*stride);
		const __m256 vin5 = _mm256_loadu_ps(base + 5*stride);
		const __m256 vin6 = _mm256_loadu_ps(base + 6*stride);
		const __m256 vin7 = _mm256_loadu_ps(base + 7*stride);

		//Calculate the results for the second half
		const __m256 prod4 = _mm256_mul_ps(vin4, tap4);
		const __m256 prod5 = _mm256_mul_ps(vin5, tap5);
		const __m256 prod6 = _mm256_mul_ps(vin6, tap6);
		const __m256 prod7 = _mm256_mul_ps(vin7, tap7);
		const __m256 v45 = _mm256_add_ps(prod4, prod5);
		const __m256 v67 = _mm256_add_ps(prod6, prod7);

		//Final summations (pairwise tree: (0+1 + 2+3) + (4+5 + 6+7))
		const __m256 v03 = _mm256_add_ps(v01, v23);
		const __m256 v47 = _mm256_add_ps(v45, v67);
		const __m256 sum = _mm256_add_ps(v03, v47);

		//Store the output
		_mm256_store_ps(pout + i, sum);

		//Calculate min/max
		vmin_x8 = _mm256_min_ps(vmin_x8, sum);
		vmax_x8 = _mm256_max_ps(vmax_x8, sum);
	}

	//Horizontal reduction of vector min/max
	alignas(32) float tmp_min[8];
	alignas(32) float tmp_max[8];
	_mm256_store_ps(tmp_min, vmin_x8);
	_mm256_store_ps(tmp_max, vmax_x8);
	for(int j=0; j<8; j++)
	{
		vmin = min(vmin, tmp_min[j]);
		vmax = max(vmax, tmp_max[j]);
	}

	//Catch stragglers at the end (fewer than 8 samples left over after the vector loop)
	for(; i<end; i++)
	{
		float v = pin[i] * taps_reversed[0];
		for(size_t j=1; j<8; j++)
			v += pin[i + j*stride] * taps_reversed[j];

		vmin = min(vmin, v);
		vmax = max(vmax, v);

		cap->m_samples[i] = v;
	}
}
21 changes: 19 additions & 2 deletions scopeprotocols/TappedDelayLineFilter.h
Original file line number Diff line number Diff line change
@@ -35,6 +35,9 @@
#ifndef TappedDelayLineFilter_h
#define TappedDelayLineFilter_h

/**
@brief Performs an 8-tap FIR filter. The delay must be an integer multiple of the sampling period.
*/
class TappedDelayLineFilter : public Filter
{
public:
@@ -58,7 +61,6 @@ class TappedDelayLineFilter : public Filter
PROTOCOL_DECODER_INITPROC(TappedDelayLineFilter)

static void DoFilterKernel(
int64_t tap_count,
int64_t tap_delay,
float* taps,
AnalogWaveform* din,
@@ -68,13 +70,28 @@ class TappedDelayLineFilter : public Filter

protected:

static void DoFilterKernelGeneric(
int64_t tap_delay,
float* taps,
AnalogWaveform* din,
AnalogWaveform* cap,
float& vmin,
float& vmax);

static void DoFilterKernelAVX2(
int64_t tap_delay,
float* taps,
AnalogWaveform* din,
AnalogWaveform* cap,
float& vmin,
float& vmax);

float m_min;
float m_max;
float m_range;
float m_offset;

std::string m_tapDelayName;
std::string m_tapCountName;
std::string m_tap0Name;
std::string m_tap1Name;
std::string m_tap2Name;