Skip to content
Permalink

Comparing changes

Choose two branches to see what’s changed or to start a new pull request. If you need to, you can also compare across forks or learn more about diff comparisons.

Open a pull request

Create a new pull request by comparing changes across two branches. If you need to, you can also compare across forks. Learn more about diff comparisons here.
base repository: ngscopeclient/scopehal
Failed to load repositories. Confirm that selected base ref is valid, then try again.
Loading
base: 7ecc7ce33677
Choose a base ref
...
head repository: ngscopeclient/scopehal
Failed to load repositories. Confirm that selected head ref is valid, then try again.
Loading
compare: 5b0af4207ab1
Choose a head ref
  • 4 commits
  • 2 files changed
  • 1 contributor

Commits on Dec 14, 2020

  1. Verified

    This commit was created on GitHub.com and signed with GitHub’s verified signature. The key has expired.
    Copy the full SHA
    979db5f View commit details
  2. Verified

    This commit was created on GitHub.com and signed with GitHub’s verified signature. The key has expired.
    Copy the full SHA
    065e1c5 View commit details
  3. Verified

    This commit was created on GitHub.com and signed with GitHub’s verified signature. The key has expired.
    Copy the full SHA
    576d477 View commit details
  4. Verified

    This commit was created on GitHub.com and signed with GitHub’s verified signature. The key has expired.
    Copy the full SHA
    5b0af42 View commit details
Showing with 154 additions and 2 deletions.
  1. +147 −2 scopeprotocols/FIRFilter.cpp
  2. +7 −0 scopeprotocols/FIRFilter.h
149 changes: 147 additions & 2 deletions scopeprotocols/FIRFilter.cpp
Original file line number Diff line number Diff line change
@@ -240,9 +240,9 @@ void FIRFilter::DoFilterKernel(
float& vmin,
float& vmax)
{
/*if(g_hasAvx2)
if(g_hasAvx2)
DoFilterKernelAVX2(coefficients, din, cap, vmin, vmax);
else*/
else
DoFilterKernelGeneric(coefficients, din, cap, vmin, vmax);
}

@@ -277,6 +277,151 @@ void FIRFilter::DoFilterKernelGeneric(
}
}

/**
	@brief Optimized FIR implementation

	Uses AVX2, but not AVX512 or FMA.

	Output sample i is the sum over j of din[i + j] * coefficients[j], produced for every i in
	[0, len - filterlen), where len is the number of input samples and filterlen is the number of taps.

	The bulk of the output is computed 64 samples per iteration (eight accumulators of eight floats each).
	The remaining (len - filterlen) % 64 samples are computed by a scalar loop at the end.

	If there are no more input samples than taps, nothing is written and vmin / vmax are left untouched.

	@param coefficients		Filter taps
	@param din				Input waveform. m_samples is read as a packed array of floats.
	@param cap				Output waveform. m_samples is written as a packed array of floats.
							NOTE(review): assumed to already hold at least (len - filterlen) samples, and to be
							32-byte aligned since aligned vector stores are used - confirm against the caller.
	@param vmin				In/out. Replaced by the smallest output sample if that is lower than the incoming value.
	@param vmax				In/out. Replaced by the largest output sample if that is higher than the incoming value.
							Neither vmin nor vmax is reset here, so the caller has to seed them.
 */
__attribute__((target("avx2")))
void FIRFilter::DoFilterKernelAVX2(
	vector<float>& coefficients,
	AnalogWaveform* din,
	AnalogWaveform* cap,
	float& vmin,
	float& vmax)
{
	//Save some sizes
	const size_t len = din->m_samples.size();
	const size_t filterlen = coefficients.size();

	//No more input samples than taps (or no input at all): there is no output to produce.
	//Bail out before the unsigned subtraction below can wrap around to a huge value,
	//and before we take the address of element 0 of a possibly empty buffer.
	if(len <= filterlen)
		return;

	const size_t end = len - filterlen;

	//The vector loop unconditionally reads coefficients[0].
	//For a zero-tap filter skip it and let the scalar loop produce the (all zero) output.
	const size_t end_rounded = (filterlen == 0) ? 0 : (end - (end % 64));

	float* pin = reinterpret_cast<float*>(&din->m_samples[0]);
	float* pout = reinterpret_cast<float*>(&cap->m_samples[0]);

	//Running per-lane min/max of everything the vector loop has produced so far
	__m256 vmin_x8 = _mm256_set1_ps(FLT_MAX);
	__m256 vmax_x8 = _mm256_set1_ps(-FLT_MAX);

	//Vectorized and unrolled outer loop
	size_t i=0;
	for(; i<end_rounded; i += 64)
	{
		float* base = pin + i;

		//First tap: initializes the accumulators, so no separate zeroing pass is needed
		__m256 coeff = _mm256_set1_ps(coefficients[0]);

		__m256 vin_a = _mm256_loadu_ps(base + 0);
		__m256 vin_b = _mm256_loadu_ps(base + 8);
		__m256 vin_c = _mm256_loadu_ps(base + 16);
		__m256 vin_d = _mm256_loadu_ps(base + 24);
		__m256 vin_e = _mm256_loadu_ps(base + 32);
		__m256 vin_f = _mm256_loadu_ps(base + 40);
		__m256 vin_g = _mm256_loadu_ps(base + 48);
		__m256 vin_h = _mm256_loadu_ps(base + 56);

		__m256 v_a = _mm256_mul_ps(coeff, vin_a);
		__m256 v_b = _mm256_mul_ps(coeff, vin_b);
		__m256 v_c = _mm256_mul_ps(coeff, vin_c);
		__m256 v_d = _mm256_mul_ps(coeff, vin_d);
		__m256 v_e = _mm256_mul_ps(coeff, vin_e);
		__m256 v_f = _mm256_mul_ps(coeff, vin_f);
		__m256 v_g = _mm256_mul_ps(coeff, vin_g);
		__m256 v_h = _mm256_mul_ps(coeff, vin_h);

		//Subsequent taps
		//Input is read at an arbitrary offset of j samples, hence the unaligned loads
		for(size_t j=1; j<filterlen; j++)
		{
			coeff = _mm256_set1_ps(coefficients[j]);

			vin_a = _mm256_loadu_ps(base + j + 0);
			vin_b = _mm256_loadu_ps(base + j + 8);
			vin_c = _mm256_loadu_ps(base + j + 16);
			vin_d = _mm256_loadu_ps(base + j + 24);
			vin_e = _mm256_loadu_ps(base + j + 32);
			vin_f = _mm256_loadu_ps(base + j + 40);
			vin_g = _mm256_loadu_ps(base + j + 48);
			vin_h = _mm256_loadu_ps(base + j + 56);

			//Separate multiply and add since FMA is not assumed to be available
			__m256 prod_a = _mm256_mul_ps(coeff, vin_a);
			__m256 prod_b = _mm256_mul_ps(coeff, vin_b);
			__m256 prod_c = _mm256_mul_ps(coeff, vin_c);
			__m256 prod_d = _mm256_mul_ps(coeff, vin_d);
			__m256 prod_e = _mm256_mul_ps(coeff, vin_e);
			__m256 prod_f = _mm256_mul_ps(coeff, vin_f);
			__m256 prod_g = _mm256_mul_ps(coeff, vin_g);
			__m256 prod_h = _mm256_mul_ps(coeff, vin_h);

			v_a = _mm256_add_ps(prod_a, v_a);
			v_b = _mm256_add_ps(prod_b, v_b);
			v_c = _mm256_add_ps(prod_c, v_c);
			v_d = _mm256_add_ps(prod_d, v_d);
			v_e = _mm256_add_ps(prod_e, v_e);
			v_f = _mm256_add_ps(prod_f, v_f);
			v_g = _mm256_add_ps(prod_g, v_g);
			v_h = _mm256_add_ps(prod_h, v_h);
		}

		//Store the output
		//Aligned stores: i is always a multiple of 64 samples, so this only requires the buffer itself to be aligned
		_mm256_store_ps(pout + i + 0, v_a);
		_mm256_store_ps(pout + i + 8, v_b);
		_mm256_store_ps(pout + i + 16, v_c);
		_mm256_store_ps(pout + i + 24, v_d);
		_mm256_store_ps(pout + i + 32, v_e);
		_mm256_store_ps(pout + i + 40, v_f);
		_mm256_store_ps(pout + i + 48, v_g);
		_mm256_store_ps(pout + i + 56, v_h);

		//Calculate min/max: First level
		__m256 min_ab = _mm256_min_ps(v_a, v_b);
		__m256 min_cd = _mm256_min_ps(v_c, v_d);
		__m256 min_ef = _mm256_min_ps(v_e, v_f);
		__m256 min_gh = _mm256_min_ps(v_g, v_h);

		__m256 max_ab = _mm256_max_ps(v_a, v_b);
		__m256 max_cd = _mm256_max_ps(v_c, v_d);
		__m256 max_ef = _mm256_max_ps(v_e, v_f);
		__m256 max_gh = _mm256_max_ps(v_g, v_h);

		//Min/max: second level
		__m256 min_abcd = _mm256_min_ps(min_ab, min_cd);
		__m256 min_efgh = _mm256_min_ps(min_ef, min_gh);
		__m256 max_abcd = _mm256_max_ps(max_ab, max_cd);
		__m256 max_efgh = _mm256_max_ps(max_ef, max_gh);

		//Min/max: third level
		__m256 min_l3 = _mm256_min_ps(min_abcd, min_efgh);
		__m256 max_l3 = _mm256_max_ps(max_abcd, max_efgh);

		//Min/max: final reduction
		vmin_x8 = _mm256_min_ps(vmin_x8, min_l3);
		vmax_x8 = _mm256_max_ps(vmax_x8, max_l3);
	}

	//Horizontal reduction of vector min/max
	alignas(32) float tmp_min[8];
	alignas(32) float tmp_max[8];
	_mm256_store_ps(tmp_min, vmin_x8);
	_mm256_store_ps(tmp_max, vmax_x8);
	for(int j=0; j<8; j++)
	{
		vmin = min(vmin, tmp_min[j]);
		vmax = max(vmax, tmp_max[j]);
	}

	//Catch any stragglers
	//Must run up to "end", not "end_rounded": i already equals end_rounded when the vector loop exits,
	//so bounding this loop by end_rounded would silently drop the last (end % 64) output samples.
	for(; i<end; i++)
	{
		float v = 0;
		for(size_t j=0; j<filterlen; j++)
			v += din->m_samples[i + j] * coefficients[j];

		vmin = min(vmin, v);
		vmax = max(vmax, v);

		cap->m_samples[i] = v;
	}
}

/**
@brief Calculates FIR coefficients
7 changes: 7 additions & 0 deletions scopeprotocols/FIRFilter.h
Original file line number Diff line number Diff line change
@@ -93,6 +93,13 @@ class FIRFilter : public Filter
float& vmin,
float& vmax);

/**
	@brief AVX2 implementation of the FIR filter kernel (defined in FIRFilter.cpp)

	@param coefficients		Filter taps
	@param din				Input waveform whose m_samples are filtered
	@param cap				Output waveform whose m_samples receive the filtered data
	@param vmin				In/out: lowered to the smallest output sample if that is below the incoming value
	@param vmax				In/out: raised to the largest output sample if that is above the incoming value
 */
static void DoFilterKernelAVX2(
std::vector<float>& coefficients,
AnalogWaveform* din,
AnalogWaveform* cap,
float& vmin,
float& vmax);

float m_min;
float m_max;
float m_range;