Skip to content

Comparing changes

Choose two branches to see what’s changed or to start a new pull request. If you need to, you can also compare across forks or learn more about diff comparisons.

Open a pull request

Create a new pull request by comparing changes across two branches. If you need to, you can also compare across forks. Learn more about diff comparisons here.
base repository: ngscopeclient/scopehal
Failed to load repositories. Confirm that selected base ref is valid, then try again.
base: b790bfb8f053
Choose a base ref
head repository: ngscopeclient/scopehal
Failed to load repositories. Confirm that selected head ref is valid, then try again.
compare: 5be2100fff33
Choose a head ref
  • 1 commit
  • 2 files changed
  • 1 contributor

Commits on Aug 15, 2020

  1. Verified

    This commit was created on GitHub.com and signed with GitHub’s verified signature. The key has expired.
    Copy the full SHA
    5be2100 View commit details
Showing with 93 additions and 1 deletion.
  1. +90 −1 scopeprotocols/FFTDecoder.cpp
  2. +3 −0 scopeprotocols/FFTDecoder.h
91 changes: 90 additions & 1 deletion scopeprotocols/FFTDecoder.cpp
Original file line number Diff line number Diff line change
@@ -30,6 +30,7 @@
#include "../scopehal/scopehal.h"
#include "../scopehal/AlignedAllocator.h"
#include "FFTDecoder.h"
#include <immintrin.h>

using namespace std;

@@ -179,6 +180,19 @@ void FFTDecoder::Refresh()

//Normalize magnitudes
NormalizeOutputAVX2(cap, nouts, npoints);
NormalizeOutput(cap, nouts, npoints);


@brief Normalize FFT output (unoptimized C++ implementation)
void FFTDecoder::NormalizeOutput(AnalogWaveform* cap, size_t nouts, size_t npoints)
for(size_t i=0; i<nouts; i++)
cap->m_offsets[i] = i;
@@ -189,6 +203,81 @@ void FFTDecoder::Refresh()

cap->m_samples[i] = sqrtf(real*real + imag*imag) / npoints;

@brief Normalize FFT output (optimized AVX2 implementation)
void FFTDecoder::NormalizeOutputAVX2(AnalogWaveform* cap, size_t nouts, size_t npoints)
int64_t* offs = (int64_t*)&cap->m_offsets[0];
int64_t* durs = (int64_t*)&cap->m_durations[0];

size_t end = nouts - (nouts % 8);

int64_t __attribute__ ((aligned(32))) ones_x4[] = {1, 1, 1, 1};
int64_t __attribute__ ((aligned(32))) fours_x4[] = {4, 4, 4, 4};
int64_t __attribute__ ((aligned(32))) count_x4[] = {0, 1, 2, 3};

__m256i all_ones = _mm256_load_si256(reinterpret_cast<__m256i*>(ones_x4));
__m256i all_fours = _mm256_load_si256(reinterpret_cast<__m256i*>(fours_x4));
__m256i counts = _mm256_load_si256(reinterpret_cast<__m256i*>(count_x4));

float norm = 1.0f / npoints;
__m256 norm_f = { norm, norm, norm, norm, norm, norm, norm, norm };

float* pout = (float*)&cap->m_samples[0];

//Vectorized processing (8 samples per iteration)
for(size_t k=0; k<end; k += 8)
//Fill duration
_mm256_store_si256(reinterpret_cast<__m256i*>(durs + k), all_ones);
_mm256_store_si256(reinterpret_cast<__m256i*>(durs + k + 4), all_ones);

//Fill offset
_mm256_store_si256(reinterpret_cast<__m256i*>(offs + k), counts);
counts = _mm256_add_epi64(counts, all_fours);
_mm256_store_si256(reinterpret_cast<__m256i*>(offs + k + 4), counts);
counts = _mm256_add_epi64(counts, all_fours);

//Read interleaved real/imaginary FFT output (riririri riririri)
__m256 din0 = _mm256_load_ps(m_rdout + k*2);
__m256 din1 = _mm256_load_ps(m_rdout + k*2 + 8);

//Step 1: Shuffle 32-bit values within 128-bit lanes to get rriirrii rriirrii.
din0 = _mm256_permute_ps(din0, 0xd8);
din1 = _mm256_permute_ps(din1, 0xd8);

//Step 2: Shuffle 64-bit values to get rrrriiii rrrriiii.
__m256i block0 = _mm256_permute4x64_epi64(_mm256_castps_si256(din0), 0xd8);
__m256i block1 = _mm256_permute4x64_epi64(_mm256_castps_si256(din1), 0xd8);

//Step 3: Shuffle 128-bit values to get rrrrrrrr iiiiiiii.
__m256 real = _mm256_castsi256_ps(_mm256_permute2x128_si256(block0, block1, 0x20));
__m256 imag = _mm256_castsi256_ps(_mm256_permute2x128_si256(block0, block1, 0x31));

//Actual vector normalization
real = _mm256_mul_ps(real, real);
imag = _mm256_mul_ps(imag, imag);
__m256 sum = _mm256_add_ps(real, imag);
__m256 mag = _mm256_sqrt_ps(sum);
mag = _mm256_mul_ps(mag, norm_f);

_mm256_store_ps(pout + k, mag);

//Get any extras we didn't get in the SIMD loop
for(size_t k=end; k<nouts; k++)
cap->m_offsets[k] = k;
cap->m_durations[k] = 1;

float real = m_rdout[k*2];
float imag = m_rdout[k*2 + 1];

cap->m_samples[k] = sqrtf(real*real + imag*imag) / npoints;
3 changes: 3 additions & 0 deletions scopeprotocols/FFTDecoder.h
Original file line number Diff line number Diff line change
@@ -59,6 +59,9 @@ class FFTDecoder : public ProtocolDecoder

//Scalar implementation: for each of the nouts bins, writes the bin index as the offset and
//the magnitude divided by npoints as the sample
void NormalizeOutput(AnalogWaveform* cap, size_t nouts, size_t npoints);
//AVX2 implementation taking the same arguments: processes 8 bins per iteration and finishes
//the remaining bins with a scalar loop
void NormalizeOutputAVX2(AnalogWaveform* cap, size_t nouts, size_t npoints);

size_t m_cachedNumPoints;
float* m_rdin;
float* m_rdout;