Skip to content
Permalink

Comparing changes

Choose two branches to see what’s changed or to start a new pull request. If you need to, you can also compare across forks or learn more about diff comparisons.

Open a pull request

Create a new pull request by comparing changes across two branches. If you need to, you can also compare across forks. Learn more about diff comparisons here.
base repository: ngscopeclient/scopehal
Failed to load repositories. Confirm that selected base ref is valid, then try again.
Loading
base: 939873a948cb
Choose a base ref
...
head repository: ngscopeclient/scopehal
Failed to load repositories. Confirm that selected head ref is valid, then try again.
Loading
compare: b0a831b700d2
Choose a head ref
  • 1 commit
  • 1 file changed
  • 1 contributor

Commits on May 31, 2021

  1. Copy the full SHA
    b0a831b View commit details
Showing with 31 additions and 6 deletions.
  1. +31 −6 scopehal/Oscilloscope.cpp
37 changes: 31 additions & 6 deletions scopehal/Oscilloscope.cpp
Original file line number Diff line number Diff line change
@@ -949,17 +949,22 @@ void Oscilloscope::Convert16BitSamplesAVX2(
__m256 gains = { gain, gain, gain, gain, gain, gain, gain, gain };
__m256 offsets = { offset, offset, offset, offset, offset, offset, offset, offset };

for(size_t k=0; k<end; k += 16)
for(size_t k=0; k<end; k += 32)
{
//Load all 16 raw ADC samples, without assuming alignment
//Load all 32 raw ADC samples, without assuming alignment
//(on most modern Intel processors, load and loadu have same latency/throughput)
__m256i raw_samples = _mm256_loadu_si256(reinterpret_cast<__m256i*>(pin + k));
__m256i raw_samples1 = _mm256_loadu_si256(reinterpret_cast<__m256i*>(pin + k));
__m256i raw_samples2 = _mm256_loadu_si256(reinterpret_cast<__m256i*>(pin + k + 16));

//Fill duration
_mm256_store_si256(reinterpret_cast<__m256i*>(durs + k), all_ones);
_mm256_store_si256(reinterpret_cast<__m256i*>(durs + k + 4), all_ones);
_mm256_store_si256(reinterpret_cast<__m256i*>(durs + k + 8), all_ones);
_mm256_store_si256(reinterpret_cast<__m256i*>(durs + k + 12), all_ones);
_mm256_store_si256(reinterpret_cast<__m256i*>(durs + k + 16), all_ones);
_mm256_store_si256(reinterpret_cast<__m256i*>(durs + k + 20), all_ones);
_mm256_store_si256(reinterpret_cast<__m256i*>(durs + k + 24), all_ones);
_mm256_store_si256(reinterpret_cast<__m256i*>(durs + k + 28), all_ones);

//Fill offset
_mm256_store_si256(reinterpret_cast<__m256i*>(offs + k), counts);
@@ -970,30 +975,50 @@ void Oscilloscope::Convert16BitSamplesAVX2(
counts = _mm256_add_epi64(counts, all_fours);
_mm256_store_si256(reinterpret_cast<__m256i*>(offs + k + 12), counts);
counts = _mm256_add_epi64(counts, all_fours);
_mm256_store_si256(reinterpret_cast<__m256i*>(offs + k + 16), counts);
counts = _mm256_add_epi64(counts, all_fours);
_mm256_store_si256(reinterpret_cast<__m256i*>(offs + k + 20), counts);
counts = _mm256_add_epi64(counts, all_fours);
_mm256_store_si256(reinterpret_cast<__m256i*>(offs + k + 24), counts);
counts = _mm256_add_epi64(counts, all_fours);
_mm256_store_si256(reinterpret_cast<__m256i*>(offs + k + 28), counts);
counts = _mm256_add_epi64(counts, all_fours);

//Extract the low and high halves (8 samples each) from the input block
__m128i block0_i16 = _mm256_extracti128_si256(raw_samples, 0);
__m128i block1_i16 = _mm256_extracti128_si256(raw_samples, 1);
//Extract the low and high halves (8 samples each) from the input blocks
__m128i block0_i16 = _mm256_extracti128_si256(raw_samples1, 0);
__m128i block1_i16 = _mm256_extracti128_si256(raw_samples1, 1);
__m128i block2_i16 = _mm256_extracti128_si256(raw_samples2, 0);
__m128i block3_i16 = _mm256_extracti128_si256(raw_samples2, 1);

//Convert all four blocks from 16 to 32 bit, giving us four 8x int32 vectors
__m256i block0_i32 = _mm256_cvtepi16_epi32(block0_i16);
__m256i block1_i32 = _mm256_cvtepi16_epi32(block1_i16);
__m256i block2_i32 = _mm256_cvtepi16_epi32(block2_i16);
__m256i block3_i32 = _mm256_cvtepi16_epi32(block3_i16);

//Convert the 32-bit int blocks to fp32
//Sadly there's no direct epi16 to ps conversion instruction, hence the widening to epi32 above.
__m256 block0_float = _mm256_cvtepi32_ps(block0_i32);
__m256 block1_float = _mm256_cvtepi32_ps(block1_i32);
__m256 block2_float = _mm256_cvtepi32_ps(block2_i32);
__m256 block3_float = _mm256_cvtepi32_ps(block3_i32);

//Woo! We've finally got floating point data. Now we can do the fun part.
block0_float = _mm256_mul_ps(block0_float, gains);
block1_float = _mm256_mul_ps(block1_float, gains);
block2_float = _mm256_mul_ps(block2_float, gains);
block3_float = _mm256_mul_ps(block3_float, gains);

block0_float = _mm256_sub_ps(block0_float, offsets);
block1_float = _mm256_sub_ps(block1_float, offsets);
block2_float = _mm256_sub_ps(block2_float, offsets);
block3_float = _mm256_sub_ps(block3_float, offsets);

//All done, store back to the output buffer
_mm256_store_ps(pout + k, block0_float);
_mm256_store_ps(pout + k + 8, block1_float);
_mm256_store_ps(pout + k + 16, block2_float);
_mm256_store_ps(pout + k + 24, block3_float);
}

//Get any extras we didn't get in the SIMD loop