Skip to content
Permalink

Comparing changes

Choose two branches to see what’s changed or to start a new pull request. If you need to, you can also or learn more about diff comparisons.

Open a pull request

Create a new pull request by comparing changes across two branches. If you need to, you can also . Learn more about diff comparisons here.
base repository: ngscopeclient/scopehal
Failed to load repositories. Confirm that selected base ref is valid, then try again.
Loading
base: 45cb126e0250
Choose a base ref
...
head repository: ngscopeclient/scopehal
Failed to load repositories. Confirm that selected head ref is valid, then try again.
Loading
compare: cfe08ff0ebad
Choose a head ref
  • 1 commit
  • 2 files changed
  • 1 contributor

Commits on Dec 19, 2020

  1. Verified

    This commit was created on GitHub.com and signed with GitHub’s verified signature. The key has expired.
    Copy the full SHA
    cfe08ff View commit details
Showing with 62 additions and 16 deletions.
  1. +22 −7 scopeprotocols/FIRFilter.cpp
  2. +40 −9 scopeprotocols/kernels/FIRFilter.cl
29 changes: 22 additions & 7 deletions scopeprotocols/FIRFilter.cpp
Original file line number Diff line number Diff line change
@@ -278,12 +278,24 @@ void FIRFilter::DoFilterKernelOpenCL(
size_t filterlen = coefficients.size();
size_t end = len - filterlen;

//Round size up to next multiple of block size
//(must equal BLOCK_SIZE in kernel)
const size_t blocksize = 1024;
size_t globalsize = (end + blocksize);
globalsize -= (globalsize % blocksize);

//Allocate min/max buffer (first stage reduction on GPU, rest on CPU)
size_t nblocks = globalsize / blocksize;
vector<float> minmax;
minmax.resize(2 * nblocks);

try
{
//Allocate memory and copy to the GPU
cl::Buffer inbuf(*g_clContext, din->m_samples.begin(), din->m_samples.end(), true, true, NULL);
cl::Buffer coeffbuf(*g_clContext, coefficients.begin(), coefficients.end(), true, true, NULL);
cl::Buffer outbuf(*g_clContext, cap->m_samples.begin(), cap->m_samples.end(), false, true, NULL);
cl::Buffer minmaxbuf(*g_clContext, minmax.begin(), minmax.end(), false, true, NULL);

//Run the filter
cl::CommandQueue queue(*g_clContext, g_contextDevices[0], 0);
@@ -292,25 +304,28 @@ void FIRFilter::DoFilterKernelOpenCL(
m_kernel->setArg(2, outbuf);
m_kernel->setArg(3, filterlen);
m_kernel->setArg(4, end);
queue.enqueueNDRangeKernel(*m_kernel, cl::NullRange, cl::NDRange(end, 1), cl::NullRange, NULL);
m_kernel->setArg(5, minmaxbuf);
queue.enqueueNDRangeKernel(
*m_kernel, cl::NullRange, cl::NDRange(globalsize, 1), cl::NDRange(blocksize, 1), NULL);

//Map/unmap the buffer to synchronize output with the CPU
void* ptr = queue.enqueueMapBuffer(outbuf, true, CL_MAP_READ, 0, end * sizeof(float));
void* ptr2 = queue.enqueueMapBuffer(minmaxbuf, true, CL_MAP_READ, 0, 2 * nblocks * sizeof(float));
queue.enqueueUnmapMemObject(outbuf, ptr);
queue.enqueueUnmapMemObject(minmaxbuf, ptr2);
}
catch(const cl::Error& e)
{
LogError("OpenCL error: %s (%d)\n", e.what(), e.err() );
LogFatal("OpenCL error: %s (%d)\n", e.what(), e.err() );
}

//Final reduction CPU-side for now
//Final reduction stage CPU-side
vmin = FLT_MAX;
vmax = -FLT_MAX;
for(size_t i=0; i<end; i++)
for(size_t i=0; i<nblocks; i++)
{
float v = cap->m_samples[i];
vmin = min(vmin, v);
vmax = max(vmax, v);
vmin = min(vmin, minmax[i*2]);
vmax = max(vmax, minmax[i*2 + 1]);
}
}
#endif
49 changes: 40 additions & 9 deletions scopeprotocols/kernels/FIRFilter.cl
Original file line number Diff line number Diff line change
@@ -27,23 +27,54 @@
* *
***********************************************************************************************************************/

//local size must equal this
#define BLOCK_SIZE 1024

__kernel void FIRFilter(
__global const float* din,
__constant const float* coefficients,
__global float* dout,
unsigned long filterlen,
unsigned long end
unsigned long end,
__global float* minmaxbuf
)
{
//Make sure we're actually in the block
__local float temp[BLOCK_SIZE];
unsigned long i = get_global_id(0);
if(i > end)
return;
unsigned long nblock = i / BLOCK_SIZE;

//Make sure we're actually in the block before executing
if(i < end)
{
//FIR filter core
float v = 0;
for(unsigned long j=0; j<filterlen; j++)
v += din[i+j] * coefficients[j];

//Save in shared memory for the reduction, then global memory for output
temp[get_local_id(0)] = v;
dout[i] = v;
}

//Min/max reduction in first thread of the block
barrier(CLK_LOCAL_MEM_FENCE);
if(get_local_id(0) == 0)
{
float vmin = FLT_MAX;
float vmax = -FLT_MAX;

for(unsigned long j=0; j<BLOCK_SIZE; j++)
{
unsigned long off = i+j;
if(off > end)
break;

//FIR reduction
float v = 0;
for(unsigned long j=0; j<filterlen; j++)
v += din[i+j] * coefficients[j];
float f = temp[j];
vmin = min(vmin, f);
vmax = max(vmax, f);
}

dout[i] = v;
minmaxbuf[nblock*2 + 0] = vmin;
minmaxbuf[nblock*2 + 1] = vmax;
}
}