ngscopeclient · Dec 19, 2020
Showing with 62 additions and 16 deletions.

+22 −7 scopeprotocols/FIRFilter.cpp

+40 −9 scopeprotocols/kernels/FIRFilter.cl
diff --git a/scopeprotocols/FIRFilter.cpp b/scopeprotocols/FIRFilter.cpp
@@ -278,12 +278,24 @@ void FIRFilter::DoFilterKernelOpenCL(
 	size_t filterlen = coefficients.size();
 	size_t end = len - filterlen;
 
+	//Round size up to next multiple of block size
+	//(must equal BLOCK_SIZE in kernel)
+	const size_t blocksize = 1024;
+	size_t globalsize = (end + blocksize);
+	globalsize -= (globalsize % blocksize);
+
+	//Allocate min/max buffer (first stage reduction on GPU, rest on CPU)
+	size_t nblocks = globalsize / blocksize;
+	vector<float> minmax;
+	minmax.resize(2 * nblocks);
+
 	try
 	{
 		//Allocate memory and copy to the GPU
 		cl::Buffer inbuf(*g_clContext, din->m_samples.begin(), din->m_samples.end(), true, true, NULL);
 		cl::Buffer coeffbuf(*g_clContext, coefficients.begin(), coefficients.end(), true, true, NULL);
 		cl::Buffer outbuf(*g_clContext, cap->m_samples.begin(), cap->m_samples.end(), false, true, NULL);
+		cl::Buffer minmaxbuf(*g_clContext, minmax.begin(), minmax.end(), false, true, NULL);
 
 		//Run the filter
 		cl::CommandQueue queue(*g_clContext, g_contextDevices[0], 0);
@@ -292,25 +304,28 @@ void FIRFilter::DoFilterKernelOpenCL(
 		m_kernel->setArg(2, outbuf);
 		m_kernel->setArg(3, filterlen);
 		m_kernel->setArg(4, end);
-		queue.enqueueNDRangeKernel(*m_kernel, cl::NullRange, cl::NDRange(end, 1), cl::NullRange, NULL);
+		m_kernel->setArg(5, minmaxbuf);
+		queue.enqueueNDRangeKernel(
+			*m_kernel, cl::NullRange, cl::NDRange(globalsize, 1), cl::NDRange(blocksize, 1), NULL);
 
 		//Map/unmap the buffer to synchronize output with the CPU
 		void* ptr = queue.enqueueMapBuffer(outbuf, true, CL_MAP_READ, 0, end * sizeof(float));
+		void* ptr2 = queue.enqueueMapBuffer(minmaxbuf, true, CL_MAP_READ, 0, 2 * nblocks * sizeof(float));
 		queue.enqueueUnmapMemObject(outbuf, ptr);
+		queue.enqueueUnmapMemObject(minmaxbuf, ptr2);
 	}
 	catch(const cl::Error& e)
 	{
-		LogError("OpenCL error: %s (%d)\n", e.what(), e.err() );
+		LogFatal("OpenCL error: %s (%d)\n", e.what(), e.err() );
 	}
 
-	//Final reduction CPU-side for now
+	//Final reduction stage CPU-side
 	vmin = FLT_MAX;
 	vmax = -FLT_MAX;
-	for(size_t i=0; i<end; i++)
+	for(size_t i=0; i<nblocks; i++)
 	{
-		float v = cap->m_samples[i];
-		vmin = min(vmin, v);
-		vmax = max(vmax, v);
+		vmin = min(vmin, minmax[i*2]);
+		vmax = max(vmax, minmax[i*2 + 1]);
 	}
 }
 #endif

diff --git a/scopeprotocols/kernels/FIRFilter.cl b/scopeprotocols/kernels/FIRFilter.cl
@@ -27,23 +27,54 @@
 *                                                                                                                      *
 ***********************************************************************************************************************/
 
+//local size must equal this
+#define BLOCK_SIZE 1024
+
 __kernel void FIRFilter(
 	__global const float* din,
 	__constant const float* coefficients,
 	__global float* dout,
 	unsigned long filterlen,
-	unsigned long end
+	unsigned long end,
+	__global float* minmaxbuf
 	)
 {
-	//Make sure we're actually in the block
+	__local float temp[BLOCK_SIZE];
 	unsigned long i = get_global_id(0);
-	if(i > end)
-		return;
+	unsigned long nblock = i / BLOCK_SIZE;
+
+	//Make sure we're actually in the block before executing
+	if(i < end)
+	{
+		//FIR filter core
+		float v = 0;
+		for(unsigned long j=0; j<filterlen; j++)
+			v += din[i+j] * coefficients[j];
+
+		//Save in shared memory for the reduction, then global memory for output
+		temp[get_local_id(0)] = v;
+		dout[i] = v;
+	}
+
+	//Min/max reduction in first thread of the block
+	barrier(CLK_LOCAL_MEM_FENCE);
+	if(get_local_id(0) == 0)
+	{
+		float vmin = FLT_MAX;
+		float vmax = -FLT_MAX;
+
+		for(unsigned long j=0; j<BLOCK_SIZE; j++)
+		{
+			unsigned long off = i+j;
+			if(off > end)
+				break;
 
-	//FIR reduction
-	float v = 0;
-	for(unsigned long j=0; j<filterlen; j++)
-		v += din[i+j] * coefficients[j];
+			float f = temp[j];
+			vmin = min(vmin, f);
+			vmax = max(vmax, f);
+		}
 
-	dout[i] = v;
+		minmaxbuf[nblock*2 + 0] = vmin;
+		minmaxbuf[nblock*2 + 1] = vmax;
+	}
 }