ngscopeclient · Aug 7, 2020 · Aug 8, 2020 · Aug 8, 2020 · Aug 8, 2020
Showing with 131 additions and 74 deletions.

+3 −1 src/glscopeclient/WaveformArea.h

+104 −61 src/glscopeclient/WaveformArea_rendering.cpp

+2 −0 src/glscopeclient/glscopeclient.h

+11 −4 src/glscopeclient/main.cpp

+11 −8 src/glscopeclient/shaders/waveform-compute.glsl
diff --git a/src/glscopeclient/WaveformArea.h b/src/glscopeclient/WaveformArea.h
@@ -96,7 +96,8 @@ class WaveformRenderData
 	bool					m_geometryOK;
 
 	//SSBOs with waveform data
-	ShaderStorageBuffer		m_waveformStorageBuffer;
+	ShaderStorageBuffer		m_waveformXBuffer;
+	ShaderStorageBuffer		m_waveformYBuffer;
 	ShaderStorageBuffer		m_waveformConfigBuffer;
 	ShaderStorageBuffer		m_waveformIndexBuffer;
 
@@ -373,6 +374,7 @@ class WaveformArea : public Gtk::GLArea
 	float XAxisUnitsToPixels(int64_t t);
 	float XAxisUnitsToXPosition(int64_t t);
 	float PickStepSize(float volts_per_half_span, int min_steps = 2, int max_steps = 5);
+	size_t BinarySearchForGequal(float* buf, size_t len, float value);
 
 	void OnRemoveOverlay(ProtocolDecoder* decode);
 

diff --git a/src/glscopeclient/WaveformArea_rendering.cpp b/src/glscopeclient/WaveformArea_rendering.cpp
@@ -77,9 +77,7 @@ void WaveformArea::PrepareGeometry(WaveformRenderData* wdata)
 		return;
 	}
 
-	double xscale = pdat->m_timescale * m_group->m_pixelsPerXUnit;
-	float xoff = (pdat->m_triggerPhase - m_group->m_xAxisOffset) * m_group->m_pixelsPerXUnit;
-
+	float xscale = pdat->m_timescale * m_group->m_pixelsPerXUnit;
 	bool fft = IsFFT();
 
 	//Zero voltage level
@@ -96,64 +94,72 @@ void WaveformArea::PrepareGeometry(WaveformRenderData* wdata)
 			ybase = m_height - (m_overlayPositions[dynamic_cast<ProtocolDecoder*>(channel)] + 10);
 	}
 
-	//Calculate X/Y coordinate of each sample point
-	//TODO: some of this can probably move to GPU too?
-	vector<EmptyConstructorWrapper<float>> traceBuffer;
-	vector<EmptyConstructorWrapper<uint32_t>> indexBuffer;
-	double offset = channel->GetOffset();
+	float offset = channel->GetOffset();
+
+	//Y axis scaling in shader
+	float yoff = 0;
+	float yscale = 1;
 
+	//We need to stretch every sample to two samples, one at the very left and one at the very right,
+	//so interpolation works right.
+	//TODO: we can probably avoid this by rewriting the compute shader to not interpolate like this
+	//TODO: only add extra samples if the left and right values are not the same?
+	size_t realcount = count;
 	if(digdat)
-	{
-		//We need to stretch every sample to two samples, one at the very left and one at the very right,
-		//so interpolation works right.
-		//TODO: we can probably avoid this by rewriting the compute shader to not interpolate like this
-		//TODO: only add extra samples if the left and right values are not the same
-		size_t realcount = count;
 		count *= 2;
 
+	float* xBuffer = reinterpret_cast<float*>(aligned_alloc(32, count*sizeof(float)));
+	float* yBuffer = NULL;
+	bool needToFreeYBuffer = true;
+	uint32_t* indexBuffer = reinterpret_cast<uint32_t*>(aligned_alloc(32, m_width*sizeof(uint32_t)));
+
+	if(digdat)
+	{
 		float digheight;
 		if(channel == m_channel)
 			digheight = m_height - 5;
 		else
 			digheight = 20;
 
-		traceBuffer.resize(count*2);
-		indexBuffer.resize(m_width);
+		yBuffer = reinterpret_cast<float*>(aligned_alloc(32, count*sizeof(float)));
 
 		//#pragma omp parallel for
+		yoff = ybase;
+		yscale = digheight;
 		for(size_t j=0; j<realcount; j++)
 		{
 			int64_t off = digdat->m_offsets[j];
-			traceBuffer[j*4] = off * xscale + xoff;
-			traceBuffer[j*4 + 2] = (off + digdat->m_durations[j]) * xscale + xoff - 1;
+			xBuffer[j*2] 		= off;
+			xBuffer[j*2 + 1]	= off + digdat->m_durations[j];
 
-			float y = ybase + ( digdat->m_samples[j] ? digheight: 0 );
-			traceBuffer[j*4 + 1] = y;
-			traceBuffer[j*4 + 3] = y;
+			yBuffer[j*2] 		= digdat->m_samples[j];
+			yBuffer[j*2 + 1]	= digdat->m_samples[j];
 		}
 	}
 	else
 	{
-		traceBuffer.resize(count*2);
-		indexBuffer.resize(m_width);
+		float* psamps = reinterpret_cast<float*>(__builtin_assume_aligned(&andat->m_samples[0], 16));
+		float* pdst = reinterpret_cast<float*>(__builtin_assume_aligned(xBuffer, 32));
+		int64_t* psrc = reinterpret_cast<int64_t*>(__builtin_assume_aligned(&andat->m_offsets[0], 16));
+
+		//Not possible to push this to a compute shader without GL_ARB_gpu_shader_int64,
+		//which isn't well supported on integrated gfx yet :(
+		for(size_t j=0; j < count; j++)
+			pdst[j] 	= psrc[j];
 
-		//TODO: can we push this to a compute shader?
-		//This doesn't look too SIMD-friendly because the inputs aren't all flops
 		if(fft)
 		{
+			yBuffer = reinterpret_cast<float*>(aligned_alloc(32, count*sizeof(float)));
+			yscale = 1;
 			for(size_t j=0; j<count; j++)
-			{
-				traceBuffer[j*2] 		= andat->m_offsets[j] * xscale + xoff;
-				traceBuffer[j*2 + 1]	= DbToYPosition(-70 - (20 * log10(andat->m_samples[j])));	//TODO: don't hard code plot limits
-			}
+				yBuffer[j]	= DbToYPosition(-70 - (20 * log10(psamps[j])));	//TODO: don't hard code plot limits
 		}
 		else
 		{
-			for(size_t j=0; j<count; j++)
-			{
-				traceBuffer[j*2] 		= andat->m_offsets[j] * xscale + xoff;
-				traceBuffer[j*2 + 1]	= (m_pixelsPerVolt * (andat->m_samples[j] + offset)) + ybase;
-			}
+			yoff = ybase;
+			yscale = m_pixelsPerVolt;
+			needToFreeYBuffer = false;
+			yBuffer = psamps;
 		}
 	}
 
@@ -165,56 +171,92 @@ void WaveformArea::PrepareGeometry(WaveformRenderData* wdata)
 	//This is necessary since samples may be sparse and have arbitrary spacing between them, so we can't
 	//trivially map sample indexes to X pixel coordinates.
 	//TODO: can we parallelize this? move to a compute shader?
-	size_t nsample = 0;
+	float xoff = (pdat->m_triggerPhase - m_group->m_xAxisOffset) * m_group->m_pixelsPerXUnit;
 	for(int j=0; j<m_width; j++)
-	{
-		bool hit = false;
-
-		//Move forward until we find a sample that starts in the current column
-		for(; nsample < count-1; nsample ++)
-		{
-			//If the next sample ends after the start of the current pixel. stop
-			float end = traceBuffer[(nsample+1)*2];
-			if(end >= j)
-			{
-				//Start the current column at this sample
-				indexBuffer[j] = nsample;
-				hit = true;
-				break;
-			}
-		}
-
-		//Default to drawing nothing
-		if(!hit)
-			indexBuffer[j] = count;
-	}
+		indexBuffer[j] = BinarySearchForGequal(xBuffer, count, (j - xoff) / xscale);
 
 	dt = GetTime() - start;
 	m_indexTime += dt;
 	start = GetTime();
 
 	//Download it
-	wdata->m_waveformStorageBuffer.Bind();
-	glBufferData(GL_SHADER_STORAGE_BUFFER, traceBuffer.size()*sizeof(float), &traceBuffer[0], GL_STREAM_DRAW);
+	wdata->m_waveformXBuffer.Bind();
+	glBufferData(GL_SHADER_STORAGE_BUFFER, count*sizeof(float), xBuffer, GL_STREAM_DRAW);
+	wdata->m_waveformYBuffer.Bind();
+	glBufferData(GL_SHADER_STORAGE_BUFFER, count*sizeof(float), yBuffer, GL_STREAM_DRAW);
 
 	//Config stuff
-	uint32_t config[5];
+	uint32_t config[10];
+	float* fconfig = reinterpret_cast<float*>(config);
 	config[0] = m_height;							//windowHeight
 	config[1] = m_plotRight;						//windowWidth
 	config[2] = count;								//depth
 	config[3] = m_parent->GetTraceAlpha() * 256;	//alpha
 	config[4] = digdat ? 1 : 0;						//digital
+	fconfig[5] = xoff;								//xoff
+	fconfig[6] = xscale;							//xscale
+	fconfig[7] = yoff;								//ybase
+	fconfig[8] = yscale;							//yscale
+	fconfig[9] = offset;							//yoff
 	wdata->m_waveformConfigBuffer.Bind();
 	glBufferData(GL_SHADER_STORAGE_BUFFER, sizeof(config), config, GL_STREAM_DRAW);
 
 	//Indexing
 	wdata->m_waveformIndexBuffer.Bind();
-	glBufferData(GL_SHADER_STORAGE_BUFFER, indexBuffer.size()*sizeof(uint32_t), &indexBuffer[0], GL_STREAM_DRAW);
+	glBufferData(GL_SHADER_STORAGE_BUFFER, m_width*sizeof(uint32_t), indexBuffer, GL_STREAM_DRAW);
 
 	dt = GetTime() - start;
 	m_downloadTime += dt;
 
 	wdata->m_geometryOK = true;
+
+	free(xBuffer);
+	if(needToFreeYBuffer)
+		free(yBuffer);
+	free(indexBuffer);
+}
+
+/**
+	@brief Look for a value greater than or equal to "value" in buf and return the index
+ */
+size_t WaveformArea::BinarySearchForGequal(float* buf, size_t len, float value)
+{
+	size_t pos = len/2;
+	size_t last_lo = 0;
+	size_t last_hi = len-1;
+
+	//Clip if out of range
+	if(buf[0] >= value)
+		return 0;
+	if(buf[last_hi] < value)
+		return len;
+
+	while(true)
+	{
+		LogIndenter li;
+
+		//Stop if we've bracketed the target
+		if( (last_hi - last_lo) <= 1)
+			break;
+
+		//Move down
+		if(buf[pos] > value)
+		{
+			size_t delta = pos - last_lo;
+			last_hi = pos;
+			pos = last_lo + delta/2;
+		}
+
+		//Move up
+		else
+		{
+			size_t delta = last_hi - pos;
+			last_lo = pos;
+			pos = last_hi - delta/2;
+		}
+	}
+
+	return last_lo;
 }
 
 void WaveformArea::ResetTextureFiltering()
@@ -460,7 +502,8 @@ void WaveformArea::RenderTrace(WaveformRenderData* data)
 
 	m_waveformComputeProgram.Bind();
 	m_waveformComputeProgram.SetImageUniform(data->m_waveformTexture, "outputTex");
-	data->m_waveformStorageBuffer.BindBase(1);
+	data->m_waveformXBuffer.BindBase(1);
+	data->m_waveformYBuffer.BindBase(4);
 	data->m_waveformConfigBuffer.BindBase(2);
 	data->m_waveformIndexBuffer.BindBase(3);
 	m_waveformComputeProgram.DispatchCompute(numGroups, 1, 1);

diff --git a/src/glscopeclient/glscopeclient.h b/src/glscopeclient/glscopeclient.h
@@ -75,4 +75,6 @@
 
 double GetTime();
 
+extern bool g_hasAvx512F;
+
 #endif
diff --git a/src/glscopeclient/main.cpp b/src/glscopeclient/main.cpp
@@ -32,7 +32,7 @@
 	@author Andrew D. Zonenberg
 	@brief Program entry point
  */
- 
+
 #ifdef _WIN32
 #include <windows.h>
 #include <shlwapi.h>
@@ -54,6 +54,8 @@ int g_numDecodes = 0;
 
 ScopeApp* g_app = NULL;
 
+bool g_hasAvx512F = false;
+
 void help();
 
 void help()
@@ -159,25 +161,30 @@ int main(int argc, char* argv[])
 	//Set up logging
 	g_log_sinks.emplace(g_log_sinks.begin(), new ColoredSTDLogSink(console_verbosity));
 
+	//Check CPU features
+	g_hasAvx512F = __builtin_cpu_supports("avx512f");
+	if(g_hasAvx512F)
+		LogDebug("CPU supports AVX512F\n");
+
 	//Change to the binary's directory so we can use relative paths for external resources
 	//FIXME: portability warning: this only works on Linux
 #ifdef _WIN32
 	// Retrieve the file name of the current process image
 	TCHAR binPath[MAX_PATH];
-	
+
 	if( GetModuleFileName(NULL, binPath, MAX_PATH) == 0 )
 	{
 		LogError("Error: GetModuleFileName() failed.\n");
 		return 1;
 	}
-	
+
 	// Remove file name from path
 	if( !PathRemoveFileSpec(binPath) )
 	{
 		LogError("Error: PathRemoveFileSpec() failed.\n");
 		return 1;
 	}
-	
+
 	// Set it as current working directory
 	if( SetCurrentDirectory(binPath) == 0 )
 	{

diff --git a/src/glscopeclient/shaders/waveform-compute.glsl b/src/glscopeclient/shaders/waveform-compute.glsl
@@ -3,16 +3,14 @@
 //The output texture (for now, only alpha channel is used)
 layout(binding=0, rgba32f) uniform image2D outputTex;
 
-//Voltage data
-struct data_point
+layout(std430, binding=1) buffer waveform_x
 {
-	float x;		//x pixel position (fractional)
-	float voltage;	//y value of this sample, in pixels
+	float xpos[];		//x pixel position (fractional)
 };
 
-layout(std430, binding=1) buffer waveform
+layout(std430, binding=4) buffer waveform_y
 {
-	data_point data[];
+	float voltage[];	//y value of this sample, in pixels
 };
 
 //Global configuration for the run
@@ -23,6 +21,11 @@ layout(std430, binding=2) buffer config
 	uint memDepth;
 	uint alpha_scaled;
 	uint digital;
+	float xoff;
+	float xscale;
+	float ybase;
+	float yscale;
+	float yoff;
 };
 
 //Indexes so we know which samples go to which X pixel range
@@ -81,12 +84,12 @@ void main()
 
 	//Loop over the waveform, starting at the leftmost point that overlaps this column
 	uint istart = xind[gl_GlobalInvocationID.x];
-	vec2 left = vec2(data[istart].x, data[istart].voltage);
+	vec2 left = vec2(xpos[istart]*xscale + xoff, (voltage[istart] + yoff)*yscale + ybase);
 	vec2 right;
 	for(uint i=istart; i<(memDepth-1); i++)
 	{
 		//Fetch coordinates of the current and upcoming sample
-		right = vec2(data[i+1].x, data[i+1].voltage);
+		right = vec2(xpos[i+1]*xscale + xoff, (voltage[i+1] + yoff)*yscale + ybase);
 
 		//If the current point is right of us, stop
 		if(left.x > gl_GlobalInvocationID.x + 1)
Original file line number	Diff line number	Diff line change
		@@ -75,4 +75,6 @@

		double GetTime();

		extern bool g_hasAvx512F;

		#endif