ngscopeclient · Sep 5, 2020 · Sep 5, 2020
Showing with 30 additions and 61 deletions.

+2 −3 src/glscopeclient/WaveformArea.h

+22 −54 src/glscopeclient/WaveformArea_rendering.cpp

+6 −4 src/glscopeclient/shaders/waveform-compute.glsl
diff --git a/src/glscopeclient/WaveformArea.h b/src/glscopeclient/WaveformArea.h
@@ -109,10 +109,11 @@ class WaveformRenderData
 	size_t					m_count;
 
 	//OpenGL-mapped buffers for the data
-	float*					m_mappedXBuffer;
+	int64_t*				m_mappedXBuffer;
 	float*					m_mappedYBuffer;
 	uint32_t*				m_mappedIndexBuffer;
 	uint32_t*				m_mappedConfigBuffer;
+	int64_t*				m_mappedConfigBuffer64;
 	float*					m_mappedFloatConfigBuffer;
 
 	//Map all buffers for download
@@ -403,8 +404,6 @@ class WaveformArea : public Gtk::GLArea
 	float PickStepSize(float volts_per_half_span, int min_steps = 2, int max_steps = 5);
 	template<class T> size_t BinarySearchForGequal(T* buf, size_t len, T value);
 	float GetValueAtTime(int64_t time_ps);
-	void Int64ToFloat(float* dst, int64_t* src, size_t len);
-	void Int64ToFloatAVX512(float* dst, int64_t* src, size_t len);
 
 	void OnRemoveOverlay(StreamDescriptor filter);
 

diff --git a/src/glscopeclient/WaveformArea_rendering.cpp b/src/glscopeclient/WaveformArea_rendering.cpp
@@ -68,11 +68,12 @@ void WaveformRenderData::MapBuffers(size_t width)
 			m_count *= 2;
 	}
 
-	m_mappedXBuffer = (float*)m_waveformXBuffer.Map(m_count*sizeof(float), GL_READ_WRITE);
+	m_mappedXBuffer = (int64_t*)m_waveformXBuffer.Map(m_count*sizeof(int64_t), GL_READ_WRITE);
 	m_mappedYBuffer = (float*)m_waveformYBuffer.Map(m_count*sizeof(float));
 	m_mappedIndexBuffer = (uint32_t*)m_waveformIndexBuffer.Map(width*sizeof(uint32_t));
-	m_mappedConfigBuffer = (uint32_t*)m_waveformConfigBuffer.Map(sizeof(float)*10);
+	m_mappedConfigBuffer = (uint32_t*)m_waveformConfigBuffer.Map(sizeof(float)*12);
 	m_mappedFloatConfigBuffer = (float*)m_mappedConfigBuffer;
+	m_mappedConfigBuffer64 = (int64_t*)m_mappedConfigBuffer;
 }
 
 void WaveformRenderData::UnmapBuffers()
@@ -165,16 +166,10 @@ void WaveformArea::PrepareGeometry(WaveformRenderData* wdata)
 	}
 	else
 	{
-		//Need AVX512DQ or AVX512VL for VCTVQQ2PS
-		//TODO: see if there is any way to speed this up at least a little on AVX2?
-		if(g_hasAvx512DQ || g_hasAvx512VL)
-			Int64ToFloatAVX512(wdata->m_mappedXBuffer, reinterpret_cast<int64_t*>(&andat->m_offsets[0]), wdata->m_count);
-		else
-			Int64ToFloat(wdata->m_mappedXBuffer, reinterpret_cast<int64_t*>(&andat->m_offsets[0]), wdata->m_count);
-
 		yscale = m_pixelsPerVolt;
 
 		//Copy the waveform
+		memcpy(wdata->m_mappedXBuffer, &andat->m_offsets[0], wdata->m_count*sizeof(int64_t));
 		memcpy(wdata->m_mappedYBuffer, &andat->m_samples[0], wdata->m_count*sizeof(float));
 	}
 
@@ -186,9 +181,14 @@ void WaveformArea::PrepareGeometry(WaveformRenderData* wdata)
 	//This is necessary since samples may be sparse and have arbitrary spacing between them, so we can't
 	//trivially map sample indexes to X pixel coordinates.
 	//TODO: can we parallelize this? move to a compute shader?
-	float xoff = (pdat->m_triggerPhase - m_group->m_xAxisOffset) * m_group->m_pixelsPerXUnit;
+	float xoff = pdat->m_triggerPhase * m_group->m_pixelsPerXUnit;
 	for(int j=0; j<m_width; j++)
-		wdata->m_mappedIndexBuffer[j] = BinarySearchForGequal(wdata->m_mappedXBuffer, wdata->m_count, (j - xoff) / xscale);
+	{
+		wdata->m_mappedIndexBuffer[j] = BinarySearchForGequal(
+			wdata->m_mappedXBuffer,
+			wdata->m_count,
+			(j /*- pdat->m_triggerPhase*/ + m_group->m_xAxisOffset) / pdat->m_timescale);
+	}
 
 	dt = GetTime() - start;
 	m_indexTime += dt;
@@ -200,54 +200,22 @@ void WaveformArea::PrepareGeometry(WaveformRenderData* wdata)
 
 	//Config stuff
 	//TODO: we should be able to only update this stuff if we pan/zoom, without redoing the waveform data itself
-	wdata->m_mappedConfigBuffer[0] = m_height;							//windowHeight
-	wdata->m_mappedConfigBuffer[1] = m_plotRight;						//windowWidth
-	wdata->m_mappedConfigBuffer[2] = wdata->m_count;					//depth
-	wdata->m_mappedFloatConfigBuffer[3] = alpha_scaled;					//alpha
-	wdata->m_mappedConfigBuffer[4] = digdat ? 1 : 0;					//digital
-	wdata->m_mappedFloatConfigBuffer[5] = xoff;							//xoff
-	wdata->m_mappedFloatConfigBuffer[6] = xscale;						//xscale
-	wdata->m_mappedFloatConfigBuffer[7] = ybase;						//ybase
-	wdata->m_mappedFloatConfigBuffer[8] = yscale;						//yscale
-	wdata->m_mappedFloatConfigBuffer[9] = yoff;							//yoff
+	wdata->m_mappedConfigBuffer64[0] = -m_group->m_xAxisOffset / pdat->m_timescale;			//innerXoff
+	wdata->m_mappedConfigBuffer[2] = m_height;							//windowHeight
+	wdata->m_mappedConfigBuffer[3] = m_plotRight;						//windowWidth
+	wdata->m_mappedConfigBuffer[4] = wdata->m_count;					//depth
+	wdata->m_mappedFloatConfigBuffer[5] = alpha_scaled;					//alpha
+	wdata->m_mappedConfigBuffer[6] = digdat ? 1 : 0;					//digital
+	wdata->m_mappedFloatConfigBuffer[7] = xoff;							//xoff
+	wdata->m_mappedFloatConfigBuffer[8] = xscale;						//xscale
+	wdata->m_mappedFloatConfigBuffer[9] = ybase;						//ybase
+	wdata->m_mappedFloatConfigBuffer[10] = yscale;						//yscale
+	wdata->m_mappedFloatConfigBuffer[11] = yoff;						//yoff
 
 	//Done
 	wdata->m_geometryOK = true;
 }
 
-/**
-	@brief Convert an array of int64_t's to floats
- */
-void WaveformArea::Int64ToFloat(float* dst, int64_t* src, size_t len)
-{
-	float* pdst = reinterpret_cast<float*>(__builtin_assume_aligned(dst, 32));
-	int64_t* psrc = reinterpret_cast<int64_t*>(__builtin_assume_aligned(src, 16));
-
-	//Not possible to push this to a compute shader without GL_ARB_gpu_shader_int64,
-	//which isn't well supported on integrated gfx yet :(
-	for(size_t j=0; j < len; j++)
-		pdst[j] 	= psrc[j];
-}
-
-__attribute__((target("avx512dq")))
-void WaveformArea::Int64ToFloatAVX512(float* dst, int64_t* src, size_t len)
-{
-	//Round length down to multiple of 8 so we can SIMD the loop
-	size_t len_rounded = len - (len % 8);
-
-	//Main unrolled loop
-	for(size_t j=0; j<len_rounded; j+= 8)
-	{
-		__m512i i64x8 = _mm512_load_epi64(src + j);
-		__m256 f32x8 = _mm512_cvt_roundepi64_ps(i64x8, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
-		_mm256_store_ps(dst + j, f32x8);
-	}
-
-	//Do anything we missed
-	for(size_t j=len_rounded; j < len; j++)
-		dst[j] 	= src[j];
-}
-
 /**
 	@brief Look for a value greater than or equal to "value" in buf and return the index
  */

diff --git a/src/glscopeclient/shaders/waveform-compute.glsl b/src/glscopeclient/shaders/waveform-compute.glsl
@@ -1,11 +1,12 @@
-#version 430
+#version 450
+#extension GL_ARB_gpu_shader_int64 : require
 
 //The output texture (for now, only alpha channel is used)
 layout(binding=0, rgba32f) uniform image2D outputTex;
 
 layout(std430, binding=1) buffer waveform_x
 {
-	float xpos[];		//x pixel position (fractional)
+	int64_t xpos[];		//x position, in time ticks
 };
 
 layout(std430, binding=4) buffer waveform_y
@@ -16,6 +17,7 @@ layout(std430, binding=4) buffer waveform_y
 //Global configuration for the run
 layout(std430, binding=2) buffer config
 {
+	int64_t innerXoff;
 	uint windowHeight;
 	uint windowWidth;
 	uint memDepth;
@@ -81,12 +83,12 @@ void main()
 
 	//Loop over the waveform, starting at the leftmost point that overlaps this column
 	uint istart = xind[gl_GlobalInvocationID.x];
-	vec2 left = vec2(xpos[istart]*xscale + xoff, (voltage[istart] + yoff)*yscale + ybase);
+	vec2 left = vec2(float(xpos[istart] + innerXoff) * xscale + xoff, (voltage[istart] + yoff)*yscale + ybase);
 	vec2 right;
 	for(uint i=istart; i<(memDepth-1); i++)
 	{
 		//Fetch coordinates of the current and upcoming sample
-		right = vec2(xpos[i+1]*xscale + xoff, (voltage[i+1] + yoff)*yscale + ybase);
+		right = vec2(float(xpos[i+1] + innerXoff)*xscale + xoff, (voltage[i+1] + yoff)*yscale + ybase);
 
 		//If the current point is right of us, stop
 		if(left.x > gl_GlobalInvocationID.x + 1)