@@ -34,13 +34,12 @@ CBUFFER_END
3434// channels packed together.
3535// The reason for separating channels is to reduce bank conflicts in the local data memory
3636// controller. A large stride will cause more threads to collide on the same memory bank.
37- //
38- // TODO: skip alpha for now, will probably be needed for AR though.
// One LDS cache per color channel (R, G, B, A). Keeping channels in separate
// arrays, rather than packing a whole pixel per slot, keeps the access stride
// small so fewer threads collide on the same LDS memory bank.
groupshared uint gs_cacheR[128];
groupshared uint gs_cacheG[128];
groupshared uint gs_cacheB[128];
groupshared uint gs_cacheA[128];
4241
43- float3 BlurPixels(float3 a, float3 b, float3 c, float3 d, float3 e, float3 f, float3 g, float3 h, float3 i)
42+ float4 BlurPixels(float4 a, float4 b, float4 c, float4 d, float4 e, float4 f, float4 g, float4 h, float4 i)
4443{
4544 return 0.27343750 * (e )
4645 + 0.21875000 * (d + f)
@@ -49,38 +48,41 @@ float3 BlurPixels(float3 a, float3 b, float3 c, float3 d, float3 e, float3 f, fl
4948 + 0.00390625 * (a + i);
5049}
5150
// Pack a horizontal pair of pixels into one LDS slot per channel.
// Each 32-bit cache entry holds two half-precision values:
// pixel1 in the low 16 bits, pixel2 in the high 16 bits.
void Store2Pixels(uint index, float4 pixel1, float4 pixel2)
{
    // f32tof16 is component-wise: convert both pixels at once, then merge.
    uint4 packed = f32tof16(pixel1) | (f32tof16(pixel2) << 16);
    gs_cacheR[index] = packed.x;
    gs_cacheG[index] = packed.y;
    gs_cacheB[index] = packed.z;
    gs_cacheA[index] = packed.w;
}
5858
// Unpack the horizontal pixel pair stored by Store2Pixels:
// pixel1 comes from the low 16 bits of each channel slot, pixel2 from the high.
void Load2Pixels(uint index, out float4 pixel1, out float4 pixel2)
{
    uint4 packed = uint4(gs_cacheR[index],
                         gs_cacheG[index],
                         gs_cacheB[index],
                         gs_cacheA[index]);
    // f16tof32 is component-wise and reads only the low 16 bits of each lane.
    pixel1 = f16tof32(packed);
    pixel2 = f16tof32(packed >> 16);
}
6768
// Cache a single pixel at full 32-bit precision, one channel per LDS array
// (used for the intermediate result between the horizontal and vertical passes).
void Store1Pixel(uint index, float4 pixel)
{
    // asuint is component-wise: reinterpret all four channels in one call.
    uint4 bits = asuint(pixel);
    gs_cacheR[index] = bits.x;
    gs_cacheG[index] = bits.y;
    gs_cacheB[index] = bits.z;
    gs_cacheA[index] = bits.w;
}
7476
// Read back a full-precision pixel stored by Store1Pixel.
void Load1Pixel(uint index, out float4 pixel)
{
    uint4 bits;
    bits.x = gs_cacheR[index];
    bits.y = gs_cacheG[index];
    bits.z = gs_cacheB[index];
    bits.w = gs_cacheA[index];
    pixel = asfloat(bits);
}
7981
8082// Blur two pixels horizontally. This reduces LDS reads and pixel unpacking.
8183void BlurHorizontally(uint outIndex, uint leftMostIndex)
8284{
83- float3 s0, s1, s2, s3, s4, s5, s6, s7, s8, s9;
85+ float4 s0, s1, s2, s3, s4, s5, s6, s7, s8, s9;
8486 Load2Pixels(leftMostIndex + 0, s0, s1);
8587 Load2Pixels(leftMostIndex + 1, s2, s3);
8688 Load2Pixels(leftMostIndex + 2, s4, s5);
@@ -93,7 +95,7 @@ void BlurHorizontally(uint outIndex, uint leftMostIndex)
9395
9496void BlurVertically(uint2 pixelCoord, uint topMostIndex)
9597{
96- float3 s0, s1, s2, s3, s4, s5, s6, s7, s8;
98+ float4 s0, s1, s2, s3, s4, s5, s6, s7, s8;
9799 Load1Pixel(topMostIndex , s0);
98100 Load1Pixel(topMostIndex + 8, s1);
99101 Load1Pixel(topMostIndex + 16, s2);
@@ -104,10 +106,10 @@ void BlurVertically(uint2 pixelCoord, uint topMostIndex)
104106 Load1Pixel(topMostIndex + 56, s7);
105107 Load1Pixel(topMostIndex + 64, s8);
106108
107- float3 blurred = BlurPixels(s0, s1, s2, s3, s4, s5, s6, s7, s8);
109+ float4 blurred = BlurPixels(s0, s1, s2, s3, s4, s5, s6, s7, s8);
108110
109111 // Write to the final target
110- _Result[pixelCoord] = float4( blurred, 1.0) ;
112+ _Result[pixelCoord] = blurred;
111113}
112114
113115#pragma kernel KMain
@@ -119,10 +121,10 @@ void KMain(uint2 groupId : SV_GroupID, uint2 groupThreadId : SV_GroupThreadID, u
119121
120122 // Downsample the block
121123 float2 offset = float2(threadUL);
122- float3 p00 = _Source.SampleLevel(sampler_LinearClamp, (offset + 0.5) * _Size.zw, 0.0).rgb ;
123- float3 p10 = _Source.SampleLevel(sampler_LinearClamp, (offset + float2(1.0, 0.0) + 0.5) * _Size.zw, 0.0).rgb ;
124- float3 p01 = _Source.SampleLevel(sampler_LinearClamp, (offset + float2(0.0, 1.0) + 0.5) * _Size.zw, 0.0).rgb ;
125- float3 p11 = _Source.SampleLevel(sampler_LinearClamp, (offset + float2(1.0, 1.0) + 0.5) * _Size.zw, 0.0).rgb ;
124+ float4 p00 = _Source.SampleLevel(sampler_LinearClamp, (offset + 0.5) * _Size.zw, 0.0);
125+ float4 p10 = _Source.SampleLevel(sampler_LinearClamp, (offset + float2(1.0, 0.0) + 0.5) * _Size.zw, 0.0);
126+ float4 p01 = _Source.SampleLevel(sampler_LinearClamp, (offset + float2(0.0, 1.0) + 0.5) * _Size.zw, 0.0);
127+ float4 p11 = _Source.SampleLevel(sampler_LinearClamp, (offset + float2(1.0, 1.0) + 0.5) * _Size.zw, 0.0);
126128
127129 // Store the 4 downsampled pixels in LDS
128130 uint destIdx = groupThreadId.x + (groupThreadId.y << 4u);
0 commit comments