Featured image of post 2025spring 【Catlike Coding】Compute Shader

2025spring 【Catlike Coding】Compute Shader

【Catlike Coding】Compute Shader

【Catlike Coding】Compute Shader

学了Catlike Coding的basic部分, compute shader部分有点收获,写个总结.

alt text

GPUGraph.cs

先用 Shader.PropertyToID 把属性名转换成整数 ID 并缓存起来,再通过 SetInt / SetFloat / SetBuffer 把数据从 C# 脚本传递到 shader 中 (CPU To GPU).

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
using UnityEngine;
/// <summary>
/// Draws a 3D graph on the GPU: a compute shader fills a buffer with one
/// position per point, and the points are then rendered as procedurally
/// instanced copies of a simple mesh that read their position from that buffer.
/// </summary>
public class GPUGraph : MonoBehaviour
{
    // Must match the [numthreads(16, 16, 1)] declaration of the kernel.
    const int ThreadGroupSize = 16;
    const string KernelName = "CalculatePositions";

    // [SerializeField] Transform pointPrefab; // Unused in procedural rendering but kept for reference
    [SerializeField] int resolution = 200;
    [SerializeField, Range(0, 2)] private int functionIndex = 0;
    [SerializeField] ComputeShader positionComputeShader;
    [SerializeField] Material proceduralMaterial;
    [SerializeField] Mesh quadMesh;

    ComputeBuffer positionBuffer;
    // Points per side the current buffer was allocated for.
    int bufferResolution;
    // Kernel index, looked up once instead of every frame.
    int kernelIndex;

    static readonly int
        PositionsID = Shader.PropertyToID("_Positions"),
        ResolutionID = Shader.PropertyToID("_Resolution"),
        TimeID = Shader.PropertyToID("_Time"),
        FunctionIndexID = Shader.PropertyToID("_FunctionIndex");

    // OnEnable/OnDisable (rather than Awake/OnDestroy) so the GPU buffer is
    // recreated whenever the component is re-enabled.
    private void OnEnable()
    {
        kernelIndex = positionComputeShader.FindKernel(KernelName);
        EnsureBuffer();
    }

    void Update()
    {
        // The resolution may have been edited in the inspector since the
        // buffer was allocated; keep the buffer size in sync before using it.
        EnsureBuffer();
        UpdatePositions();
        ProcedureDraw();
    }

    /// <summary>
    /// Allocates the position buffer, or reallocates it when the requested
    /// resolution no longer matches the size it was created with.
    /// </summary>
    void EnsureBuffer()
    {
        // A ComputeBuffer needs a positive element count.
        int wanted = Mathf.Max(1, resolution);
        if (positionBuffer != null && bufferResolution == wanted)
        {
            return;
        }

        ReleaseBuffer();
        bufferResolution = wanted;
        // resolution * resolution elements, each a float3 (3 floats).
        positionBuffer = new ComputeBuffer(wanted * wanted, 3 * sizeof(float));
    }

    /// <summary>Releases the position buffer if one is allocated.</summary>
    void ReleaseBuffer()
    {
        if (positionBuffer != null)
        {
            positionBuffer.Release();
            positionBuffer = null;
        }
    }

    /// <summary>
    /// Uploads the per-frame parameters and dispatches the compute kernel that
    /// writes one position per graph point into the buffer.
    /// </summary>
    void UpdatePositions()
    {
        // Round up so that every point is covered by a thread; the kernel
        // discards the surplus threads itself.
        int threadGroups = Mathf.CeilToInt(bufferResolution / (float)ThreadGroupSize);

        positionComputeShader.SetInt(ResolutionID, bufferResolution);
        positionComputeShader.SetFloat(TimeID, Time.time);
        positionComputeShader.SetInt(FunctionIndexID, functionIndex);

        positionComputeShader.SetBuffer(kernelIndex, PositionsID, positionBuffer);
        positionComputeShader.Dispatch(kernelIndex, threadGroups, threadGroups, 1);

        // Debug aid: read the buffer back to the CPU (slow, stalls the GPU).
        // Vector3[] positions = new Vector3[bufferResolution * bufferResolution];
        // positionBuffer.GetData(positions);
        // Debug.Log("Position[0]: " + positions[0]);
    }

    /// <summary>
    /// Issues one procedural instanced draw with an instance per graph point.
    /// </summary>
    void ProcedureDraw()
    {
        // Bind the buffer through the cached property ID, consistent with the
        // compute shader bindings above.
        proceduralMaterial.SetBuffer(PositionsID, positionBuffer);

        Graphics.DrawMeshInstancedProcedural(
            quadMesh,
            0,
            proceduralMaterial,
            new Bounds(Vector3.zero, Vector3.one * 10f),
            bufferResolution * bufferResolution
        );
    }

    private void OnDisable()
    {
        // Release the GPU buffer explicitly; it is not garbage collected.
        ReleaseBuffer();
    }
}

GraphComputeShader.compute

compute shader 在 GPU 上计算好每个点的坐标,并写入缓冲区 (_Positions),供后面的绘制 shader 读取.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
// Declares CalculatePositions as a compute kernel of this file.
#pragma kernel CalculatePositions

// Output buffer: one float3 position per graph point, written by the kernel.
RWStructuredBuffer<float3> _Positions;
// Number of points along each side of the square grid.
uint _Resolution;
// Time value forwarded to the surface functions for animation.
float _Time;
uint _FunctionIndex; // Selects the surface function (0, 1 or 2); the name has a single leading underscore.

#define PI 3.14159265358979323846

// Function 1: the original graph function. Combines two time-animated radii
// and sweeps them by the angles PI * u and PI * v.
float3 Function1(float u, float v, float time)
{
    float outerRadius = (7.0 + sin(PI * (6.0 * u + time / 2.0))) / 10.0;
    float innerRadius = (3.0 + sin(PI * (4.0 * v + 8.0 * u + 2.0 * time))) / 20.0;
    float ringDistance = outerRadius + innerRadius * cos(PI * v);

    return float3(
        ringDistance * sin(PI * u),
        innerRadius * sin(PI * v),
        ringDistance * cos(PI * u)
    );
}

// Function 2: example function (flat plane). Maps (u, v) straight onto the
// XZ plane at height zero; the time argument is not used.
float3 Function2(float u, float v, float time)
{
    return float3(u, 0.0, v);
}

// Function 3: example function (wave surface). Same XZ layout as the plane,
// with the height following a sine wave along u that shifts with time.
float3 Function3(float u, float v, float time)
{
    float height = sin(u * 5.0 + time) * 0.5;
    return float3(u, height, v);
}

// Kernel: each thread evaluates the selected surface function for one grid
// point (id.x, id.y) and stores the result in _Positions.
// Thread group layout: 16 x 16 x 1.
[numthreads(16, 16, 1)]
void CalculatePositions (uint3 id : SV_DispatchThreadID)
{
    // Bounds guard: threads outside the grid must not write to the buffer.
    if (id.x >= _Resolution || id.y >= _Resolution) return;

    // Spread the grid evenly over [-4, 4] on both axes. A single-point grid
    // has no spacing, so avoid dividing by zero (which would yield NaN).
    float spacing = _Resolution > 1 ? 8.0 / (_Resolution - 1) : 0.0;
    float u = -4.0 + spacing * id.x;
    float v = -4.0 + spacing * id.y;

    // Pick the surface function according to _FunctionIndex.
    float3 position;
    switch (_FunctionIndex)
    {
    case 0:
        position = Function1(u, v, _Time);
        break;
    case 1:
        position = Function2(u, v, _Time);
        break;
    case 2:
        position = Function3(u, v, _Time);
        break;
    default:
        position = float3(0, 0, 0); // Fallback so the value is never left uninitialized.
        break;
    }

    // Flatten the 2D coordinate into a row-major buffer index and store the result.
    uint index = id.y * _Resolution + id.x;
    _Positions[index] = position;
}

Catlike shader 是surface shader,在我2022 URP Unity中不能使用. 所以我让AI改为了 URP shader.
通过

1
2
3
#if defined(UNITY_PROCEDURAL_INSTANCING_ENABLED)
// Get the position for this instance
float3 position = _Positions[unity_InstanceID];

来获取缓冲区内的坐标,然后用这个坐标和缩放值重写物体到世界的变换矩阵 (UNITY_MATRIX_M, 即 unity_ObjectToWorld),从而把每个实例放到对应的位置. 挺巧妙的.

ProcedureDraw.shader

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
// Unlit URP shader for procedurally instanced points: each instance reads its
// position from the _Positions buffer and is placed by overriding the
// object-to-world matrix in ConfigureProcedural.
Shader "Custom/ProceduralSurfaceURP" {
    Properties {
        // Uniform scale applied to every instance.
        _Scale ("Instance Scale", Range(0.001, 0.05)) = 0.005
    }

    SubShader {
        Tags { "RenderType"="Opaque" "RenderPipeline"="UniversalPipeline" }
        
        Pass {
            HLSLPROGRAM
            #include "Packages/com.unity.render-pipelines.universal/ShaderLibrary/Core.hlsl"
            #pragma vertex vert
            #pragma fragment frag
            #pragma multi_compile_instancing
            // Registers ConfigureProcedural as the per-instance setup function.
            #pragma instancing_options procedural:ConfigureProcedural
            #pragma target 4.5

            #if defined(UNITY_PROCEDURAL_INSTANCING_ENABLED)
                StructuredBuffer<float3> _Positions; // Buffer to receive positions from C#
            #endif

            float _Scale;

            // Per-vertex input: object-space position plus the instance ID.
            struct Attributes {
                float4 positionOS : POSITION;
                UNITY_VERTEX_INPUT_INSTANCE_ID
            };

            // Vertex-to-fragment data: clip-space position and world position.
            struct Varyings {
                float4 positionCS : SV_POSITION;
                float3 worldPos : TEXCOORD0;
            };

            // Per-instance setup: builds a scale + translation matrix from the
            // instance's buffered position and installs it as the model matrix.
            // NOTE(review): presumably invoked through UNITY_SETUP_INSTANCE_ID - confirm.
            void ConfigureProcedural() {
                #if defined(UNITY_PROCEDURAL_INSTANCING_ENABLED)
                    // Get the position for this instance
                    float3 position = _Positions[unity_InstanceID];
                    
                    // Build the transformation matrix: scale and translate
                    // (uniform _Scale on the diagonal, translation in the last column)
                    float4x4 mat = float4x4(
                        _Scale, 0,     0,     position.x,
                        0,     _Scale, 0,     position.y,
                        0,     0,     _Scale, position.z,
                        0,     0,     0,     1
                    );
                    
                    // Apply the transformation to UNITY_MATRIX_M (object to world matrix)
                    UNITY_MATRIX_M = mat;
                #endif
            }

            // Vertex stage: sets up the instance, then outputs the clip-space
            // and world-space positions of the vertex.
            Varyings vert(Attributes input) {
                Varyings output;
                UNITY_SETUP_INSTANCE_ID(input);
                output.positionCS = TransformObjectToHClip(input.positionOS.xyz);
                output.worldPos = TransformObjectToWorld(input.positionOS.xyz);
                return output;
            }
            
            
            // Fragment stage: colors the point from its world position, mapping
            // sin(worldPos) from [-1, 1] into [0, 1] per channel.
            half4 frag(Varyings input) : SV_Target {
                half3 color = 0.5 + 0.5 * sin(input.worldPos);

                return half4(color, 1);
            }
            ENDHLSL
        }
    }
}

总结

这一套方案让CPU运算量大大降低. 戴森球的优化应该也采用类似的方法, 把各种CPU的计算转移到GPU中进行. 大概效率提升了一个数量级.

戴森球作者发的专栏 https://www.zhihu.com/question/442555442/answer/1711890146?utm_psn=1883875216381432992

使用 Hugo 构建
主题 StackJimmy 设计