unity-shader-ComputeShader

分类于 Unity3d-Shader

unity-shader-ComputeShader

前篇

官网文档 - https://docs.unity3d.com/Manual/class-ComputeShader.html
初探Compute Shader - https://blog.csdn.net/notmz/article/details/75547759
Unity中如何实现类似C# Job系统的效果 - https://gameinstitute.qq.com/community/detail/117312
- 里面有个demo仓库 - https://github.com/chenjd/Unity-Boids-Behavior-on-GPGPU
github 相关的仓库 - https://github.com/search?q=unity+compute+shader
- 这个不错, star挺高的 - https://github.com/keijiro/Swarm

GPGPU : 利用GPU做一些非渲染的计算也被称为 GPGPU – General-purpose computing on graphics processing units，图形处理器通用计算

使用限制

PC/Console: DX11（Shader Model 5.0）/ 桌面版 OpenGL 4.3+
安卓平台: OpenGL ES 3.1+

todo

貌似还有 compute shader 与普通shader 交互的, 暂时没去尝试

简介

ComputeShaders是运行在GPU的，不同于传统的渲染流水线。它们能够用来进行庞大的并行图形处理器通用计算，或者是渲染。
常用的计算架构有 DirectCompute、OpenGL Compute、OpenCL 和 CUDA。

Unity的ComputeShader十分接近DirectCompute（微软推出的，随DirectX11一起发布）,Unity引入的Compute Shader
支持如下平台：

Windows and Windows Store, with a DirectX 11 or DirectX 12 graphics API and Shader Model 5.0 GPU
macOS and iOS using Metal graphics API
Android, Linux and Windows platforms with Vulkan API
Modern OpenGL platforms (OpenGL 4.3 on Linux or Windows; OpenGL ES 3.1 on Android). Note that Mac OS X does not support OpenGL 4.3
Modern consoles (Sony PS4 and Microsoft Xbox One)

如果我们在程序的Dispatch接口发送了（5，3，2）这样的结构，就会生成5x3x2个线程组，其中每个组的线程结构由ComputeShader中的numthreads定义，图中numthreads定义了10x8x3的三维结构，由此，我们可以分析4个HLSL关键词的定义。

SV_GroupThreadID 表示该线程在该组内的位置
SV_GroupID 表示整个组所分配的位置
SV_DispatchThreadID 表示该线程在所有组的线程中的位置
SV_GroupIndex 表示该线程在该组内的索引

通过这些关键词，我们可以在并行计算时获取其他线程的输入数据

如果是计算4X4的矩阵加法，可以定义为4X4X1的numthreads结构，这样线程的索引会自动匹配输入的矩阵；同样，我们也可以定义16X1X1的结构，但这样就需要根据当前线程编号自行换算出对应的矩阵元素位置（原文是 however it would then have to calculate the current matrix entry based on the current thread number. 例如一维索引为 index 时，row = index / 4，col = index % 4）

SM4.x（cs_4_x）允许numthreads最多768条线程

SM5.0 允许numthreads最多1024条线程

效果

主要利用gpu计算位置信息

代码说明

主要参考: https://github.com/chenjd/Unity-Boids-Behavior-on-GPGPU

csharp

GPUFlock.cs

using System.Collections;
using System.Collections.Generic;
using UnityEngine;

// Per-boid payload exchanged between the CPU and the GPU.
// The fields (order and types) must match the Boid struct in the compute shader exactly.
// Size: 3 x Vector3 + 3 x float = 12 floats = 48 bytes per element.
public struct GPUBoid {
    public Vector3 pos;        // boid position
    public Vector3 rot;        // Euler angles while spawning, afterwards the boid's forward direction
    public Vector3 flockPos;   // position of the flock object the boids are attracted to
    public float speed;        // movement speed of this boid
    public float nearbyDis;    // radius within which other boids count as neighbours
    public float boidsCount;   // total number of boids, stored as a float
}

/// <summary>
/// Spawns a flock of boids and moves them with the "MyCSMain" kernel of a compute shader.
/// Every frame the boid data is uploaded to the GPU, the kernel is dispatched, and the
/// results are read back and applied to the spawned GameObjects.
/// </summary>
public class GPUFlock : MonoBehaviour {

    // Bytes per GPUBoid element: 3 Vector3 (9 floats) + 3 floats = 12 floats.
    private const int BoidStride = 12 * sizeof(float);

    public ComputeShader cshader;

    public GameObject boidPrefab;
    public int boidsCount;
    public float spawnRadius;
    public float flockSpeed;
    public float nearbyDis;

    private Vector3 targetPos = Vector3.zero;
    private int kernelHandle;
    // Number of thread groups needed so that groups * threadsPerGroup >= boid count.
    private int threadGroupCount;
    // Created once in Start and released in OnDestroy instead of being reallocated every frame.
    private ComputeBuffer boidBuffer;

    GameObject[] boidsGo;
    GPUBoid[] boidsData;

    /// <summary>
    /// Creates the boid data and GameObjects, looks up the kernel and allocates the GPU buffer.
    /// </summary>
    void Start() {
        this.boidsGo = new GameObject[this.boidsCount];
        this.boidsData = new GPUBoid[this.boidsCount];
        this.kernelHandle = this.cshader.FindKernel("MyCSMain");

        for (int i = 0; i < this.boidsData.Length; i++) {
            this.boidsData[i] = this.CreateBoidData();
            // rot holds Euler angles at this point; it is only used to orient the new instance.
            this.boidsGo[i] = Instantiate(this.boidPrefab, this.boidsData[i].pos, Quaternion.Euler(this.boidsData[i].rot)) as GameObject;
            // From here on rot is the forward direction vector consumed by the kernel.
            this.boidsData[i].rot = this.boidsGo[i].transform.forward;
        }

        // A ComputeBuffer cannot be created with zero elements, so an empty flock skips all GPU work.
        if (this.boidsData.Length == 0) {
            return;
        }

        // Ask the shader for its numthreads X size so the dispatch covers each boid exactly once
        // (rounded up to whole groups) instead of launching one whole group per boid.
        uint groupSizeX, groupSizeY, groupSizeZ;
        this.cshader.GetKernelThreadGroupSizes(this.kernelHandle, out groupSizeX, out groupSizeY, out groupSizeZ);
        int threadsPerGroup = (int)groupSizeX;
        this.threadGroupCount = (this.boidsData.Length + threadsPerGroup - 1) / threadsPerGroup;

        this.boidBuffer = new ComputeBuffer(this.boidsData.Length, BoidStride);
    }

    /// <summary>
    /// Builds the initial data of one boid: a random position inside the spawn sphere,
    /// a randomised orientation (as Euler angles) and a slightly randomised speed.
    /// </summary>
    /// <returns>The initialised boid data.</returns>
    GPUBoid CreateBoidData() {
        GPUBoid boidData = new GPUBoid();
        Vector3 pos = transform.position + Random.insideUnitSphere * spawnRadius;
        Quaternion rot = Quaternion.Slerp(transform.rotation, Random.rotation, 0.3f);
        boidData.pos = pos;
        // Store the computed orientation; Start converts it back with Quaternion.Euler.
        boidData.rot = rot.eulerAngles;
        boidData.flockPos = transform.position;
        boidData.boidsCount = this.boidsCount;
        boidData.nearbyDis = this.nearbyDis;
        boidData.speed = this.flockSpeed + Random.Range(-0.5f, 0.5f);

        return boidData;
    }

    /// <summary>
    /// Moves the flock target, runs the compute shader for all boids and applies the results.
    /// </summary>
    void Update() {

        // Wander the flock object along a sine-driven path.
        this.targetPos += new Vector3(2f, 5f, 3f);
        this.transform.localPosition += new Vector3(
            (Mathf.Sin(Mathf.Deg2Rad * this.targetPos.x) * -0.2f),
            (Mathf.Sin(Mathf.Deg2Rad * this.targetPos.y) * 0.2f),
            (Mathf.Sin(Mathf.Deg2Rad * this.targetPos.z) * 0.2f)
        );

        // No buffer means there are no boids to simulate.
        if (this.boidBuffer == null) {
            return;
        }

        for (int i = 0; i < this.boidsData.Length; i++) {
            this.boidsData[i].flockPos = this.transform.position;
        }

        this.boidBuffer.SetData(this.boidsData); // upload the boid array to the GPU
        this.cshader.SetBuffer(this.kernelHandle, "boidBuffer", this.boidBuffer); // bind the buffer to the kernel
        this.cshader.SetFloat("deltaTime", Time.deltaTime); // upload a plain uniform value
        this.cshader.Dispatch(this.kernelHandle, this.threadGroupCount, 1, 1); // run the kernel in parallel on the GPU
        this.boidBuffer.GetData(this.boidsData); // read the results back from the GPU to the CPU

        for (int i = 0; i < this.boidsData.Length; i++) {
            this.boidsGo[i].transform.localPosition = this.boidsData[i].pos;
            // LookRotation is undefined for a zero direction, so keep the previous rotation then.
            if (!this.boidsData[i].rot.Equals(Vector3.zero)) {
                this.boidsGo[i].transform.rotation = Quaternion.LookRotation(this.boidsData[i].rot);
            }
        }
    }

    /// <summary>
    /// Releases the GPU buffer when the component is destroyed.
    /// </summary>
    void OnDestroy() {
        if (this.boidBuffer != null) {
            this.boidBuffer.Release();
            this.boidBuffer = null;
        }
    }

}

compute shader

Boid.compute

// MyCSMain is the kernel name the C# side looks up with FindKernel;
// the function defined below must use exactly the same name.
#pragma kernel MyCSMain

// Data needed to simulate a single boid.
// The field order and types must match the C# GPUBoid struct that fills the buffer.
struct Boid
{
	float3 pos;        // boid position
	float3 rot;        // boid direction vector
	float3 flockPos;   // position of the flock object
	float speed;       // movement speed
	float nearbyDis;   // neighbour radius
	float boidsCount;  // total number of boids, stored as a float
};

// Read/write buffer with one Boid per element; bound from C# under the name "boidBuffer".
RWStructuredBuffer<Boid> boidBuffer;
// Frame delta time; set from C# under the name "deltaTime".
float deltaTime;

// A dispatch launches thread groups, and every group contains the threads declared by numthreads.
// The default template uses [numthreads(8,8,1)], i.e. 8*8*1 = 64 threads per group;
// [numthreads(64,1,1)] would give the same thread count.
// The boids live in a 1D buffer, so a 1D group of 128 threads is used here.
//
// Each thread updates the boid at index id.x: it accumulates separation, alignment and
// cohesion over all neighbours within nearbyDis, then steers and moves the boid.
[numthreads(128,1,1)]
void MyCSMain (uint3 id : SV_DispatchThreadID)
{
	// The dispatch is rounded up to whole groups, so trailing threads may have no boid.
	// Every element carries the same boidsCount, so element 0 tells us the total.
	if (id.x >= uint(boidBuffer[0].boidsCount))
	{
		return;
	}

	Boid boid = boidBuffer[id.x];

	//separation: sum of offsets pointing away from each neighbour
	float3 separation = float3(0.0, 0.0, 0.0);

	//alignment: average direction of the neighbours
	float3 alignment = float3(0.0, 0.0, 0.0);

	//cohesion: flock position plus the average neighbour position
	float3 cohesion = boid.flockPos;
	float3 tempCohesion = float3(0.0, 0.0, 0.0);

	uint nearbyCount = 0;

	[loop]
	for (int i = 0; i < int(boid.boidsCount); i++)
	{
		if (i != int(id.x))
		{
			Boid tempBoid = boidBuffer[i];
			if (length(boid.pos - tempBoid.pos) < boid.nearbyDis)
			{
				separation += boid.pos - tempBoid.pos;

				alignment += tempBoid.rot;

				tempCohesion += tempBoid.pos;

				nearbyCount++;
			}
		}
	}

	if (nearbyCount > 0)
	{
		// Divide in floating point: "1 / nearbyCount" on a uint is integer division and
		// evaluates to 0 for two or more neighbours, wiping out both averages.
		float invNearbyCount = 1.0 / float(nearbyCount);
		alignment *= invNearbyCount;
		tempCohesion *= invNearbyCount;
	}

	cohesion += tempCohesion;
	float3 direction = alignment + separation + normalize(cohesion - boid.pos);
	// Turn gradually towards the desired direction, then advance along the new heading.
	boid.rot = lerp(boid.rot, normalize(direction), deltaTime * 4);
	boid.pos += boid.rot * boid.speed * deltaTime;
	boidBuffer[id.x] = boid;
}