unity-shader-ComputeShader

unity-shader-ComputeShader


前篇

GPGPU : 利用GPU做一些非渲染的计算也被称为 GPGPU – General-purpose computing on graphics processing units,图形处理器通用计算


使用限制

  • PC/Console: DX11/OpenGL desktop sm 4.3+

  • 安卓平台: OpenGL ES 3.1+


todo

  • 貌似还有 compute shader 与 普通shader 交互的, 暂时没去尝试

简介

Compute Shader 是运行在 GPU 上、独立于传统渲染流水线的程序。它们可以用来执行大规模并行的图形处理器通用计算(GPGPU),也可以用来加速渲染流程中的某些环节。
常用的计算架构有 DirectCompute、OpenGL Compute、OpenCL 和 CUDA。

Unity的ComputeShader十分接近DirectCompute(微软推出的,随DirectX11一起发布),Unity引入的Compute Shader
支持如下平台:

  • Windows and Windows Store, with a DirectX 11 or DirectX 12 graphics API and Shader Model 5.0 GPU
  • macOS and iOS using Metal graphics API
  • Android, Linux and Windows platforms with Vulkan API
  • Modern OpenGL platforms (OpenGL 4.3 on Linux or Windows; OpenGL ES 3.1 on Android). Note that Mac OS X does not support OpenGL 4.3
  • Modern consoles (Sony PS4 and Microsoft Xbox One)

如果我们在程序的Dispatch接口发送了(5,3,2)这样的结构,就会生成5x3x2个线程组,其中每个组的线程结构由ComputeShader中的numthreads定义,图中numthreads定义了10x8x3的三维结构,由此,我们可以分析4个HLSL关键词的定义。

SV_GroupThreadID 表示该线程在该组内的位置
SV_GroupID 表示整个组所分配的位置
SV_DispatchThreadID 表示该线程在所有组的线程中的位置
SV_GroupIndex 表示该线程在该组内的一维索引(即把三维的 SV_GroupThreadID 按 z*宽*高 + y*宽 + x 展平后的编号)

通过这些关键词,我们可以在并行计算时获取其他线程的输入数据

如果是计算4X4的矩阵加法,可以定义为4X4X1的numthreads结构,这样线程的索引会自动匹配输入的矩阵;同样,我们也可以定义16X1X1的结构,但这样每个线程就必须根据自己的一维线程编号自行换算出对应的矩阵元素(原文: however it would then have to calculate the current matrix entry based on the current thread number),例如 行 = 线程编号 / 4,列 = 线程编号 % 4。

cs_4_x(Shader Model 4.x 的 Compute Shader)允许numthreads最多768条线程

SM5.0 允许numthreads最多1024条线程


效果

主要利用gpu计算位置信息


代码说明

主要参考: https://github.com/chenjd/Unity-Boids-Behavior-on-GPGPU

csharp

GPUFlock.cs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
using System.Collections;
using System.Collections.Generic;
using UnityEngine;

// CPU-side mirror of the compute shader's Boid struct. The fields must match the shader
// struct exactly, because this struct is the carrier for data exchanged between CPU and GPU.
// It holds 12 floats in total (3 x Vector3 + 3 x float), i.e. 48 bytes per element.
public struct GPUBoid {
// Boid position; seeded on the CPU and read back from the GPU every frame.
public Vector3 pos;
// Boid heading; initialised from the spawned object's forward vector and updated on the GPU.
public Vector3 rot;
// Position of the flock object, refreshed by the CPU before each dispatch.
public Vector3 flockPos;
// Per-boid movement speed.
public float speed;
// Distance within which another boid counts as a neighbour.
public float nearbyDis;
// Total number of boids, stored as a float; the shader casts it back to an integer loop bound.
public float boidsCount;
}

/// <summary>
/// Spawns a flock of boid GameObjects and moves them every frame using positions and
/// headings computed on the GPU by the "MyCSMain" kernel of <see cref="cshader"/>.
/// </summary>
public class GPUFlock : MonoBehaviour {

    // Size in bytes of one GPUBoid element: 3 x Vector3 (9 floats) + 3 x float = 12 floats (48 bytes).
    private const int BoidStride = sizeof(float) * 12;

    public ComputeShader cshader;

    public GameObject boidPrefab;
    public int boidsCount;
    public float spawnRadius;
    public float flockSpeed;
    public float nearbyDis;

    private Vector3 targetPos = Vector3.zero;
    private int kernelHandle;

    // Number of thread groups needed so that groups * threads-per-group covers every boid.
    private int threadGroupCount;

    // GPU buffer holding the boid array; created once in Start and released in OnDestroy.
    private ComputeBuffer boidBuffer;

    GameObject[] boidsGo;
    GPUBoid[] boidsData;

    /// <summary>
    /// Validates the inspector settings, looks up the kernel, allocates the GPU buffer and
    /// instantiates one GameObject per boid.
    /// </summary>
    void Start() {
        if (cshader == null || boidPrefab == null || boidsCount <= 0) {
            Debug.LogError("GPUFlock needs a compute shader, a boid prefab and a positive boidsCount.", this);
            enabled = false; // Update would only fail every frame with this configuration.
            return;
        }

        this.boidsGo = new GameObject[this.boidsCount];
        this.boidsData = new GPUBoid[this.boidsCount];
        this.kernelHandle = cshader.FindKernel("MyCSMain"); // look up the kernel handle by name

        // Dispatch takes a number of thread GROUPS, not threads. Ask the kernel how many threads
        // one group contains and round up so every boid gets exactly one thread.
        uint groupSizeX, groupSizeY, groupSizeZ;
        cshader.GetKernelThreadGroupSizes(this.kernelHandle, out groupSizeX, out groupSizeY, out groupSizeZ);
        int threadsPerGroup = Mathf.Max(1, (int)groupSizeX);
        this.threadGroupCount = (this.boidsCount + threadsPerGroup - 1) / threadsPerGroup;

        // Allocate the buffer once instead of creating and releasing it every frame.
        this.boidBuffer = new ComputeBuffer(this.boidsCount, BoidStride);

        for (int i = 0; i < this.boidsCount; i++) {
            this.boidsData[i] = this.CreateBoidData();
            // At this point rot still holds Euler angles, used only to orient the spawned object.
            this.boidsGo[i] = Instantiate(boidPrefab, this.boidsData[i].pos, Quaternion.Euler(this.boidsData[i].rot));
            // From here on rot is a heading (direction vector), which is what the shader works with.
            this.boidsData[i].rot = this.boidsGo[i].transform.forward;
        }
    }

    /// <summary>
    /// Builds the initial data for one boid: a random position inside the spawn sphere, a rotation
    /// blended 30% towards a random rotation, and a speed jittered by +/- 0.5 around flockSpeed.
    /// </summary>
    /// <returns>The initial boid data; <c>rot</c> holds Euler angles until Start replaces it with a heading.</returns>
    GPUBoid CreateBoidData() {
        GPUBoid boidData = new GPUBoid();
        Vector3 pos = transform.position + Random.insideUnitSphere * spawnRadius;
        Quaternion rot = Quaternion.Slerp(transform.rotation, Random.rotation, 0.3f);
        boidData.pos = pos;
        // Store the rotation so Start can spawn the boid with it (it used to be computed and dropped).
        boidData.rot = rot.eulerAngles;
        boidData.flockPos = transform.position;
        boidData.boidsCount = this.boidsCount;
        boidData.nearbyDis = this.nearbyDis;
        boidData.speed = this.flockSpeed + Random.Range(-0.5f, 0.5f);

        return boidData;
    }

    /// <summary>
    /// Moves the flock object along a sine-driven path, runs the compute kernel over all boids and
    /// applies the resulting positions and headings to the boid GameObjects.
    /// </summary>
    void Update() {
        if (this.boidBuffer == null) {
            return; // Start did not complete its setup.
        }

        this.targetPos += new Vector3(2f, 5f, 3f);
        this.transform.localPosition += new Vector3(
            (Mathf.Sin(Mathf.Deg2Rad * this.targetPos.x) * -0.2f),
            (Mathf.Sin(Mathf.Deg2Rad * this.targetPos.y) * 0.2f),
            (Mathf.Sin(Mathf.Deg2Rad * this.targetPos.z) * 0.2f)
        );

        Vector3 flockCenter = this.transform.position;
        for (int i = 0; i < this.boidsData.Length; i++) {
            this.boidsData[i].flockPos = flockCenter;
        }

        this.boidBuffer.SetData(this.boidsData); // upload the boid array to the GPU
        // Re-bind every frame: the shader asset may be shared with other components that bind their own buffer.
        cshader.SetBuffer(this.kernelHandle, "boidBuffer", this.boidBuffer);
        cshader.SetFloat("deltaTime", Time.deltaTime); // plain uniform value
        cshader.Dispatch(this.kernelHandle, this.threadGroupCount, 1, 1); // run the kernel in parallel on the GPU
        this.boidBuffer.GetData(this.boidsData); // copy the results back from GPU to CPU

        for (int i = 0; i < this.boidsData.Length; i++) {
            this.boidsGo[i].transform.localPosition = this.boidsData[i].pos;
            // LookRotation cannot build a rotation from a zero vector, so keep the previous one.
            if (!this.boidsData[i].rot.Equals(Vector3.zero)) {
                this.boidsGo[i].transform.rotation = Quaternion.LookRotation(this.boidsData[i].rot);
            }
        }
    }

    /// <summary>
    /// Releases the GPU buffer when the component is destroyed.
    /// </summary>
    void OnDestroy() {
        if (this.boidBuffer != null) {
            this.boidBuffer.Release();
            this.boidBuffer = null;
        }
    }

}

compute shader

Boid.compute

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
// The C# side looks the kernel up by this name (MyCSMain); the kernel function below must use the same name.
#pragma kernel MyCSMain

// Bundles the data needed to simulate a single boid.
// The C# GPUBoid struct mirrors these fields.
struct Boid
{
// Boid position, advanced by the kernel every dispatch.
float3 pos;
// Boid heading; the kernel moves the boid along this vector.
float3 rot;
// Flock position supplied by the CPU; the kernel uses it as the base of the cohesion target.
float3 flockPos;
// Movement speed multiplier.
float speed;
// Distance within which another boid counts as a neighbour.
float nearbyDis;
// Total number of boids, stored as a float and cast to int for the neighbour loop.
float boidsCount;
};

// Read/write boid array, bound from C# under the name "boidBuffer".
RWStructuredBuffer<Boid> boidBuffer;
// Frame time, set from C# under the name "deltaTime".
float deltaTime;

// A compute shader runs in thread groups, and every group contains many threads.
// [numthreads(x,y,z)] sets the threads per group: the default template uses (8,8,1), i.e.
// 8*8*1 = 64 threads, which could equally be written as (64,1,1).
// This kernel uses a 1D layout of 128 threads per group, one thread per boid.
[numthreads(128,1,1)]
void MyCSMain (uint3 id : SV_DispatchThreadID)
{
    // Threads are launched in whole groups of 128, so some thread ids can lie past the last boid.
    // Every element stores the flock size; element 0 is used because it is in range for any
    // non-empty buffer, whereas boidBuffer[id.x] may not be.
    uint total = uint(boidBuffer[0].boidsCount);
    if (id.x >= total)
    {
        return;
    }

    Boid boid = boidBuffer[id.x];

    // separation: sum of offsets pointing away from each neighbour
    float3 separation = float3(0.0, 0.0, 0.0);

    // alignment: average heading of the neighbours
    float3 alignment = float3(0.0, 0.0, 0.0);

    // cohesion: flock position plus the average neighbour position
    float3 cohesion = boid.flockPos;
    float3 tempCohesion = float3(0.0, 0.0, 0.0);

    uint nearbyCount = 0;

    // NOTE(review): other threads may already have written their updated boid back while this
    // loop reads it, so neighbours can be a mix of old and new state - confirm this is acceptable.
    [loop]
    for (uint i = 0; i < total; i++)
    {
        if (i != id.x)
        {
            Boid tempBoid = boidBuffer[i];
            float3 offset = boid.pos - tempBoid.pos;
            if (length(offset) < boid.nearbyDis)
            {
                separation += offset;

                alignment += tempBoid.rot;

                tempCohesion += tempBoid.pos;

                nearbyCount++;
            }
        }
    }

    if (nearbyCount > 0)
    {
        // Divide in floating point: "1 / nearbyCount" on a uint is integer division and yields 0
        // for every count above 1, which wiped out both averages.
        float invCount = 1.0 / float(nearbyCount);
        alignment *= invCount;
        tempCohesion *= invCount;
    }

    // NOTE(review): this adds two positions (flock position + average neighbour position) rather
    // than blending them - kept as written, confirm this is the intended cohesion target.
    cohesion += tempCohesion;

    float3 direction = alignment + separation;

    // normalize() of a zero vector yields NaN, which would stick in rot and pos forever,
    // so each normalisation is skipped when its input has zero length.
    float3 toCohesion = cohesion - boid.pos;
    if (dot(toCohesion, toCohesion) > 0.0)
    {
        direction += normalize(toCohesion);
    }

    if (dot(direction, direction) > 0.0)
    {
        boid.rot = lerp(boid.rot, normalize(direction), deltaTime * 4);
    }

    boid.pos += boid.rot * boid.speed * deltaTime;
    boidBuffer[id.x] = boid;
}