unity-shader-ComputeShader
前篇
GPGPU : 利用GPU做一些非渲染的计算也被称为 GPGPU – General-purpose computing on graphics processing units,图形处理器通用计算
使用限制
todo
貌似还有 compute shader 与 普通shader 交互的, 暂时没去尝试
简介 ComputeShaders是运行在GPU的,不同于传统的渲染流水线。它们能够用来进行庞大的并行图形处理器通用计算,或者是渲染。 常用的计算架构有 DirectCompute、OpenGL Compute、OpenCL 和 CUDA。
Unity的ComputeShader十分接近DirectCompute(微软推出的,随DirectX11一起发布),Unity引入的Compute Shader 支持如下平台:
Windows and Windows Store, with a DirectX 11 or DirectX 12 graphics API and Shader Model 5.0 GPU
macOS and iOS using Metal graphics API
Android, Linux and Windows platforms with Vulkan API
Modern OpenGL platforms (OpenGL 4.3 on Linux or Windows; OpenGL ES 3.1 on Android). Note that Mac OS X does not support OpenGL 4.3
Modern consoles (Sony PS4 and Microsoft Xbox One)
如果我们在程序的Dispatch接口发送了(5,3,2)这样的结构,就会生成5x3x2个线程组,其中每个组的线程结构由ComputeShader中的numthreads定义,图中numthreads定义了10x8x3的三维结构,由此,我们可以分析4个HLSL关键词的定义。
SV_GroupThreadID 表示该线程在该组内的位置;SV_GroupID 表示整个组所分配的位置;SV_DispatchThreadID 表示该线程在所有组的线程中的位置;SV_GroupIndex 表示该线程在该组内的索引(展平后的一维索引)。
通过这些关键词,我们可以在并行计算时获取其他线程的输入数据
如果是计算4X4的矩阵加法,可以定义为4X4X1的numthreads结构,这样线程的索引会自动匹配输入的矩阵;同样,我们也可以定义16X1X1的结构,但这样就必须根据当前线程编号自行换算出对应的矩阵元素,例如 row = id / 4、col = id % 4(原文是 however it would then have to calculate the current matrix entry based on the current thread number.)
SM4.x(cs_4_x)允许numthreads最多768条线程
SM5.0 允许numthreads最多1024条线程
效果 主要利用gpu计算位置信息
代码说明 主要参考: https://github.com/chenjd/Unity-Boids-Behavior-on-GPGPU
csharp GPUFlock.cs
using System.Collections;
using System.Collections.Generic;
using UnityEngine;

/// <summary>
/// Per-boid state exchanged with the compute shader.
/// The layout is 12 floats (48 bytes) and must stay in sync with the
/// <c>Boid</c> struct declared in Boid.compute.
/// </summary>
public struct GPUBoid
{
    public Vector3 pos;
    public Vector3 rot;
    public Vector3 flockPos;
    public float speed;
    public float nearbyDis;
    public float boidsCount;
}

/// <summary>
/// Spawns <see cref="boidsCount"/> boid GameObjects and moves them every frame
/// using positions/headings computed by the "MyCSMain" compute kernel.
/// </summary>
public class GPUFlock : MonoBehaviour
{
    // Size in bytes of one GPUBoid element: 3 x Vector3 + 3 x float = 12 floats.
    private const int BoidStride = sizeof(float) * 12;

    public ComputeShader cshader;

    public GameObject boidPrefab;
    public int boidsCount;
    public float spawnRadius;
    public float flockSpeed;
    public float nearbyDis;

    private Vector3 targetPos = Vector3.zero;
    private int kernelHandle;

    // Number of thread groups needed to cover every boid exactly once.
    private int threadGroupCount;

    // Created once in Start and released in OnDestroy instead of being
    // re-allocated on every frame.
    private ComputeBuffer boidBuffer;

    GameObject[] boidsGo;
    GPUBoid[] boidsData;

    /// <summary>
    /// Creates the boid data and GameObjects, then allocates the GPU buffer.
    /// </summary>
    void Start()
    {
        // A negative inspector value would make the array allocations throw.
        int count = Mathf.Max(this.boidsCount, 0);

        this.boidsGo = new GameObject[count];
        this.boidsData = new GPUBoid[count];
        this.kernelHandle = cshader.FindKernel("MyCSMain");

        for (int i = 0; i < count; i++)
        {
            this.boidsData[i] = this.CreateBoidData();
            // At this point rot holds Euler angles (see CreateBoidData).
            this.boidsGo[i] = Instantiate(
                boidPrefab,
                this.boidsData[i].pos,
                Quaternion.Euler(this.boidsData[i].rot)) as GameObject;
            // From here on rot is a heading (direction) vector, which is what
            // the compute kernel integrates the position along.
            this.boidsData[i].rot = this.boidsGo[i].transform.forward;
        }

        if (count == 0)
        {
            // ComputeBuffer does not accept a zero element count; Update
            // skips the GPU work while boidBuffer is null.
            return;
        }

        // Ask the kernel for its numthreads X size so that the dispatch covers
        // `count` threads rather than `count` whole groups.
        uint groupSizeX;
        uint groupSizeY;
        uint groupSizeZ;
        cshader.GetKernelThreadGroupSizes(
            this.kernelHandle, out groupSizeX, out groupSizeY, out groupSizeZ);
        int groupSize = Mathf.Max((int)groupSizeX, 1);
        this.threadGroupCount = (count + groupSize - 1) / groupSize;

        this.boidBuffer = new ComputeBuffer(count, BoidStride);
    }

    /// <summary>
    /// Builds the initial state for one boid: a random position inside the
    /// spawn sphere, a rotation slightly perturbed from this transform's
    /// rotation, and a speed jittered around <see cref="flockSpeed"/>.
    /// </summary>
    /// <returns>The initialised boid; <c>rot</c> contains Euler angles.</returns>
    GPUBoid CreateBoidData()
    {
        GPUBoid boidData = new GPUBoid();
        Vector3 pos = transform.position + Random.insideUnitSphere * spawnRadius;
        Quaternion rot = Quaternion.Slerp(transform.rotation, Random.rotation, 0.3f);
        boidData.pos = pos;
        // Previously the computed rotation was discarded, so every boid was
        // instantiated with an identity rotation. Start converts these Euler
        // angles back with Quaternion.Euler.
        boidData.rot = rot.eulerAngles;
        boidData.flockPos = transform.position;
        boidData.boidsCount = this.boidsCount;
        boidData.nearbyDis = this.nearbyDis;
        boidData.speed = this.flockSpeed + Random.Range(-0.5f, 0.5f);

        return boidData;
    }

    /// <summary>
    /// Moves the flock target, runs the compute kernel over all boids and
    /// applies the resulting positions and headings to the GameObjects.
    /// </summary>
    void Update()
    {
        this.targetPos += new Vector3(2f, 5f, 3f);
        this.transform.localPosition += new Vector3(
            (Mathf.Sin(Mathf.Deg2Rad * this.targetPos.x) * -0.2f),
            (Mathf.Sin(Mathf.Deg2Rad * this.targetPos.y) * 0.2f),
            (Mathf.Sin(Mathf.Deg2Rad * this.targetPos.z) * 0.2f)
        );

        if (this.boidBuffer == null)
        {
            return;
        }

        Vector3 flockPos = this.transform.position;
        for (int i = 0; i < this.boidsData.Length; i++)
        {
            this.boidsData[i].flockPos = flockPos;
        }

        this.boidBuffer.SetData(this.boidsData);

        cshader.SetBuffer(this.kernelHandle, "boidBuffer", this.boidBuffer);
        cshader.SetFloat("deltaTime", Time.deltaTime);

        cshader.Dispatch(this.kernelHandle, this.threadGroupCount, 1, 1);

        // Synchronous read-back of the results computed by the kernel.
        this.boidBuffer.GetData(this.boidsData);

        for (int i = 0; i < this.boidsData.Length; i++)
        {
            this.boidsGo[i].transform.localPosition = this.boidsData[i].pos;

            // LookRotation is skipped for a zero heading vector.
            if (!this.boidsData[i].rot.Equals(Vector3.zero))
            {
                this.boidsGo[i].transform.rotation = Quaternion.LookRotation(this.boidsData[i].rot);
            }
        }
    }

    /// <summary>
    /// Releases the GPU buffer owned by this component.
    /// </summary>
    void OnDestroy()
    {
        if (this.boidBuffer != null)
        {
            this.boidBuffer.Release();
            this.boidBuffer = null;
        }
    }
}
compute shader Boid.compute
#pragma kernel MyCSMain

// One flock member. 12 floats; the host-side struct bound to boidBuffer must
// use the same field order and types.
struct Boid
{
    float3 pos;
    float3 rot;
    float3 flockPos;
    float speed;
    float nearbyDis;
    float boidsCount;
};

RWStructuredBuffer<Boid> boidBuffer;
float deltaTime;

// Updates the heading (rot) and position (pos) of boid id.x from the boids
// that lie within its nearbyDis radius plus a pull towards flockPos.
[numthreads(128, 1, 1)]
void MyCSMain(uint3 id : SV_DispatchThreadID)
{
    // The dispatch is rounded up to whole groups of 128 threads, so threads
    // beyond the last boid must not touch the buffer. The count is taken from
    // element 0 -- NOTE(review): this assumes the host writes the same
    // boidsCount into every element; confirm against the C# side.
    uint boidTotal = uint(boidBuffer[0].boidsCount);
    if (id.x >= boidTotal)
    {
        return;
    }

    Boid boid = boidBuffer[id.x];

    float3 separation = float3(0.0, 0.0, 0.0);
    float3 alignment = float3(0.0, 0.0, 0.0);
    float3 cohesion = boid.flockPos;
    float3 tempCohesion = float3(0.0, 0.0, 0.0);

    uint nearbyCount = 0;

    // NOTE(review): neighbours are read from the same buffer that other
    // threads write their results to, with no barrier or second buffer, so a
    // neighbour's state may be from this frame or the previous one.
    [loop]
    for (int i = 0; i < int(boid.boidsCount); i++)
    {
        if (i != int(id.x))
        {
            Boid tempBoid = boidBuffer[i];
            if (length(boid.pos - tempBoid.pos) < boid.nearbyDis)
            {
                separation += boid.pos - tempBoid.pos;
                alignment += tempBoid.rot;
                tempCohesion += tempBoid.pos;
                nearbyCount++;
            }
        }
    }

    if (nearbyCount > 0)
    {
        // Float division: `1 / nearbyCount` is an integer division that
        // evaluates to 0 for every count above 1, zeroing both sums instead
        // of averaging them.
        float invNearby = 1.0 / float(nearbyCount);
        alignment *= invNearby;
        tempCohesion *= invNearby;
    }

    cohesion += tempCohesion;

    float3 direction = alignment + separation + normalize(cohesion - boid.pos);

    // Turn gradually towards the desired direction, then advance along the
    // new heading.
    boid.rot = lerp(boid.rot, normalize(direction), deltaTime * 4);
    boid.pos += boid.rot * boid.speed * deltaTime;

    boidBuffer[id.x] = boid;
}