363 lines
14 KiB
C++
363 lines
14 KiB
C++
/*
 * Copyright 2022 Google LLC
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */
|
|
|
|
#include "tests/Test.h"
|
|
|
|
#include "include/gpu/graphite/Context.h"
|
|
#include "include/gpu/graphite/Recorder.h"
|
|
#include "include/gpu/graphite/Recording.h"
|
|
#include "src/gpu/graphite/Buffer.h"
|
|
#include "src/gpu/graphite/Caps.h"
|
|
#include "src/gpu/graphite/ComputePassTask.h"
|
|
#include "src/gpu/graphite/ComputePipelineDesc.h"
|
|
#include "src/gpu/graphite/ComputeTypes.h"
|
|
#include "src/gpu/graphite/RecorderPriv.h"
|
|
#include "src/gpu/graphite/ResourceProvider.h"
|
|
#include "src/gpu/graphite/SynchronizeToCpuTask.h"
|
|
#include "src/gpu/graphite/compute/ComputeStep.h"
|
|
|
|
using namespace skgpu::graphite;
|
|
|
|
// TODO(b/262427430, b/262429132): Enable this test on other backends once they all support
|
|
// compute programs.
|
|
DEF_GRAPHITE_TEST_FOR_METAL_CONTEXT(ComputeTaskTest, reporter, context) {
|
|
constexpr uint32_t kProblemSize = 512;
|
|
constexpr float kFactor = 4.f;
|
|
|
|
std::unique_ptr<Recorder> recorder = context->makeRecorder();
|
|
|
|
class TestComputeStep : public ComputeStep {
|
|
public:
|
|
TestComputeStep() : ComputeStep("TestArrayMultiply", {}, {}) {}
|
|
~TestComputeStep() override = default;
|
|
|
|
// A kernel that multiplies a large array of floats by a supplied factor.
|
|
std::string computeSkSL(const ResourceBindingRequirements&, int) const override {
|
|
return R"(
|
|
layout(set=0, binding=0) readonly buffer inputBlock
|
|
{
|
|
float factor;
|
|
float in_data[];
|
|
};
|
|
layout(set=0, binding=1) buffer outputBlock
|
|
{
|
|
float out_data[];
|
|
};
|
|
void main() {
|
|
out_data[sk_GlobalInvocationID.x] = in_data[sk_GlobalInvocationID.x] * factor;
|
|
}
|
|
)";
|
|
}
|
|
};
|
|
TestComputeStep step;
|
|
ComputePipelineDesc pipelineDesc(&step);
|
|
|
|
ResourceProvider* provider = recorder->priv().resourceProvider();
|
|
size_t inputSize = SkAlignTo(sizeof(float) * (kProblemSize + 1),
|
|
recorder->priv().caps()->requiredStorageBufferAlignment());
|
|
sk_sp<Buffer> inputBuffer = provider->findOrCreateBuffer(
|
|
inputSize, BufferType::kStorage, PrioritizeGpuReads::kNo);
|
|
size_t outputSize = SkAlignTo(sizeof(float) * kProblemSize,
|
|
recorder->priv().caps()->requiredStorageBufferAlignment());
|
|
sk_sp<Buffer> outputBuffer = provider->findOrCreateBuffer(
|
|
outputSize, BufferType::kStorage, PrioritizeGpuReads::kNo);
|
|
|
|
std::vector<ResourceBinding> bindings;
|
|
bindings.push_back({/*index=*/0, {inputBuffer.get(), /*offset=*/0}});
|
|
bindings.push_back({/*index=*/1, {outputBuffer.get(), /*offset=*/0}});
|
|
|
|
// Initialize "in_data" to contain an ascending sequence of integers.
|
|
// Initialize "out_data" to "-1"s.
|
|
{
|
|
float* inData = static_cast<float*>(inputBuffer->map());
|
|
float* outData = static_cast<float*>(outputBuffer->map());
|
|
SkASSERT(inputBuffer->isMapped() && inData != nullptr);
|
|
SkASSERT(outputBuffer->isMapped() && outData != nullptr);
|
|
|
|
inData[0] = kFactor; // "in_factor"
|
|
for (unsigned int i = 0; i < kProblemSize; ++i) {
|
|
inData[i + 1] = i + 1;
|
|
outData[i] = -1;
|
|
}
|
|
inputBuffer->unmap();
|
|
outputBuffer->unmap();
|
|
}
|
|
|
|
ComputePassDesc desc;
|
|
desc.fLocalDispatchSize = WorkgroupSize(kProblemSize, 1, 1);
|
|
|
|
// Record the compute pass task.
|
|
recorder->priv().add(ComputePassTask::Make(std::move(bindings), pipelineDesc, desc));
|
|
|
|
// Ensure the output buffer is synchronized to the CPU once the GPU submission has finished.
|
|
recorder->priv().add(SynchronizeToCpuTask::Make(outputBuffer));
|
|
|
|
// Submit the work and wait for it to complete.
|
|
std::unique_ptr<Recording> recording = recorder->snap();
|
|
if (!recording) {
|
|
ERRORF(reporter, "Failed to make recording");
|
|
return;
|
|
}
|
|
|
|
InsertRecordingInfo insertInfo;
|
|
insertInfo.fRecording = recording.get();
|
|
context->insertRecording(insertInfo);
|
|
context->submit(SyncToCpu::kYes);
|
|
|
|
// Verify the contents of the output buffer.
|
|
{
|
|
float* inData = static_cast<float*>(inputBuffer->map());
|
|
float* outData = static_cast<float*>(outputBuffer->map());
|
|
SkASSERT(inputBuffer->isMapped() && inData != nullptr);
|
|
SkASSERT(outputBuffer->isMapped() && outData != nullptr);
|
|
for (unsigned int i = 0; i < kProblemSize; ++i) {
|
|
const float expected = inData[i + 1] * kFactor;
|
|
const float found = outData[i];
|
|
REPORTER_ASSERT(
|
|
reporter, expected == found, "expected '%f', found '%f'", expected, found);
|
|
}
|
|
inputBuffer->unmap();
|
|
outputBuffer->unmap();
|
|
}
|
|
}
|
|
|
|
// TODO(b/260622403): The shader tested here is identical to
|
|
// `resources/sksl/compute/AtomicsOperations.compute`. It would be nice to be able to exercise SkSL
|
|
// features like this as part of SkSLTest.cpp instead of as a graphite test.
|
|
// TODO(b/262427430, b/262429132): Enable this test on other backends once they all support
|
|
// compute programs.
|
|
DEF_GRAPHITE_TEST_FOR_METAL_CONTEXT(ComputeShaderAtomicOperationsTest, reporter, context) {
|
|
std::unique_ptr<Recorder> recorder = context->makeRecorder();
|
|
|
|
class TestComputeStep : public ComputeStep {
|
|
public:
|
|
TestComputeStep() : ComputeStep("TestAtomicOperations", {}, {}) {}
|
|
~TestComputeStep() override = default;
|
|
|
|
// A kernel that increments a global (device memory) counter across multiple workgroups.
|
|
// Each workgroup maintains its own independent tally in a workgroup-shared counter which
|
|
// is then added to the global count.
|
|
//
|
|
// This exercises atomic store/load/add and coherent reads and writes over memory in storage
|
|
// and workgroup address spaces.
|
|
std::string computeSkSL(const ResourceBindingRequirements&, int) const override {
|
|
return R"(
|
|
layout(metal, binding = 0) buffer ssbo {
|
|
atomicUint globalCounter;
|
|
};
|
|
|
|
workgroup atomicUint localCounter;
|
|
|
|
void main() {
|
|
// Initialize the local counter.
|
|
if (sk_LocalInvocationID.x == 0) {
|
|
atomicStore(localCounter, 0);
|
|
}
|
|
|
|
// Synchronize the threads in the workgroup so they all see the initial value.
|
|
workgroupBarrier();
|
|
|
|
// All threads increment the counter.
|
|
atomicAdd(localCounter, 1);
|
|
|
|
// Synchronize the threads again to ensure they have all executed the increment
|
|
// and the following load reads the same value across all threads in the
|
|
// workgroup.
|
|
workgroupBarrier();
|
|
|
|
// Add the workgroup-only tally to the global counter.
|
|
if (sk_LocalInvocationID.x == 0) {
|
|
atomicAdd(globalCounter, atomicLoad(localCounter));
|
|
}
|
|
}
|
|
)";
|
|
}
|
|
};
|
|
TestComputeStep step;
|
|
ComputePipelineDesc pipelineDesc(&step);
|
|
|
|
ResourceProvider* provider = recorder->priv().resourceProvider();
|
|
size_t minSize = SkAlignTo(sizeof(uint32_t),
|
|
recorder->priv().caps()->requiredStorageBufferAlignment());
|
|
sk_sp<Buffer> ssbo = provider->findOrCreateBuffer(
|
|
minSize, BufferType::kStorage, PrioritizeGpuReads::kNo);
|
|
|
|
std::vector<ResourceBinding> bindings;
|
|
bindings.push_back({/*index=*/0, {ssbo.get(), /*offset=*/0}});
|
|
|
|
// Initialize the global counter to 0.
|
|
{
|
|
uint32_t* ssboData = static_cast<uint32_t*>(ssbo->map());
|
|
ssboData[0] = 0;
|
|
ssbo->unmap();
|
|
}
|
|
|
|
constexpr uint32_t kWorkgroupCount = 32;
|
|
constexpr uint32_t kWorkgroupSize = 1024;
|
|
|
|
ComputePassDesc desc;
|
|
desc.fGlobalDispatchSize = WorkgroupSize(kWorkgroupCount, 1, 1);
|
|
desc.fLocalDispatchSize = WorkgroupSize(kWorkgroupSize, 1, 1);
|
|
|
|
// Record the compute pass task.
|
|
recorder->priv().add(ComputePassTask::Make(std::move(bindings), pipelineDesc, desc));
|
|
|
|
// Ensure the output buffer is synchronized to the CPU once the GPU submission has finished.
|
|
recorder->priv().add(SynchronizeToCpuTask::Make(ssbo));
|
|
|
|
// Submit the work and wait for it to complete.
|
|
std::unique_ptr<Recording> recording = recorder->snap();
|
|
if (!recording) {
|
|
ERRORF(reporter, "Failed to make recording");
|
|
return;
|
|
}
|
|
|
|
InsertRecordingInfo insertInfo;
|
|
insertInfo.fRecording = recording.get();
|
|
context->insertRecording(insertInfo);
|
|
context->submit(SyncToCpu::kYes);
|
|
|
|
// Verify the contents of the output buffer.
|
|
{
|
|
constexpr uint32_t kExpectedCount = kWorkgroupCount * kWorkgroupSize;
|
|
const uint32_t result = static_cast<const uint32_t*>(ssbo->map())[0];
|
|
REPORTER_ASSERT(reporter,
|
|
result == kExpectedCount,
|
|
"expected '%d', found '%d'",
|
|
kExpectedCount, result);
|
|
ssbo->unmap();
|
|
}
|
|
}
|
|
|
|
// TODO(b/260622403): The shader tested here is identical to
|
|
// `resources/sksl/compute/AtomicsOperationsOverArrayAndStruct.compute`. It would be nice to be able
|
|
// to exercise SkSL features like this as part of SkSLTest.cpp instead of as a graphite test.
|
|
// TODO(b/262427430, b/262429132): Enable this test on other backends once they all support
|
|
// compute programs.
|
|
DEF_GRAPHITE_TEST_FOR_METAL_CONTEXT(ComputeShaderAtomicOperationsOverArrayAndStructTest,
|
|
reporter,
|
|
context) {
|
|
std::unique_ptr<Recorder> recorder = context->makeRecorder();
|
|
|
|
class TestComputeStep : public ComputeStep {
|
|
public:
|
|
TestComputeStep() : ComputeStep("TestAtomicOperationsOverArrayAndStruct", {}, {}) {}
|
|
~TestComputeStep() override = default;
|
|
|
|
// Construct a kernel that increments a two global (device memory) counters across multiple
|
|
// workgroups. Each workgroup maintains its own independent tallies in workgroup-shared
|
|
// counters which are then added to the global counts.
|
|
//
|
|
// This exercises atomic store/load/add and coherent reads and writes over memory in storage
|
|
// and workgroup address spaces.
|
|
std::string computeSkSL(const ResourceBindingRequirements&, int) const override {
|
|
return R"(
|
|
const uint WORKGROUP_SIZE = 1024;
|
|
|
|
struct GlobalCounts {
|
|
atomicUint firstHalfCount;
|
|
atomicUint secondHalfCount;
|
|
};
|
|
layout(metal, binding = 0) buffer ssbo {
|
|
GlobalCounts globalCounts;
|
|
};
|
|
|
|
workgroup atomicUint localCounts[2];
|
|
|
|
void main() {
|
|
// Initialize the local counts.
|
|
if (sk_LocalInvocationID.x == 0) {
|
|
atomicStore(localCounts[0], 0);
|
|
atomicStore(localCounts[1], 0);
|
|
}
|
|
|
|
// Synchronize the threads in the workgroup so they all see the initial value.
|
|
workgroupBarrier();
|
|
|
|
// Each thread increments one of the local counters based on its invocation
|
|
// index.
|
|
uint idx = sk_LocalInvocationID.x < (WORKGROUP_SIZE / 2) ? 0 : 1;
|
|
atomicAdd(localCounts[idx], 1);
|
|
|
|
// Synchronize the threads again to ensure they have all executed the increments
|
|
// and the following load reads the same value across all threads in the
|
|
// workgroup.
|
|
workgroupBarrier();
|
|
|
|
// Add the workgroup-only tally to the global counter.
|
|
if (sk_LocalInvocationID.x == 0) {
|
|
atomicAdd(globalCounts.firstHalfCount, atomicLoad(localCounts[0]));
|
|
atomicAdd(globalCounts.secondHalfCount, atomicLoad(localCounts[1]));
|
|
}
|
|
}
|
|
)";
|
|
}
|
|
};
|
|
TestComputeStep step;
|
|
ComputePipelineDesc pipelineDesc(&step);
|
|
|
|
ResourceProvider* provider = recorder->priv().resourceProvider();
|
|
size_t minSize = SkAlignTo(2*sizeof(uint32_t),
|
|
recorder->priv().caps()->requiredStorageBufferAlignment());
|
|
sk_sp<Buffer> ssbo = provider->findOrCreateBuffer(
|
|
minSize, BufferType::kStorage, PrioritizeGpuReads::kNo);
|
|
|
|
std::vector<ResourceBinding> bindings;
|
|
bindings.push_back({/*index=*/0, {ssbo.get(), /*offset=*/0}});
|
|
|
|
// Initialize the global counter to 0.
|
|
{
|
|
uint32_t* ssboData = static_cast<uint32_t*>(ssbo->map());
|
|
ssboData[0] = 0;
|
|
ssboData[1] = 0;
|
|
ssbo->unmap();
|
|
}
|
|
|
|
constexpr uint32_t kWorkgroupCount = 32;
|
|
constexpr uint32_t kWorkgroupSize = 1024;
|
|
|
|
ComputePassDesc desc;
|
|
desc.fGlobalDispatchSize = WorkgroupSize(kWorkgroupCount, 1, 1);
|
|
desc.fLocalDispatchSize = WorkgroupSize(kWorkgroupSize, 1, 1);
|
|
|
|
// Record the compute pass task.
|
|
recorder->priv().add(ComputePassTask::Make(std::move(bindings), pipelineDesc, desc));
|
|
|
|
// Ensure the output buffer is synchronized to the CPU once the GPU submission has finished.
|
|
recorder->priv().add(SynchronizeToCpuTask::Make(ssbo));
|
|
|
|
// Submit the work and wait for it to complete.
|
|
std::unique_ptr<Recording> recording = recorder->snap();
|
|
if (!recording) {
|
|
ERRORF(reporter, "Failed to make recording");
|
|
return;
|
|
}
|
|
|
|
InsertRecordingInfo insertInfo;
|
|
insertInfo.fRecording = recording.get();
|
|
context->insertRecording(insertInfo);
|
|
context->submit(SyncToCpu::kYes);
|
|
|
|
// Verify the contents of the output buffer.
|
|
{
|
|
constexpr uint32_t kExpectedCount = kWorkgroupCount * kWorkgroupSize / 2;
|
|
|
|
const uint32_t* ssboData = static_cast<const uint32_t*>(ssbo->map());
|
|
const uint32_t firstHalfCount = ssboData[0];
|
|
const uint32_t secondHalfCount = ssboData[1];
|
|
REPORTER_ASSERT(reporter,
|
|
firstHalfCount == kExpectedCount,
|
|
"expected '%d', found '%d'",
|
|
kExpectedCount, firstHalfCount);
|
|
REPORTER_ASSERT(reporter,
|
|
secondHalfCount == kExpectedCount,
|
|
"expected '%d', found '%d'",
|
|
kExpectedCount, secondHalfCount);
|
|
ssbo->unmap();
|
|
}
|
|
}
|