From b6610a242fead81328d974299ade858cc6b899b6 Mon Sep 17 00:00:00 2001 From: halx99 Date: Fri, 2 Jul 2021 00:18:02 +0800 Subject: [PATCH] Add decompress astc parallel support [ci build] a. Define macro ASTC_ENABLE_PARALLEL_DECOMPRESS to enable. --- cocos/base/astc.cpp | 203 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 200 insertions(+), 3 deletions(-) diff --git a/cocos/base/astc.cpp b/cocos/base/astc.cpp index 286d7eecaa..8f6aaceb77 100644 --- a/cocos/base/astc.cpp +++ b/cocos/base/astc.cpp @@ -8,9 +8,203 @@ #include "astc/astcenc.h" #include "astc/astcenc_internal.h" -uint8_t astc_decompress_image( - const uint8_t* in, uint32_t inlen, uint8_t* out, uint32_t dim_x, uint32_t dim_y, uint32_t block_x, uint32_t block_y) { +#include "yasio/detail/utils.hpp" +// #define ASTC_ENABLE_PARALLEL_DECOMPRESS 1 + +#if defined(ASTC_ENABLE_PARALLEL_DECOMPRESS) +typedef std::mutex astc_decompress_mutex_t; + +struct astc_decompress_task { /* One queued decompression job: codec config and context, input pointer, output image descriptor, and a manual reference count. */ + astc_decompress_task() { + + _nref = 1; + } + + void addRef() { + ++_nref; + } + + void release() { + if (--_nref == 0) + delete this; /* frees only the task object; no call that frees _context is visible in this patch. NOTE(review): confirm the context is released elsewhere */ + } + + astcenc_config config{}; + + const uint8_t* _in_texels = nullptr; + void* _out_texels[1]{}; + astcenc_context* _context = nullptr; + astcenc_image _image_out; + + std::atomic _nref; /* NOTE(review): no template argument is visible here; it may have been lost when this patch text was extracted. Verify against the real file */ +}; + +class astc_decompress_job_manager { /* Singleton (see get_instance) that starts worker threads running run() and hands them tasks through _taskQueue. */ +public: + static int s_thread_count; + + static astc_decompress_job_manager* get_instance() { + static astc_decompress_job_manager s_instance; + return &s_instance; + } + astc_decompress_job_manager() { + s_thread_count = std::thread::hardware_concurrency(); /* NOTE(review): the standard allows hardware_concurrency() to return 0, in which case the loop below starts no worker. Confirm a fallback is not needed */ + + for (int i = 0; i < s_thread_count; ++i) { + _threads.push_back(std::thread{&astc_decompress_job_manager::run, this}); + } + } + + ~astc_decompress_job_manager() { + _stopped = true; /* written before _taskQueueMtx is taken, while run() reads it holding the lock. NOTE(review): plain bool shared between threads, confirm this is acceptable */ + + _taskQueueMtx.lock(); + for (auto task : _taskQueue) + task->release(); + _taskQueue.clear(); + _taskQueueCV.notify_all(); + _taskQueueMtx.unlock(); + + for (auto& t : _threads) { + if (t.joinable()) + t.join(); + } + 
_threads.clear(); + } + + void decompress_parallel_sync( + const uint8_t* in, uint8_t* out, unsigned int dim_x, unsigned int dim_y, int block_x, int block_y) { /* Synchronous entry point: builds a task, queues it, wakes all workers, waits on manage_decompress, then pops and releases the front task. */ + auto task = new astc_decompress_task(); + init_task(task, in, out, dim_x, dim_y, block_x, block_y); + + _taskQueueMtx.lock(); + _taskQueue.push_back(task); + _taskQueueMtx.unlock(); + _taskQueueCV.notify_all(); // notify all to work for the single decompress task + + task->_context->manage_decompress.wait(); + + _taskQueueMtx.lock(); + if (!_taskQueue.empty()) { + auto t = _taskQueue.front(); + assert(t == task); /* expects this caller's task to be the one at the front of the queue */ + _taskQueue.pop_front(); + t->release(); + } + _taskQueueMtx.unlock(); + } + +public: + void init_task(astc_decompress_task* task, const uint8_t* in, uint8_t* out, unsigned int dim_x, unsigned int dim_y, + int block_x, int block_y) { + astcenc_error status = + astcenc_config_init(ASTCENC_PRF_LDR, block_x, block_y, 1, 0, ASTCENC_FLG_DECOMPRESS_ONLY, &task->config); + status = astcenc_context_alloc(&task->config, s_thread_count, &task->_context); /* status is overwritten here and never inspected in this function */ + task->_in_texels = in; + task->_out_texels[0] = out; + task->_image_out = {dim_x, dim_y, 1, ASTCENC_TYPE_U8, task->_out_texels}; + + // Only the first thread actually runs the initializer + unsigned int xblocks = (task->_image_out.dim_x + block_x - 1) / block_x; + unsigned int yblocks = (task->_image_out.dim_y + block_y - 1) / block_y; + unsigned int zblocks = (task->_image_out.dim_z); + task->_context->manage_decompress.init(zblocks * yblocks * xblocks); /* argument is the total block count of the image */ + } + + void run() { /* Worker thread body: waits on _taskQueueCV, then claims batches of block indices from the front task and decodes them into its output image. */ + const astcenc_swizzle swz_decode{ASTCENC_SWZ_R, ASTCENC_SWZ_G, ASTCENC_SWZ_B, ASTCENC_SWZ_A}; + + bool no_task_count = false; + + for (;;) { + std::unique_lock lck(_taskQueueMtx); + if (!_stopped && (_taskQueue.empty() || no_task_count)) + _taskQueueCV.wait(lck); + + if (_stopped) + break; + + if (_taskQueue.empty()) + continue; + lck.unlock(); // unlock make sure other thread can work for the task + + auto task = _taskQueue.front(); /* NOTE(review): read after lck.unlock(), while decompress_parallel_sync may pop and release this task under the mutex. Confirm the task cannot be freed while workers still use it */ + auto ctx = task->_context; + auto& image_out 
= task->_image_out; + + unsigned int block_x = ctx->config.block_x; + unsigned int block_y = ctx->config.block_y; + unsigned int block_z = ctx->config.block_z; + + unsigned int xblocks = (image_out.dim_x + block_x - 1) / block_x; + unsigned int yblocks = (image_out.dim_y + block_y - 1) / block_y; + unsigned int zblocks = (image_out.dim_z + block_z - 1) / block_z; + + int row_blocks = xblocks; + int plane_blocks = xblocks * yblocks; + + // Check we have enough output space (16 bytes per block) + /*size_t size_needed = xblocks * yblocks * zblocks * 16; + if (data_len < size_needed) { + return ASTCENC_ERR_OUT_OF_MEM; + }*/ + + image_block blk; + + auto data = task->_in_texels; + for (;;) { // process the task + unsigned int count = 0; + unsigned int base = ctx->manage_decompress.get_task_assignment(128, count); /* presumably claims up to 128 block indices starting at base and reports how many in count; verify against astcenc_internal.h */ + + no_task_count = !count; + if (no_task_count) { // this thread will going to suspend until new task added + break; + } + + for (unsigned int i = base; i < base + count; i++) { + // Decode i into x, y, z block indices + int z = i / plane_blocks; + unsigned int rem = i - (z * plane_blocks); + int y = rem / row_blocks; + int x = rem - (y * row_blocks); + + unsigned int offset = (((z * yblocks + y) * xblocks) + x) * 16; /* byte offset of block (x, y, z) in the input, 16 bytes per block */ + const uint8_t* bp = data + offset; + physical_compressed_block pcb = *(const physical_compressed_block*) bp; + symbolic_compressed_block scb; + + physical_to_symbolic(*ctx->bsd, pcb, scb); + + decompress_symbolic_block( + ctx->config.profile, *ctx->bsd, x * block_x, y * block_y, z * block_z, scb, blk); + + write_image_block(image_out, blk, *ctx->bsd, x * block_x, y * block_y, z * block_z, swz_decode); + } + + ctx->manage_decompress.complete_task_assignment(count); + } + } + } + + std::deque _taskQueue; /* pending tasks; run() always works on front(). NOTE(review): the element type is not visible in this text, verify against the real file */ + astc_decompress_mutex_t _taskQueueMtx; + std::condition_variable_any _taskQueueCV; + + std::vector _threads; + bool _stopped = false; /* set to true by the destructor; makes run() break out of its loop */ +}; +int astc_decompress_job_manager::s_thread_count = 1; +#endif + +uint8_t astc_decompress_image(const 
uint8_t* in, uint32_t inlen, uint8_t* out, uint32_t dim_x, uint32_t dim_y, + uint32_t block_x, uint32_t block_y) { + + auto start = yasio::highp_clock(); /* the only visible use is the commented-out CCLOG just before the return */ + +#if defined(ASTC_ENABLE_PARALLEL_DECOMPRESS) + astc_decompress_job_manager::get_instance()->decompress_parallel_sync(in, out, dim_x, dim_y, block_x, block_y); +#else static std::once_flag once_flag; std::call_once(once_flag, init_quant_mode_table); @@ -52,13 +246,16 @@ uint8_t astc_decompress_image( physical_to_symbolic(*bsd, pcb, scb); - decompress_symbolic_block(ASTCENC_PRF_LDR, *bsd, x * block_x, y * block_y, z * block_z, scb, blk); + decompress_symbolic_block(ASTCENC_PRF_LDR_SRGB, *bsd, x * block_x, y * block_y, z * block_z, scb, blk); /* NOTE(review): the serial path now decodes with ASTCENC_PRF_LDR_SRGB while init_task configures the parallel path with ASTCENC_PRF_LDR. Confirm the two paths are meant to differ */ write_image_block(image_out, blk, *bsd, x * block_x, y * block_y, z * block_z, swz_decode); } term_block_size_descriptor(*bsd); delete bsd; +#endif + + // CCLOG("decompress astc image cost: %.3lf(ms)", (yasio::highp_clock() - start) / (float) std::milli::den); return ASTCENC_SUCCESS; }