/*
 * Copyright (C) 2022 - 2023 spnda
 * This file is part of fastgltf.
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#if !defined(__cplusplus) || (!defined(_MSVC_LANG) && __cplusplus < 201703L) || (defined(_MSVC_LANG) && _MSVC_LANG < 201703L)
#error "fastgltf requires C++17"
#endif

#include <array>
#include <cassert>
#include <cstdint>
#include <cstring>
#include <functional>
#include <string_view>
#include <vector>

#include "simdjson.h"

// Project header declaring the fastgltf::base64 interface (the decode functions, getPadding,
// getOutputSize) as well as the FASTGLTF_IS_X86/FASTGLTF_IS_A64 platform macros.
#include <fastgltf/base64.hpp>

#if defined(FASTGLTF_IS_X86)
#if defined(__clang__) || defined(__GNUC__)
// The idea behind manually including all headers with the required intrinsics
// is that the usual intrin.h will only include these under Clang when -mavx or
// -mavx2 is specified, which in turn would have the entire program be compiled
// with these instructions used in optimisations.
#include <smmintrin.h>
#include <tmmintrin.h>
#include <avxintrin.h>
#include <avx2intrin.h>
#else
#include <intrin.h>
#endif
#elif defined(FASTGLTF_IS_A64)
#include <arm_neon.h> // Includes arm64_neon.h on MSVC
#endif

#ifdef _MSC_VER
#pragma warning(push)
#pragma warning(disable : 5030) // attribute is not recognized
#endif

namespace fg = fastgltf;

#if defined(_MSC_VER)
#define FORCEINLINE __forceinline
#else
// On other compilers we need the inline specifier, so that the functions in this compilation unit
// can be properly inlined without the "function body can be overwritten at link time" error.
#define FORCEINLINE inline
#endif

namespace fastgltf::base64 {
    using DecodeFunctionInplace = std::function<void(std::string_view, std::uint8_t*, std::size_t)>;
    using DecodeFunction = std::function<std::vector<std::uint8_t>(std::string_view)>;

    struct DecodeFunctionGetter {
        DecodeFunction func;
        DecodeFunctionInplace inplace;

        explicit DecodeFunctionGetter() {
            // We use simdjson's helper functions to determine which SIMD intrinsics are available at runtime.
            // The different implementations, because they're SIMD based, require a minimum number of chars, as
            // they load multiple at once.
            const auto& impls = simdjson::get_available_implementations();
#if defined(FASTGLTF_IS_X86)
            if (const auto* avx2 = impls["haswell"]; avx2 != nullptr && avx2->supported_by_runtime_system()) {
                func = avx2_decode;
                inplace = avx2_decode_inplace;
            } else if (const auto* sse4 = impls["westmere"]; sse4 != nullptr && sse4->supported_by_runtime_system()) {
                func = sse4_decode;
                inplace = sse4_decode_inplace;
            }
#elif defined(FASTGLTF_IS_A64)
            // _M_ARM64 always guarantees 64-bit ARM processors that support NEON, defined by MSVC.
            // __aarch64__ always guarantees 64-bit ARMv8 processors that support NEON, defined by Clang.
            // __ARM_NEON always guarantees NEON support, defined by Clang and GCC.
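            // simdjson registers its NEON implementation under the name "arm64"; if it is absent or
            // not usable on the running system, we fall through to the scalar fallback selected below.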
if (const auto* neon = impls["arm64"]; neon && neon->supported_by_runtime_system()) { func = neon_decode; inplace = neon_decode_inplace; } #else if (false) {} #endif else { func = fallback_decode; inplace = fallback_decode_inplace; } } static DecodeFunctionGetter* get() { static DecodeFunctionGetter getter; return &getter; } }; } // namespace fastgltf::base64 #if defined(FASTGLTF_IS_X86) // The AVX and SSE decoding functions are based on http://0x80.pl/notesen/2016-01-17-sse-base64-decoding.html. // It covers various methods of en-/decoding base64 using SSE and AVX and also shows their // performance metrics. // TODO: Mark these functions with msvc::forceinline which is available from C++20 [[gnu::target("avx2"), gnu::always_inline]] FORCEINLINE auto avx2_lookup_pshufb_bitmask(const __m256i input) { const auto higher_nibble = _mm256_and_si256(_mm256_srli_epi32(input, 4), _mm256_set1_epi8(0x0f)); const auto shiftLUT = _mm256_setr_epi8( 0, 0, 19, 4, -65, -65, -71, -71, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 19, 4, -65, -65, -71, -71, 0, 0, 0, 0, 0, 0, 0, 0); const auto sh = _mm256_shuffle_epi8(shiftLUT, higher_nibble); const auto eq_2f = _mm256_cmpeq_epi8(input, _mm256_set1_epi8(0x2f)); const auto shift = _mm256_blendv_epi8(sh, _mm256_set1_epi8(16), eq_2f); return _mm256_add_epi8(input, shift); } [[gnu::target("avx2"), gnu::always_inline]] FORCEINLINE auto avx2_pack_ints(__m256i input) { const auto merge = _mm256_maddubs_epi16(input, _mm256_set1_epi32(0x01400140)); return _mm256_madd_epi16(merge, _mm256_set1_epi32(0x00011000)); } [[gnu::target("avx2")]] void fg::base64::avx2_decode_inplace(std::string_view encoded, std::uint8_t* output, std::size_t padding) { constexpr auto dataSetSize = 32; constexpr auto dataOutputSize = 24; if (encoded.size() < dataSetSize) { fallback_decode_inplace(encoded, output, padding); return; } // We align the size to the highest size divisible by 32. By doing this, we don't need to // allocate any new memory to hold the encoded data and let the fallback decoder decode the // remaining data. const auto encodedSize = encoded.size(); const auto outputSize = getOutputSize(encodedSize, padding); const auto alignedSize = outputSize - (outputSize % dataOutputSize); auto* out = output; // _mm256_setr_epi8 accepts only 'char' but 0xff would overflow a signed char. // This gets optimised to the same assembly as a call to the aformentioned intrinsic. static const std::array shuffleData = {{ 2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, 0xff, 0xff, 0xff, 0xff, 2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, 0xff, 0xff, 0xff, 0xff }}; __m256i shuffle; std::memcpy(&shuffle, shuffleData.data(), shuffleData.size()); std::size_t pos = 0; while ((pos + dataSetSize) < alignedSize) { auto in = _mm256_loadu_si256(reinterpret_cast(&encoded[pos])); auto values = avx2_lookup_pshufb_bitmask(in); const auto merged = avx2_pack_ints(values); const auto shuffled = _mm256_shuffle_epi8(merged, shuffle); // Beware: This writes 32 bytes, we just discard the top 8 bytes. 
        _mm_storeu_si128(reinterpret_cast<__m128i*>(out), _mm256_castsi256_si128(shuffled));
        _mm_storeu_si128(reinterpret_cast<__m128i*>(out + (dataOutputSize / 2)), _mm256_extracti128_si256(shuffled, 1));

        out += dataOutputSize;
        pos += dataSetSize;
    }

    // Decode the last chunk traditionally
    fallback_decode_inplace(encoded.substr(pos, encodedSize), out, padding);
}

[[gnu::target("avx2")]] std::vector<std::uint8_t> fg::base64::avx2_decode(std::string_view encoded) {
    const auto encodedSize = encoded.size();
    const auto padding = getPadding(encoded);
    std::vector<std::uint8_t> ret(getOutputSize(encodedSize, padding));
    avx2_decode_inplace(encoded, ret.data(), padding);
    return ret;
}

[[gnu::target("sse4.1"), gnu::always_inline]] FORCEINLINE auto sse4_lookup_pshufb_bitmask(const __m128i input) {
    const auto higher_nibble = _mm_and_si128(_mm_srli_epi32(input, 4), _mm_set1_epi8(0x0f));

    const auto shiftLUT = _mm_setr_epi8(
        0, 0, 19, 4, -65, -65, -71, -71,
        0, 0,  0, 0,   0,   0,   0,   0);

    const auto sh = _mm_shuffle_epi8(shiftLUT, higher_nibble);
    const auto eq_2f = _mm_cmpeq_epi8(input, _mm_set1_epi8(0x2f));
    const auto shift = _mm_blendv_epi8(sh, _mm_set1_epi8(16), eq_2f);

    return _mm_add_epi8(input, shift);
}

[[gnu::target("sse4.1"), gnu::always_inline]] FORCEINLINE auto sse4_pack_ints(__m128i input) {
    const auto merge = _mm_maddubs_epi16(input, _mm_set1_epi32(0x01400140));
    return _mm_madd_epi16(merge, _mm_set1_epi32(0x00011000));
}

[[gnu::target("sse4.1")]] void fg::base64::sse4_decode_inplace(std::string_view encoded, std::uint8_t* output, std::size_t padding) {
    constexpr auto dataSetSize = 16;
    constexpr auto dataOutputSize = 12;

    if (encoded.size() < dataSetSize) {
        fallback_decode_inplace(encoded, output, padding);
        return;
    }

    // We round the output size down to a multiple of dataOutputSize. By doing this, we don't need
    // to allocate any new memory to hold the encoded data and let the fallback decoder decode the
    // remaining data.
    const auto encodedSize = encoded.size();
    const auto outputSize = getOutputSize(encodedSize, padding);
    const auto alignedSize = outputSize - (outputSize % dataOutputSize);

    auto* out = output;

    // _mm_setr_epi8 accepts only 'char', but 0xff would overflow a signed char.
    // This gets optimised to the same assembly as a call to the aforementioned intrinsic.
    static const std::array<std::uint8_t, 16> shuffleData = {{
        2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, 0xff, 0xff, 0xff, 0xff,
    }};
    __m128i shuffle;
    std::memcpy(&shuffle, shuffleData.data(), shuffleData.size());

    std::size_t pos = 0;
    while ((pos + dataSetSize) < alignedSize) {
        auto in = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&encoded[pos]));
        auto values = sse4_lookup_pshufb_bitmask(in);
        const auto merged = sse4_pack_ints(values);
        const auto shuffled = _mm_shuffle_epi8(merged, shuffle);

        // Beware: This writes 16 bytes, we just discard the top 4 bytes.
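        // The top 4 bytes are the zeros produced by the 0xff entries in the shuffle mask; since
        // 'out' only advances by 12, they never count towards the decoded output.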
        _mm_storeu_si128(reinterpret_cast<__m128i*>(out), shuffled);

        out += dataOutputSize;
        pos += dataSetSize;
    }

    // Decode the last chunk traditionally
    fallback_decode_inplace(encoded.substr(pos, encodedSize), out, padding);
}

[[gnu::target("sse4.1")]] std::vector<std::uint8_t> fg::base64::sse4_decode(std::string_view encoded) {
    const auto encodedSize = encoded.size();
    const auto padding = getPadding(encoded);
    std::vector<std::uint8_t> ret(getOutputSize(encodedSize, padding));
    sse4_decode_inplace(encoded, ret.data(), padding);
    return ret;
}
#elif defined(FASTGLTF_IS_A64)
[[gnu::always_inline]] FORCEINLINE int8x16_t neon_lookup_pshufb_bitmask(const uint8x16_t input) {
    // clang-format off
    constexpr std::array<std::int8_t, 16> shiftLUTdata = {
        0, 0, 19, 4, -65, -65, -71, -71,
        0, 0,  0, 0,   0,   0,   0,   0
    };
    // clang-format on

    const uint64x2_t higher_nibble = vandq_s32(vshlq_u32(vreinterpretq_u32_u8(input), vdupq_n_s32(-4)), vdupq_n_s8(0x0f));

    const int8x16_t shiftLUT = vld1q_s8(shiftLUTdata.data());

    const int8x16_t sh = vqtbl1q_s8(shiftLUT, vandq_u8(higher_nibble, vdupq_n_u8(0x8F)));
    const uint8x16_t eq_2f = vceqq_s8(input, vdupq_n_s8(0x2F));
    const uint8x16_t shift = vbslq_u8(vshrq_n_s8(eq_2f, 7), vdupq_n_s8(16), sh);

    return vaddq_s8(input, shift);
}

[[gnu::always_inline]] FORCEINLINE int16x8_t neon_pack_ints(const int8x16_t input) {
    const uint32x4_t mask = vdupq_n_u32(0x01400140);
    const int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(input))), vmovl_s8(vget_low_s8(mask)));
    const int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(input))), vmovl_s8(vget_high_s8(mask)));
    const int16x8_t merge = vqaddq_s16(vuzp1q_s16(tl, th), vuzp2q_s16(tl, th));

    // Multiply the eight signed 16-bit integers and add each pair of adjacent results (n and n + 1)
    // together, resulting in four 32-bit integers.
    const uint32x4_t mergeMask = vdupq_n_u32(0x00011000);
    const int32x4_t pl = vmull_s16(vget_low_s16(merge), vget_low_s16(mergeMask));
    const int32x4_t ph = vmull_high_s16(merge, mergeMask);
    return vpaddq_s32(pl, ph);
}

// clang-format off
[[gnu::aligned(16)]] static constexpr std::array<std::uint8_t, 16> shuffleData = {
    2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, 0xff, 0xff, 0xff, 0xff
};
// clang-format on

void fg::base64::neon_decode_inplace(std::string_view encoded, std::uint8_t* output, std::size_t padding) {
    constexpr auto dataSetSize = 16;
    constexpr auto dataOutputSize = 12;

    if (encoded.size() < dataSetSize) {
        fallback_decode_inplace(encoded, output, padding);
        return;
    }

    // We round the encoded size down to a multiple of dataSetSize. By doing this, we don't need to
    // allocate any new memory to hold the encoded data and let the fallback decoder decode the
    // remaining data.
    const auto encodedSize = encoded.size();
    const auto alignedSize = encodedSize - (encodedSize % dataSetSize);

    auto* out = output;

    // Decode the full 16-byte chunks with NEON intrinsics; the tail is handled by the fallback below.
    const auto shuffle = vld1q_u8(shuffleData.data());
    std::size_t pos = 0;
    while ((pos + dataSetSize) < alignedSize) {
        // Load 16 8-bit values into a 128-bit register.
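        // vld1q_u8 does not require the pointer to be 16-byte aligned, so we can load straight
        // from the string data.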
        auto in = vld1q_u8(reinterpret_cast<const std::uint8_t*>(&encoded[pos]));
        auto values = neon_lookup_pshufb_bitmask(in);
        const auto merged = neon_pack_ints(values);
        const auto masked = vandq_u8(shuffle, vdupq_n_u8(0x8F));
        const auto shuffled = vqtbl1q_s8(merged, masked);

        // Store 16 8-bit values into the output pointer; only the first 12 bytes are decoded data.
        vst1q_u8(out, shuffled);
        out += dataOutputSize;
        pos += dataSetSize;
    }

    // Decode the last chunk traditionally
    fallback_decode_inplace(encoded.substr(pos, encodedSize), out, padding);
}

std::vector<std::uint8_t> fg::base64::neon_decode(std::string_view encoded) {
    const auto encodedSize = encoded.size();
    const auto padding = getPadding(encoded);
    std::vector<std::uint8_t> ret(getOutputSize(encodedSize, padding));
    neon_decode_inplace(encoded, ret.data(), padding);
    return ret;
}
#endif

// clang-format off
// ASCII value -> base64 value LUT
static constexpr std::array<std::uint8_t, 128> base64lut = {
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,62,0,0,0,63,
    52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 0,0,0,0,0,0,0,
    0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25, 0,0,0,0,0,0,
    26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51, 0,0,0,0,0,
};
// clang-format on

namespace fastgltf::base64 {
    [[gnu::always_inline]] FORCEINLINE void decode_block(std::array<std::uint8_t, 4>& sixBitChars, std::uint8_t* output) {
        for (std::size_t i = 0; i < 4; i++) {
            assert(static_cast<std::size_t>(sixBitChars[i]) < base64lut.size());
            sixBitChars[i] = base64lut[sixBitChars[i]];
        }

        output[0] = (sixBitChars[0] << 2) + ((sixBitChars[1] & 0x30) >> 4);
        output[1] = ((sixBitChars[1] & 0xf) << 4) + ((sixBitChars[2] & 0x3c) >> 2);
        output[2] = ((sixBitChars[2] & 0x3) << 6) + sixBitChars[3];
    }
} // namespace fastgltf::base64

void fg::base64::fallback_decode_inplace(std::string_view encoded, std::uint8_t* output, std::size_t padding) {
    constexpr std::size_t blockSize = 4 * sizeof(char);
    std::array<std::uint8_t, 4> sixBitChars = {};

    // pos tracks how many characters we've parsed, batching 4 chars together; cursor tracks how
    // many bytes have been written to the output.
    const auto encodedSize = encoded.size();
    std::size_t cursor = 0U;
    for (auto pos = 0U; pos + 4 < encodedSize; pos += 4) {
        std::memcpy(sixBitChars.data(), &encoded[pos], blockSize);
        decode_block(sixBitChars, &output[cursor]);
        cursor += 3;
    }

    // Decode the last (possibly) padded characters
    std::memcpy(sixBitChars.data(), &encoded[encodedSize - 4], blockSize);
    std::array<std::uint8_t, 3> eightBitChars = {};
    decode_block(sixBitChars, eightBitChars.data());

    // Write the last characters, making sure not to write over the end.
    const std::size_t charsToWrite = 3 - padding;
    for (std::size_t j = 0; j < charsToWrite; ++j) {
        output[cursor++] = eightBitChars[j];
    }
}

std::vector<std::uint8_t> fg::base64::fallback_decode(std::string_view encoded) {
    const auto encodedSize = encoded.size();
    const auto padding = getPadding(encoded);
    std::vector<std::uint8_t> ret(getOutputSize(encodedSize, padding));
    fallback_decode_inplace(encoded, ret.data(), padding);
    return ret;
}

void fg::base64::decode_inplace(std::string_view encoded, std::uint8_t* output, std::size_t padding) {
    assert(encoded.size() % 4 == 0);
    return DecodeFunctionGetter::get()->inplace(encoded, output, padding);
}

std::vector<std::uint8_t> fg::base64::decode(std::string_view encoded) {
    assert(encoded.size() % 4 == 0);
    return DecodeFunctionGetter::get()->func(encoded);
}

#ifdef _MSC_VER
#pragma warning(pop)
#endif
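// A minimal usage sketch of the public entry points defined above. The caller below is hypothetical
// and not part of fastgltf; it only assumes what this file already requires, namely that the encoded
// length is a multiple of four (see the asserts in decode()/decode_inplace()).
//
//   std::string_view payload = /* base64 portion of a data URI */;
//
//   // One-shot decode into a freshly allocated vector:
//   std::vector<std::uint8_t> bytes = fastgltf::base64::decode(payload);
//
//   // Or decode into a caller-owned buffer sized via the helpers used throughout this file:
//   const auto padding = fastgltf::base64::getPadding(payload);
//   std::vector<std::uint8_t> buffer(fastgltf::base64::getOutputSize(payload.size(), padding));
//   fastgltf::base64::decode_inplace(payload, buffer.data(), padding);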