// QuaternionEngine/third_party/fastgltf/src/base64.cpp
/*
* Copyright (C) 2022 - 2023 spnda
* This file is part of fastgltf <https://github.com/spnda/fastgltf>.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#if !defined(__cplusplus) || (!defined(_MSVC_LANG) && __cplusplus < 201703L) || (defined(_MSVC_LANG) && _MSVC_LANG < 201703L)
#error "fastgltf requires C++17"
#endif
#include <array>
#include <cmath>
#include <functional>
#include "simdjson.h"
#include <fastgltf/base64.hpp>
#if defined(FASTGLTF_IS_X86)
#if defined(__clang__) || defined(__GNUC__)
// The idea behind manually including all headers with the required intrinsics
// is that the usual intrin.h only includes them under Clang when -mavx or
// -mavx2 is specified, which in turn would allow the compiler to use those
// instructions in its optimisations across the entire translation unit.
#include <immintrin.h>
#include <smmintrin.h>
#include <avxintrin.h>
#include <avx2intrin.h>
#else
#include <intrin.h>
#endif
#elif defined(FASTGLTF_IS_A64)
#include <arm_neon.h> // Includes arm64_neon.h on MSVC
#endif
#ifdef _MSC_VER
#pragma warning(push)
#pragma warning(disable : 5030)
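// C5030 is MSVC's "attribute is not recognized" warning; it fires for the [[gnu::...]]
// attributes used on the SIMD functions below.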
#endif
namespace fg = fastgltf;
#if defined(_MSC_VER)
#define FORCEINLINE __forceinline
#else
// On other compilers we need the inline specifier, so that the functions in this compilation unit
// can be properly inlined without the "function body can be overwritten at link time" error.
#define FORCEINLINE inline
#endif
namespace fastgltf::base64 {
using DecodeFunctionInplace = std::function<void(std::string_view, std::uint8_t*, std::size_t)>;
using DecodeFunction = std::function<std::vector<std::uint8_t>(std::string_view)>;
struct DecodeFunctionGetter {
DecodeFunction func;
DecodeFunctionInplace inplace;
explicit DecodeFunctionGetter() {
// We use simdjson's helper functions to determine which SIMD intrinsics are available at runtime.
// Because the different implementations are SIMD based, they each require a minimum number of
// chars, as they load multiple characters at once.
const auto& impls = simdjson::get_available_implementations();
#if defined(FASTGLTF_IS_X86)
if (const auto* avx2 = impls["haswell"]; avx2 != nullptr && avx2->supported_by_runtime_system()) {
func = avx2_decode;
inplace = avx2_decode_inplace;
} else if (const auto* sse4 = impls["westmere"]; sse4 != nullptr && sse4->supported_by_runtime_system()) {
func = sse4_decode;
inplace = sse4_decode_inplace;
}
#elif defined(FASTGLTF_IS_A64)
// _M_ARM64 always guarantees 64-bit ARM processors that support NEON, defined by MSVC.
// __aarch64__ always guarantees 64-bit ARMv8 processors that support NEON, defined by Clang and GCC.
// __ARM_NEON always guarantees NEON support, defined by Clang and GCC.
if (const auto* neon = impls["arm64"]; neon && neon->supported_by_runtime_system()) {
func = neon_decode;
inplace = neon_decode_inplace;
}
#else
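// No SIMD decoder is available for this architecture; the never-taken branch below only exists
// so that the unconditional else that follows always has an if to attach to.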
if (false) {}
#endif
else {
func = fallback_decode;
inplace = fallback_decode_inplace;
}
}
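// Meyers singleton: the decode function pointers are selected exactly once, on first use, and the
// initialisation of the local static is thread-safe since C++11.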
static DecodeFunctionGetter* get() {
static DecodeFunctionGetter getter;
return &getter;
}
};
} // namespace fastgltf::base64
#if defined(FASTGLTF_IS_X86)
// The AVX and SSE decoding functions are based on http://0x80.pl/notesen/2016-01-17-sse-base64-decoding.html.
// It covers various methods of en-/decoding base64 using SSE and AVX and also shows their
// performance metrics.
// TODO: Mark these functions with msvc::forceinline which is available from C++20
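// A short note on the lookup below (derived from the linked article): each ASCII char is mapped to
// its 6-bit value by adding a shift selected via its high nibble. 'A'-'Z' (0x41-0x5A) get -65 -> 0-25,
// 'a'-'z' (0x61-0x7A) get -71 -> 26-51, '0'-'9' (0x30-0x39) get +4 -> 52-61, and '+' (0x2B) gets
// +19 -> 62. '/' (0x2F) shares high nibble 2 with '+', so it is singled out by the compare against
// 0x2f and gets +16 -> 63 via the blend.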
[[gnu::target("avx2"), gnu::always_inline]] FORCEINLINE auto avx2_lookup_pshufb_bitmask(const __m256i input) {
const auto higher_nibble = _mm256_and_si256(_mm256_srli_epi32(input, 4), _mm256_set1_epi8(0x0f));
const auto shiftLUT = _mm256_setr_epi8(
0, 0, 19, 4, -65, -65, -71, -71,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 19, 4, -65, -65, -71, -71,
0, 0, 0, 0, 0, 0, 0, 0);
const auto sh = _mm256_shuffle_epi8(shiftLUT, higher_nibble);
const auto eq_2f = _mm256_cmpeq_epi8(input, _mm256_set1_epi8(0x2f));
const auto shift = _mm256_blendv_epi8(sh, _mm256_set1_epi8(16), eq_2f);
return _mm256_add_epi8(input, shift);
}
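// avx2_pack_ints packs the four 6-bit values of every 32-bit lane into a 24-bit triple:
// maddubs with 0x01400140 combines byte pairs as v0 * 64 + v1 (a 12-bit value), and madd with
// 0x00011000 combines those as p0 * 4096 + p1, i.e. v0<<18 | v1<<12 | v2<<6 | v3 in the low 24 bits.
// The byte shuffle in the decode loop then reverses the bytes of each lane into output order.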
[[gnu::target("avx2"), gnu::always_inline]] FORCEINLINE auto avx2_pack_ints(__m256i input) {
const auto merge = _mm256_maddubs_epi16(input, _mm256_set1_epi32(0x01400140));
return _mm256_madd_epi16(merge, _mm256_set1_epi32(0x00011000));
}
[[gnu::target("avx2")]] void fg::base64::avx2_decode_inplace(std::string_view encoded, std::uint8_t* output, std::size_t padding) {
constexpr auto dataSetSize = 32;
constexpr auto dataOutputSize = 24;
if (encoded.size() < dataSetSize) {
fallback_decode_inplace(encoded, output, padding);
return;
}
// We round the output size down to the largest multiple of dataOutputSize (24) and stop the SIMD
// loop before that point, letting the fallback decoder handle the remaining data. This way no new
// memory has to be allocated to hold the encoded tail.
const auto encodedSize = encoded.size();
const auto outputSize = getOutputSize(encodedSize, padding);
const auto alignedSize = outputSize - (outputSize % dataOutputSize);
auto* out = output;
// _mm256_setr_epi8 accepts only 'char' but 0xff would overflow a signed char.
// This gets optimised to the same assembly as a call to the aforementioned intrinsic.
static const std::array<std::uint8_t, 32> shuffleData = {{
2, 1, 0,
6, 5, 4,
10, 9, 8,
14, 13, 12,
0xff, 0xff, 0xff, 0xff,
2, 1, 0,
6, 5, 4,
10, 9, 8,
14, 13, 12,
0xff, 0xff, 0xff, 0xff
}};
__m256i shuffle;
std::memcpy(&shuffle, shuffleData.data(), shuffleData.size());
std::size_t pos = 0;
while ((pos + dataSetSize) < alignedSize) {
auto in = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&encoded[pos]));
auto values = avx2_lookup_pshufb_bitmask(in);
const auto merged = avx2_pack_ints(values);
const auto shuffled = _mm256_shuffle_epi8(merged, shuffle);
// Beware: the two 16-byte stores below touch 28 bytes starting at out; only the first 24 hold
// decoded data, and the trailing bytes are overwritten by the next iteration or the fallback pass.
_mm_storeu_si128(reinterpret_cast<__m128i*>(out), _mm256_castsi256_si128(shuffled));
_mm_storeu_si128(reinterpret_cast<__m128i*>(out + (dataOutputSize / 2)), _mm256_extracti128_si256(shuffled, 1));
out += dataOutputSize;
pos += dataSetSize;
}
// Decode the last chunk traditionally
fallback_decode_inplace(encoded.substr(pos, encodedSize), out, padding);
}
[[gnu::target("avx2")]] std::vector<std::uint8_t> fg::base64::avx2_decode(std::string_view encoded) {
const auto encodedSize = encoded.size();
const auto padding = getPadding(encoded);
std::vector<std::uint8_t> ret(getOutputSize(encodedSize, padding));
avx2_decode_inplace(encoded, ret.data(), padding);
return ret;
}
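// The SSE4.1 helpers below mirror the AVX2 ones on a single 128-bit lane: 16 input chars yield
// 12 output bytes per iteration, with the same shift-LUT and packing math as described above.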
[[gnu::target("sse4.1"), gnu::always_inline]] FORCEINLINE auto sse4_lookup_pshufb_bitmask(const __m128i input) {
const auto higher_nibble = _mm_and_si128(_mm_srli_epi32(input, 4), _mm_set1_epi8(0x0f));
const auto shiftLUT = _mm_setr_epi8(
0, 0, 19, 4, -65, -65, -71, -71,
0, 0, 0, 0, 0, 0, 0, 0);
const auto sh = _mm_shuffle_epi8(shiftLUT, higher_nibble);
const auto eq_2f = _mm_cmpeq_epi8(input, _mm_set1_epi8(0x2f));
const auto shift = _mm_blendv_epi8(sh, _mm_set1_epi8(16), eq_2f);
return _mm_add_epi8(input, shift);
}
[[gnu::target("sse4.1"), gnu::always_inline]] FORCEINLINE auto sse4_pack_ints(__m128i input) {
const auto merge = _mm_maddubs_epi16(input, _mm_set1_epi32(0x01400140));
return _mm_madd_epi16(merge, _mm_set1_epi32(0x00011000));
}
[[gnu::target("sse4.1")]] void fg::base64::sse4_decode_inplace(std::string_view encoded, std::uint8_t* output, std::size_t padding) {
constexpr auto dataSetSize = 16;
constexpr auto dataOutputSize = 12;
if (encoded.size() < dataSetSize) {
fallback_decode_inplace(encoded, output, padding);
return;
}
// We round the output size down to the largest multiple of dataOutputSize (12) and stop the SIMD
// loop before that point, letting the fallback decoder handle the remaining data. This way no new
// memory has to be allocated to hold the encoded tail.
const auto encodedSize = encoded.size();
const auto outputSize = getOutputSize(encodedSize, padding);
const auto alignedSize = outputSize - (outputSize % dataOutputSize);
auto* out = output;
// _mm_setr_epi8 accepts only 'char' but 0xff would overflow a signed char.
// This gets optimised to the same assembly as a call to the aforementioned intrinsic.
static const std::array<std::uint8_t, 16> shuffleData = {{
2, 1, 0,
6, 5, 4,
10, 9, 8,
14, 13, 12,
0xff, 0xff, 0xff, 0xff,
}};
__m128i shuffle;
std::memcpy(&shuffle, shuffleData.data(), shuffleData.size());
std::size_t pos = 0;
while ((pos + dataSetSize) < alignedSize) {
auto in = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&encoded[pos]));
auto values = sse4_lookup_pshufb_bitmask(in);
const auto merged = sse4_pack_ints(values);
const auto shuffled = _mm_shuffle_epi8(merged, shuffle);
// Beware: This writes 16 bytes, we just discard the top 4 bytes.
_mm_storeu_si128(reinterpret_cast<__m128i*>(out), shuffled);
out += dataOutputSize;
pos += dataSetSize;
}
// Decode the last chunk traditionally
fallback_decode_inplace(encoded.substr(pos, encodedSize), out, padding);
}
[[gnu::target("sse4.1")]] std::vector<std::uint8_t> fg::base64::sse4_decode(std::string_view encoded) {
const auto encodedSize = encoded.size();
const auto padding = getPadding(encoded);
std::vector<std::uint8_t> ret(getOutputSize(encodedSize, padding));
sse4_decode_inplace(encoded, ret.data(), padding);
return ret;
}
#elif defined(FASTGLTF_IS_A64)
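// The NEON path ports the same pshufb-based algorithm, using vqtbl1q for the byte shuffle and a
// widening-multiply/pairwise-add sequence in place of maddubs/madd; the shift-LUT data is identical
// to the x86 version.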
[[gnu::always_inline]] FORCEINLINE int8x16_t neon_lookup_pshufb_bitmask(const uint8x16_t input) {
// clang-format off
constexpr std::array<int8_t, 16> shiftLUTdata = {
0, 0, 19, 4, -65, -65, -71, -71,
0, 0, 0, 0, 0, 0, 0, 0
};
// clang-format on
const uint64x2_t higher_nibble = vandq_s32(vshlq_u32(vreinterpretq_u32_u8(input), vdupq_n_s32(-4)), vdupq_n_s8(0x0f));
const int8x16_t shiftLUT = vld1q_s8(shiftLUTdata.data());
const int8x16_t sh = vqtbl1q_s8(shiftLUT, vandq_u8(higher_nibble, vdupq_n_u8(0x8F)));
const uint8x16_t eq_2f = vceqq_s8(input, vdupq_n_s8(0x2F));
const uint8x16_t shift = vbslq_u8(vshrq_n_s8(eq_2f, 7), vdupq_n_s8(16), sh);
return vaddq_s8(input, shift);
}
[[gnu::always_inline]] FORCEINLINE int16x8_t neon_pack_ints(const int8x16_t input) {
const uint32x4_t mask = vdupq_n_u32(0x01400140);
const int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(input))), vmovl_s8(vget_low_s8(mask)));
const int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(input))), vmovl_s8(vget_high_s8(mask)));
const int16x8_t merge = vqaddq_s16(vuzp1q_s16(tl, th), vuzp2q_s16(tl, th));
// Multiply the 8 signed 16-bit integers from a and b and add the n and n + 1 results together,
// resulting in 4 32-bit integers.
const uint32x4_t mergeMask = vdupq_n_u32(0x00011000);
const int32x4_t pl = vmull_s16(vget_low_s16(merge), vget_low_s16(mergeMask));
const int32x4_t ph = vmull_high_s16(merge, mergeMask);
return vpaddq_s32(pl, ph);
}
// clang-format off
[[gnu::aligned(16)]] static constexpr std::array<uint8_t, 16> shuffleData = {
2, 1, 0,
6, 5, 4,
10, 9, 8,
14, 13, 12,
0xff, 0xff, 0xff, 0xff
};
// clang-format on
void fg::base64::neon_decode_inplace(std::string_view encoded, std::uint8_t* output, std::size_t padding) {
constexpr auto dataSetSize = 16;
constexpr auto dataOutputSize = 12;
if (encoded.size() < dataSetSize) {
fallback_decode_inplace(encoded, output, padding);
return;
}
// We round the encoded size down to the largest multiple of 16. By doing this, we don't need to
// allocate any new memory to hold the encoded data and let the fallback decoder decode the
// remaining data.
const auto encodedSize = encoded.size();
const auto alignedSize = encodedSize - (encodedSize % dataSetSize);
auto* out = output;
// Decode the full 16-char chunks with NEON intrinsics
const auto shuffle = vld1q_u8(shuffleData.data());
std::size_t pos = 0;
while ((pos + dataSetSize) < alignedSize) {
// Load 16 8-bit values into a 128-bit register.
auto in = vld1q_u8(reinterpret_cast<const std::uint8_t*>(&encoded[pos]));
auto values = neon_lookup_pshufb_bitmask(in);
const auto merged = neon_pack_ints(values);
const auto masked = vandq_u8(shuffle, vdupq_n_u8(0x8F));
const auto shuffled = vqtbl1q_s8(merged, masked);
// Store 16 8-bit values into output pointer
vst1q_u8(out, shuffled);
out += dataOutputSize;
pos += dataSetSize;
}
// Decode the last chunk traditionally
fallback_decode_inplace(encoded.substr(pos, encodedSize), out, padding);
}
std::vector<std::uint8_t> fg::base64::neon_decode(std::string_view encoded) {
const auto encodedSize = encoded.size();
const auto padding = getPadding(encoded);
std::vector<std::uint8_t> ret(getOutputSize(encodedSize, padding));
neon_decode_inplace(encoded, ret.data(), padding);
return ret;
}
#endif
// clang-format off
// ASCII value -> base64 value LUT
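// e.g. base64lut['A'] == 0, base64lut['z'] == 51, base64lut['+'] == 62, base64lut['/'] == 63;
// characters outside the base64 alphabet (including '=') map to 0.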
static constexpr std::array<std::uint8_t, 128> base64lut = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,62,0,0,0,63,
52, 53, 54, 55, 56, 57, 58, 59, 60, 61,
0,0,0,0,0,0,0,
0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,
0,0,0,0,0,0,
26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,
0,0,0,0,0,
};
// clang-format on
namespace fastgltf::base64 {
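// Scalar reference path: every four 6-bit values become three output bytes.
// Worked example for the block "TWFu" -> "Man":
//   'T'=19, 'W'=22, 'F'=5, 'u'=46
//   out[0] = (19 << 2) | (22 >> 4)        = 77  = 'M'
//   out[1] = ((22 & 0xf) << 4) | (5 >> 2) = 97  = 'a'
//   out[2] = ((5 & 0x3) << 6) | 46        = 110 = 'n'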
[[gnu::always_inline]] FORCEINLINE void decode_block(std::array<std::uint8_t, 4>& sixBitChars, std::uint8_t* output) {
for (std::size_t i = 0; i < 4; i++) {
assert(static_cast<std::size_t>(sixBitChars[i]) < base64lut.size());
sixBitChars[i] = base64lut[sixBitChars[i]];
}
output[0] = (sixBitChars[0] << 2) + ((sixBitChars[1] & 0x30) >> 4);
output[1] = ((sixBitChars[1] & 0xf) << 4) + ((sixBitChars[2] & 0x3c) >> 2);
output[2] = ((sixBitChars[2] & 0x3) << 6) + sixBitChars[3];
}
} // namespace fastgltf::base64
void fg::base64::fallback_decode_inplace(std::string_view encoded, std::uint8_t* output, std::size_t padding) {
constexpr std::size_t blockSize = 4 * sizeof(char);
std::array<std::uint8_t, 4> sixBitChars = {};
// We use pos to track how many chars we've parsed and to batch 4 chars together.
const auto encodedSize = encoded.size();
std::size_t cursor = 0U;
for (auto pos = 0U; pos + 4 < encodedSize; pos += 4) {
std::memcpy(sixBitChars.data(), &encoded[pos], blockSize);
decode_block(sixBitChars, &output[cursor]);
cursor += 3;
}
// Decode the last (possibly) padded characters
std::memcpy(sixBitChars.data(), &encoded[encodedSize - 4], blockSize);
std::array<std::uint8_t, 4> eightBitChars = {};
decode_block(sixBitChars, eightBitChars.data());
// Write the last characters, making sure not to write over the end.
const std::size_t charsToWrite = 3 - padding;
for (std::size_t j = 0; j < charsToWrite; ++j) {
output[cursor++] = eightBitChars[j];
}
}
std::vector<std::uint8_t> fg::base64::fallback_decode(std::string_view encoded) {
const auto encodedSize = encoded.size();
const auto padding = getPadding(encoded);
std::vector<std::uint8_t> ret(getOutputSize(encodedSize, padding));
fallback_decode_inplace(encoded, ret.data(), padding);
return ret;
}
void fg::base64::decode_inplace(std::string_view encoded, std::uint8_t* output, std::size_t padding) {
assert(encoded.size() % 4 == 0);
return DecodeFunctionGetter::get()->inplace(encoded, output, padding);
}
std::vector<std::uint8_t> fg::base64::decode(std::string_view encoded) {
assert(encoded.size() % 4 == 0);
return DecodeFunctionGetter::get()->func(encoded);
}
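// Minimal usage sketch (illustrative only; the encoded string here is just an example and not part
// of the library). decode() picks the best runtime implementation and allocates the output, while
// decode_inplace() writes into a caller-provided buffer sized via getOutputSize():
//
//   std::string_view encoded = "aGVsbG8gd29ybGQh"; // "hello world!", no '=' padding
//   std::vector<std::uint8_t> bytes = fastgltf::base64::decode(encoded);
//
//   const auto padding = fastgltf::base64::getPadding(encoded);
//   std::vector<std::uint8_t> buf(fastgltf::base64::getOutputSize(encoded.size(), padding));
//   fastgltf::base64::decode_inplace(encoded, buf.data(), padding);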
#ifdef _MSC_VER
#pragma warning(pop)
#endif