mirror of https://github.com/axmolengine/axmol.git
55923 lines
2.2 MiB
55923 lines
2.2 MiB
/* auto-generated on 2024-05-07 18:04:59 -0400. Do not edit! */
|
|
/* including simdjson.cpp: */
|
|
/* begin file simdjson.cpp */
|
|
#define SIMDJSON_SRC_SIMDJSON_CPP
|
|
|
|
/* including base.h: #include <base.h> */
|
|
/* begin file base.h */
|
|
#ifndef SIMDJSON_SRC_BASE_H
|
|
#define SIMDJSON_SRC_BASE_H
|
|
|
|
/* including simdjson/base.h: #include <simdjson/base.h> */
|
|
/* begin file simdjson/base.h */
|
|
/**
|
|
* @file Base declarations for all simdjson headers
|
|
* @private
|
|
*/
|
|
#ifndef SIMDJSON_BASE_H
|
|
#define SIMDJSON_BASE_H
|
|
|
|
/* including simdjson/common_defs.h: #include "simdjson/common_defs.h" */
|
|
/* begin file simdjson/common_defs.h */
|
|
#ifndef SIMDJSON_COMMON_DEFS_H
|
|
#define SIMDJSON_COMMON_DEFS_H
|
|
|
|
#include <cassert>
|
|
/* including simdjson/compiler_check.h: #include "simdjson/compiler_check.h" */
|
|
/* begin file simdjson/compiler_check.h */
|
|
#ifndef SIMDJSON_COMPILER_CHECK_H
#define SIMDJSON_COMPILER_CHECK_H

// simdjson is a C++ library: refuse to be compiled as C.
#ifndef __cplusplus
#error simdjson requires a C++ compiler
#endif

// SIMDJSON_CPLUSPLUS is the language level used by the checks below.
// Regular MSVC (not clang) reports it through _MSVC_LANG, and _MSC_VER == 1900
// is treated as C++11. Everything else uses __cplusplus directly.
// A value supplied by the user takes precedence.
#ifndef SIMDJSON_CPLUSPLUS
#if defined(_MSVC_LANG) && !defined(__clang__)
#define SIMDJSON_CPLUSPLUS (_MSC_VER == 1900 ? 201103L : _MSVC_LANG)
#else
#define SIMDJSON_CPLUSPLUS __cplusplus
#endif
#endif

// C++ 17
#if !defined(SIMDJSON_CPLUSPLUS17) && (SIMDJSON_CPLUSPLUS >= 201703L)
#define SIMDJSON_CPLUSPLUS17 1
#endif

// C++ 14
#if !defined(SIMDJSON_CPLUSPLUS14) && (SIMDJSON_CPLUSPLUS >= 201402L)
#define SIMDJSON_CPLUSPLUS14 1
#endif

// C++ 11
#if !defined(SIMDJSON_CPLUSPLUS11) && (SIMDJSON_CPLUSPLUS >= 201103L)
#define SIMDJSON_CPLUSPLUS11 1
#endif

// C++11 is the minimum supported standard.
#ifndef SIMDJSON_CPLUSPLUS11
#error simdjson requires a compiler compliant with the C++11 standard
#endif

// SIMDJSON_IF_CONSTEXPR expands to `if constexpr` when C++17 is available and
// to a plain `if` otherwise.
#ifndef SIMDJSON_IF_CONSTEXPR
#if SIMDJSON_CPLUSPLUS17
#define SIMDJSON_IF_CONSTEXPR if constexpr
#else
#define SIMDJSON_IF_CONSTEXPR if
#endif
#endif

#endif // SIMDJSON_COMPILER_CHECK_H
|
|
/* end file simdjson/compiler_check.h */
|
|
/* including simdjson/portability.h: #include "simdjson/portability.h" */
|
|
/* begin file simdjson/portability.h */
|
|
#ifndef SIMDJSON_PORTABILITY_H
|
|
#define SIMDJSON_PORTABILITY_H
|
|
|
|
#include <cstddef>
|
|
#include <cstdint>
|
|
#include <cstdlib>
|
|
#include <cfloat>
|
|
#include <cassert>
|
|
#ifndef _WIN32
|
|
// strcasecmp, strncasecmp
|
|
#include <strings.h>
|
|
#endif
|
|
|
|
#ifdef _MSC_VER
|
|
#define SIMDJSON_VISUAL_STUDIO 1
|
|
/**
|
|
* We want to differentiate carefully between
|
|
* clang under visual studio and regular visual
|
|
* studio.
|
|
*
|
|
* Under clang for Windows, we enable:
|
|
* * target pragmas so that part and only part of the
|
|
* code gets compiled for advanced instructions.
|
|
*
|
|
*/
|
|
#ifdef __clang__
|
|
// clang under visual studio
|
|
#define SIMDJSON_CLANG_VISUAL_STUDIO 1
|
|
#else
|
|
// just regular visual studio (best guess)
|
|
#define SIMDJSON_REGULAR_VISUAL_STUDIO 1
|
|
#endif // __clang__
|
|
#endif // _MSC_VER
|
|
|
|
#if (defined(__x86_64__) || defined(_M_AMD64)) && !defined(_M_ARM64EC)
|
|
#define SIMDJSON_IS_X86_64 1
|
|
#elif defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
|
|
#define SIMDJSON_IS_ARM64 1
|
|
#elif defined(__riscv) && __riscv_xlen == 64
|
|
#define SIMDJSON_IS_RISCV64 1
|
|
#elif defined(__loongarch_lp64)
|
|
#define SIMDJSON_IS_LOONGARCH64 1
|
|
#elif defined(__PPC64__) || defined(_M_PPC64)
|
|
#if defined(__ALTIVEC__)
|
|
#define SIMDJSON_IS_PPC64_VMX 1
|
|
#endif // defined(__ALTIVEC__)
|
|
#else
|
|
#define SIMDJSON_IS_32BITS 1
|
|
|
|
#if defined(_M_IX86) || defined(__i386__)
|
|
#define SIMDJSON_IS_X86_32BITS 1
|
|
#elif defined(__arm__) || defined(_M_ARM)
|
|
#define SIMDJSON_IS_ARM_32BITS 1
|
|
#elif defined(__PPC__) || defined(_M_PPC)
|
|
#define SIMDJSON_IS_PPC_32BITS 1
|
|
#endif
|
|
|
|
#endif // defined(__x86_64__) || defined(_M_AMD64)
|
|
#ifndef SIMDJSON_IS_32BITS
|
|
#define SIMDJSON_IS_32BITS 0
|
|
#endif
|
|
|
|
#if SIMDJSON_IS_32BITS
|
|
#ifndef SIMDJSON_NO_PORTABILITY_WARNING
|
|
// In the future, we should allow programmers
|
|
// to get a warning.
|
|
#endif // SIMDJSON_NO_PORTABILITY_WARNING
|
|
#endif // SIMDJSON_IS_32BITS
|
|
|
|
// Token pasting. SIMDJSON_CAT goes through one extra macro level so that its
// arguments are macro-expanded before they are concatenated.
#define SIMDJSON_CAT_IMPLEMENTATION_(a,...) a ## __VA_ARGS__
#define SIMDJSON_CAT(a,...) SIMDJSON_CAT_IMPLEMENTATION_(a, __VA_ARGS__)

// Stringification. SIMDJSON_STRINGIFY(x) macro-expands x first and then turns
// the result into a string literal.
// Any definition that is already visible is dropped first so that the two
// macros below are always the ones in effect.
#undef SIMDJSON_STRINGIFY_IMPLEMENTATION_
#undef SIMDJSON_STRINGIFY
#define SIMDJSON_STRINGIFY_IMPLEMENTATION_(a) #a
#define SIMDJSON_STRINGIFY(a) SIMDJSON_STRINGIFY_IMPLEMENTATION_(a)
|
|
|
|
// Our fast kernels require 64-bit systems.
|
|
//
|
|
// On 32-bit x86, we lack 64-bit popcnt, lzcnt, blsr instructions.
|
|
// Furthermore, the number of SIMD registers is reduced.
|
|
//
|
|
// On 32-bit ARM, we would have smaller registers.
|
|
//
|
|
// The simdjson users should still have the fallback kernel. It is
|
|
// slower, but it should run everywhere.
|
|
|
|
//
|
|
// Enable valid runtime implementations, and select SIMDJSON_BUILTIN_IMPLEMENTATION
|
|
//
|
|
|
|
// We are going to use runtime dispatch.
|
|
#if SIMDJSON_IS_X86_64
|
|
#ifdef __clang__
|
|
// clang does not have GCC push pop
|
|
// warning: clang attribute push can't be used within a namespace in clang up
|
|
// til 8.0 so SIMDJSON_TARGET_REGION and SIMDJSON_UNTARGET_REGION must be *outside* of a
|
|
// namespace.
|
|
#define SIMDJSON_TARGET_REGION(T) \
|
|
_Pragma(SIMDJSON_STRINGIFY( \
|
|
clang attribute push(__attribute__((target(T))), apply_to = function)))
|
|
#define SIMDJSON_UNTARGET_REGION _Pragma("clang attribute pop")
|
|
#elif defined(__GNUC__)
|
|
// GCC is easier
|
|
#define SIMDJSON_TARGET_REGION(T) \
|
|
_Pragma("GCC push_options") _Pragma(SIMDJSON_STRINGIFY(GCC target(T)))
|
|
#define SIMDJSON_UNTARGET_REGION _Pragma("GCC pop_options")
|
|
#endif // clang then gcc
|
|
|
|
#endif // x86
|
|
|
|
// Default target region macros don't do anything.
|
|
#ifndef SIMDJSON_TARGET_REGION
|
|
#define SIMDJSON_TARGET_REGION(T)
|
|
#define SIMDJSON_UNTARGET_REGION
|
|
#endif
|
|
|
|
// Is threading enabled?
|
|
#if defined(_REENTRANT) || defined(_MT)
|
|
#ifndef SIMDJSON_THREADS_ENABLED
|
|
#define SIMDJSON_THREADS_ENABLED
|
|
#endif
|
|
#endif
|
|
|
|
// workaround for large stack sizes under -O0.
|
|
// https://github.com/simdjson/simdjson/issues/691
|
|
#ifdef __APPLE__
|
|
#ifndef __OPTIMIZE__
|
|
// Apple systems have small stack sizes in secondary threads.
|
|
// Lack of compiler optimization may generate high stack usage.
|
|
// Users may want to disable threads for safety, but only when
|
|
// in debug mode which we detect by the fact that the __OPTIMIZE__
|
|
// macro is not defined.
|
|
#undef SIMDJSON_THREADS_ENABLED
|
|
#endif
|
|
#endif
|
|
|
|
|
|
#if defined(__clang__)
|
|
#define SIMDJSON_NO_SANITIZE_UNDEFINED __attribute__((no_sanitize("undefined")))
|
|
#elif defined(__GNUC__)
|
|
#define SIMDJSON_NO_SANITIZE_UNDEFINED __attribute__((no_sanitize_undefined))
|
|
#else
|
|
#define SIMDJSON_NO_SANITIZE_UNDEFINED
|
|
#endif
|
|
|
|
|
|
#if defined(__clang__) || defined(__GNUC__)
|
|
#if defined(__has_feature)
|
|
# if __has_feature(memory_sanitizer)
|
|
#define SIMDJSON_NO_SANITIZE_MEMORY __attribute__((no_sanitize("memory")))
|
|
# endif // if __has_feature(memory_sanitizer)
|
|
#endif // defined(__has_feature)
|
|
#endif
|
|
// make sure it is defined as 'nothing' if it is inapplicable.
|
|
#ifndef SIMDJSON_NO_SANITIZE_MEMORY
|
|
#define SIMDJSON_NO_SANITIZE_MEMORY
|
|
#endif
|
|
|
|
#if SIMDJSON_VISUAL_STUDIO
|
|
// This is one case where we do not distinguish between
|
|
// regular visual studio and clang under visual studio.
|
|
// clang under Windows has _stricmp (like visual studio) but not strcasecmp (as clang normally has)
|
|
#define simdjson_strcasecmp _stricmp
|
|
#define simdjson_strncasecmp _strnicmp
|
|
#else
|
|
// The strcasecmp, strncasecmp, and strcasestr functions do not work with multibyte strings (e.g. UTF-8).
|
|
// So they are only useful for ASCII in our context.
|
|
// https://www.gnu.org/software/libunistring/manual/libunistring.html#char-_002a-strings
|
|
#define simdjson_strcasecmp strcasecmp
|
|
#define simdjson_strncasecmp strncasecmp
|
|
#endif
|
|
|
|
// SIMDJSON_UNREACHABLE() marks a code path that must never execute.
// SIMDJSON_ASSUME(COND) states that COND always holds at that point.
//
// NOTE(review): the non-MSVC and debug definitions of SIMDJSON_UNREACHABLE()
// carry their own trailing semicolon while the MSVC one does not. Callers
// should therefore always write `SIMDJSON_UNREACHABLE();` and avoid using it
// as the unbraced body of an `if` that has an `else`.
#if defined(NDEBUG) || defined(__OPTIMIZE__) || (defined(_MSC_VER) && !defined(_DEBUG))
// If NDEBUG is set, or __OPTIMIZE__ is set, or we are under MSVC in release mode,
// then do away with asserts and use compiler hints (__assume under Visual
// Studio, __builtin_unreachable elsewhere).
#if SIMDJSON_VISUAL_STUDIO
#define SIMDJSON_UNREACHABLE() __assume(0)
#define SIMDJSON_ASSUME(COND) __assume(COND)
#else
#define SIMDJSON_UNREACHABLE() __builtin_unreachable();
#define SIMDJSON_ASSUME(COND) do { if (!(COND)) __builtin_unreachable(); } while (0)
#endif

#else // defined(NDEBUG) || defined(__OPTIMIZE__) || (defined(_MSC_VER) && !defined(_DEBUG))
// This should only ever be enabled in debug mode: violations trip an assert.
#define SIMDJSON_UNREACHABLE() assert(0);
#define SIMDJSON_ASSUME(COND) assert(COND)

#endif
|
|
|
|
#endif // SIMDJSON_PORTABILITY_H
|
|
/* end file simdjson/portability.h */
|
|
|
|
namespace simdjson {
namespace internal {
/**
 * @private
 * Our own implementation of the C++17 to_chars function.
 * Defined in src/to_chars
 */
char *to_chars(char *first, const char *last, double value);
/**
 * @private
 * A number parsing routine.
 * Defined in src/from_chars
 */
double from_chars(const char *first) noexcept;
double from_chars(const char *first, const char* end) noexcept;
} // namespace internal

// SIMDJSON_EXCEPTIONS is 1 when the compiler reports exception support through
// __cpp_exceptions and 0 otherwise. A value supplied by the user takes precedence.
#ifndef SIMDJSON_EXCEPTIONS
#if __cpp_exceptions
#define SIMDJSON_EXCEPTIONS 1
#else
#define SIMDJSON_EXCEPTIONS 0
#endif
#endif

} // namespace simdjson
|
|
|
|
// Helpers that bracket a region of code with named LLVM-MCA markers.
//  - SIMDJSON_BEGIN_DEBUG_BLOCK(name) / SIMDJSON_END_DEBUG_BLOCK(name) emit the
//    opening / closing marker as an assembly comment.
//  - SIMDJSON_DEBUG_BLOCK(name, block) runs `block` between the two markers.
// All three already end with a semicolon.
#if defined(__GNUC__)
// Marks a block with a name so that MCA analysis can see it.
#define SIMDJSON_BEGIN_DEBUG_BLOCK(name) __asm volatile("# LLVM-MCA-BEGIN " #name);
#define SIMDJSON_END_DEBUG_BLOCK(name) __asm volatile("# LLVM-MCA-END " #name);
// The helpers must be referenced by their SIMDJSON_-prefixed names: there are
// no unprefixed BEGIN_DEBUG_BLOCK / END_DEBUG_BLOCK macros.
#define SIMDJSON_DEBUG_BLOCK(name, block) SIMDJSON_BEGIN_DEBUG_BLOCK(name); block; SIMDJSON_END_DEBUG_BLOCK(name);
#else
// No markers on other compilers, but the wrapped code must still run.
#define SIMDJSON_BEGIN_DEBUG_BLOCK(name)
#define SIMDJSON_END_DEBUG_BLOCK(name)
#define SIMDJSON_DEBUG_BLOCK(name, block) block;
#endif
|
|
|
|
// Align to N-byte boundary.
// These macros work by masking with (n - 1), so they only give the intended
// result when n is a power of two. Arguments are evaluated more than once.
//  - SIMDJSON_ROUNDUP_N(a, n):    smallest multiple of n that is >= a
//  - SIMDJSON_ROUNDDOWN_N(a, n):  largest multiple of n that is <= a
#define SIMDJSON_ROUNDUP_N(a, n) (((a) + ((n)-1)) & ~((n)-1))
#define SIMDJSON_ROUNDDOWN_N(a, n) ((a) & ~((n)-1))

// True when the address held by ptr is a multiple of n (n a power of two).
#define SIMDJSON_ISALIGNED_N(ptr, n) (((uintptr_t)(ptr) & ((n)-1)) == 0)
|
|
|
|
#if SIMDJSON_REGULAR_VISUAL_STUDIO
|
|
|
|
#define simdjson_really_inline __forceinline
|
|
#define simdjson_never_inline __declspec(noinline)
|
|
|
|
#define simdjson_unused
|
|
#define simdjson_warn_unused
|
|
|
|
#ifndef simdjson_likely
|
|
#define simdjson_likely(x) x
|
|
#endif
|
|
#ifndef simdjson_unlikely
|
|
#define simdjson_unlikely(x) x
|
|
#endif
|
|
|
|
#define SIMDJSON_PUSH_DISABLE_WARNINGS __pragma(warning( push ))
|
|
#define SIMDJSON_PUSH_DISABLE_ALL_WARNINGS __pragma(warning( push, 0 ))
|
|
#define SIMDJSON_DISABLE_VS_WARNING(WARNING_NUMBER) __pragma(warning( disable : WARNING_NUMBER ))
|
|
// Get rid of Intellisense-only warnings (Code Analysis)
|
|
// Though __has_include is C++17, it is supported in Visual Studio 2017 or better (_MSC_VER>=1910).
|
|
#ifdef __has_include
|
|
#if __has_include(<CppCoreCheck\Warnings.h>)
|
|
#include <CppCoreCheck\Warnings.h>
|
|
#define SIMDJSON_DISABLE_UNDESIRED_WARNINGS SIMDJSON_DISABLE_VS_WARNING(ALL_CPPCORECHECK_WARNINGS)
|
|
#endif
|
|
#endif
|
|
|
|
#ifndef SIMDJSON_DISABLE_UNDESIRED_WARNINGS
|
|
#define SIMDJSON_DISABLE_UNDESIRED_WARNINGS
|
|
#endif
|
|
|
|
#define SIMDJSON_DISABLE_DEPRECATED_WARNING SIMDJSON_DISABLE_VS_WARNING(4996)
|
|
#define SIMDJSON_DISABLE_STRICT_OVERFLOW_WARNING
|
|
#define SIMDJSON_POP_DISABLE_WARNINGS __pragma(warning( pop ))
|
|
|
|
#define SIMDJSON_PUSH_DISABLE_UNUSED_WARNINGS
|
|
#define SIMDJSON_POP_DISABLE_UNUSED_WARNINGS
|
|
|
|
#else // SIMDJSON_REGULAR_VISUAL_STUDIO
|
|
|
|
#define simdjson_really_inline inline __attribute__((always_inline))
|
|
#define simdjson_never_inline inline __attribute__((noinline))
|
|
|
|
#define simdjson_unused __attribute__((unused))
|
|
#define simdjson_warn_unused __attribute__((warn_unused_result))
|
|
|
|
#ifndef simdjson_likely
|
|
#define simdjson_likely(x) __builtin_expect(!!(x), 1)
|
|
#endif
|
|
#ifndef simdjson_unlikely
|
|
#define simdjson_unlikely(x) __builtin_expect(!!(x), 0)
|
|
#endif
|
|
|
|
#define SIMDJSON_PUSH_DISABLE_WARNINGS _Pragma("GCC diagnostic push")
|
|
// gcc doesn't seem to disable all warnings with all and extra, add warnings here as necessary
|
|
// We do it separately for clang since it has different warnings.
|
|
#ifdef __clang__
|
|
// clang is missing -Wmaybe-uninitialized.
|
|
#define SIMDJSON_PUSH_DISABLE_ALL_WARNINGS SIMDJSON_PUSH_DISABLE_WARNINGS \
|
|
SIMDJSON_DISABLE_GCC_WARNING(-Weffc++) \
|
|
SIMDJSON_DISABLE_GCC_WARNING(-Wall) \
|
|
SIMDJSON_DISABLE_GCC_WARNING(-Wconversion) \
|
|
SIMDJSON_DISABLE_GCC_WARNING(-Wextra) \
|
|
SIMDJSON_DISABLE_GCC_WARNING(-Wattributes) \
|
|
SIMDJSON_DISABLE_GCC_WARNING(-Wimplicit-fallthrough) \
|
|
SIMDJSON_DISABLE_GCC_WARNING(-Wnon-virtual-dtor) \
|
|
SIMDJSON_DISABLE_GCC_WARNING(-Wreturn-type) \
|
|
SIMDJSON_DISABLE_GCC_WARNING(-Wshadow) \
|
|
SIMDJSON_DISABLE_GCC_WARNING(-Wunused-parameter) \
|
|
SIMDJSON_DISABLE_GCC_WARNING(-Wunused-variable)
|
|
#else // __clang__
|
|
#define SIMDJSON_PUSH_DISABLE_ALL_WARNINGS SIMDJSON_PUSH_DISABLE_WARNINGS \
|
|
SIMDJSON_DISABLE_GCC_WARNING(-Weffc++) \
|
|
SIMDJSON_DISABLE_GCC_WARNING(-Wall) \
|
|
SIMDJSON_DISABLE_GCC_WARNING(-Wconversion) \
|
|
SIMDJSON_DISABLE_GCC_WARNING(-Wextra) \
|
|
SIMDJSON_DISABLE_GCC_WARNING(-Wattributes) \
|
|
SIMDJSON_DISABLE_GCC_WARNING(-Wimplicit-fallthrough) \
|
|
SIMDJSON_DISABLE_GCC_WARNING(-Wnon-virtual-dtor) \
|
|
SIMDJSON_DISABLE_GCC_WARNING(-Wreturn-type) \
|
|
SIMDJSON_DISABLE_GCC_WARNING(-Wshadow) \
|
|
SIMDJSON_DISABLE_GCC_WARNING(-Wunused-parameter) \
|
|
SIMDJSON_DISABLE_GCC_WARNING(-Wunused-variable) \
|
|
SIMDJSON_DISABLE_GCC_WARNING(-Wmaybe-uninitialized) \
|
|
SIMDJSON_DISABLE_GCC_WARNING(-Wformat-security)
|
|
#endif // __clang__
|
|
|
|
#define SIMDJSON_PRAGMA(P) _Pragma(#P)
|
|
#define SIMDJSON_DISABLE_GCC_WARNING(WARNING) SIMDJSON_PRAGMA(GCC diagnostic ignored #WARNING)
|
|
#if SIMDJSON_CLANG_VISUAL_STUDIO
|
|
#define SIMDJSON_DISABLE_UNDESIRED_WARNINGS SIMDJSON_DISABLE_GCC_WARNING(-Wmicrosoft-include)
|
|
#else
|
|
#define SIMDJSON_DISABLE_UNDESIRED_WARNINGS
|
|
#endif
|
|
#define SIMDJSON_DISABLE_DEPRECATED_WARNING SIMDJSON_DISABLE_GCC_WARNING(-Wdeprecated-declarations)
|
|
#define SIMDJSON_DISABLE_STRICT_OVERFLOW_WARNING SIMDJSON_DISABLE_GCC_WARNING(-Wstrict-overflow)
|
|
#define SIMDJSON_POP_DISABLE_WARNINGS _Pragma("GCC diagnostic pop")
|
|
|
|
#define SIMDJSON_PUSH_DISABLE_UNUSED_WARNINGS SIMDJSON_PUSH_DISABLE_WARNINGS \
|
|
SIMDJSON_DISABLE_GCC_WARNING(-Wunused)
|
|
#define SIMDJSON_POP_DISABLE_UNUSED_WARNINGS SIMDJSON_POP_DISABLE_WARNINGS
|
|
|
|
|
|
|
|
#endif // SIMDJSON_REGULAR_VISUAL_STUDIO
|
|
|
|
#if defined(simdjson_inline)
|
|
// Prefer the user's definition of simdjson_inline; don't define it ourselves.
|
|
#elif defined(__GNUC__) && !defined(__OPTIMIZE__)
|
|
// If optimizations are disabled, forcing inlining can lead to significant
|
|
// code bloat and high compile times. Don't use simdjson_really_inline for
|
|
// unoptimized builds.
|
|
#define simdjson_inline inline
|
|
#else
|
|
// Force inlining for most simdjson functions.
|
|
#define simdjson_inline simdjson_really_inline
|
|
#endif
|
|
|
|
#if SIMDJSON_VISUAL_STUDIO
/**
 * Windows users need to do some extra work when building
 * or using a dynamic library (DLL). When building, we need
 * to set SIMDJSON_DLLIMPORTEXPORT to __declspec(dllexport).
 * When *using* the DLL, the user needs to set
 * SIMDJSON_DLLIMPORTEXPORT to __declspec(dllimport).
 *
 * Static libraries do not require such work.
 *
 * It does not matter here whether you are using
 * the regular visual studio or clang under visual
 * studio, you still need to handle these issues.
 *
 * Non-Windows systems do not have this complexity.
 */
#if SIMDJSON_BUILDING_WINDOWS_DYNAMIC_LIBRARY
// We set SIMDJSON_BUILDING_WINDOWS_DYNAMIC_LIBRARY when we build a DLL under Windows.
// It should never happen that both SIMDJSON_BUILDING_WINDOWS_DYNAMIC_LIBRARY and
// SIMDJSON_USING_WINDOWS_DYNAMIC_LIBRARY are set.
#define SIMDJSON_DLLIMPORTEXPORT __declspec(dllexport)
#elif SIMDJSON_USING_WINDOWS_DYNAMIC_LIBRARY
// Windows users who call a dynamic library should set SIMDJSON_USING_WINDOWS_DYNAMIC_LIBRARY to 1.
#define SIMDJSON_DLLIMPORTEXPORT __declspec(dllimport)
#else
// We assume by default static linkage
#define SIMDJSON_DLLIMPORTEXPORT
#endif

/**
 * Workaround for the vcpkg package manager. Only vcpkg should
 * ever touch the next line. The SIMDJSON_USING_LIBRARY macro is otherwise unused.
 */
#if SIMDJSON_USING_LIBRARY
// SIMDJSON_DLLIMPORTEXPORT is always defined by the chain above, so it has to
// be dropped before it can be given a different replacement list.
#undef SIMDJSON_DLLIMPORTEXPORT
#define SIMDJSON_DLLIMPORTEXPORT __declspec(dllimport)
#endif
/**
 * End of workaround for the vcpkg package manager.
 */
#else
// Not under Visual Studio: no import/export decoration is needed.
#define SIMDJSON_DLLIMPORTEXPORT
#endif
|
|
|
|
// C++17 requires string_view.
|
|
#if SIMDJSON_CPLUSPLUS17
|
|
#define SIMDJSON_HAS_STRING_VIEW
|
|
#include <string_view> // by the standard, this has to be safe.
|
|
#endif
|
|
|
|
// This macro (__cpp_lib_string_view) has to be defined
|
|
// for C++17 and better, but if it is otherwise defined,
|
|
// we are going to assume that string_view is available
|
|
// even if we do not have C++17 support.
|
|
#ifdef __cpp_lib_string_view
|
|
#define SIMDJSON_HAS_STRING_VIEW
|
|
#endif
|
|
|
|
// Some systems have string_view even if we do not have C++17 support,
|
|
// and even if __cpp_lib_string_view is undefined, it is the case
|
|
// with Apple clang version 11.
|
|
// We must handle it. *This is important.*
|
|
#ifndef SIMDJSON_HAS_STRING_VIEW
|
|
#if defined __has_include
|
|
// do not combine the next #if with the previous one (unsafe)
|
|
#if __has_include (<string_view>)
|
|
// now it is safe to trigger the include
|
|
#include <string_view> // though the file is there, it does not follow that we got the implementation
|
|
#if defined(_LIBCPP_STRING_VIEW)
|
|
// Ah! So we are under libc++, which, under its Library Fundamentals Technical Specification (which preceded C++17),
|
|
// included string_view.
|
|
// This means that we have string_view *even though* we may not have C++17.
|
|
#define SIMDJSON_HAS_STRING_VIEW
|
|
#endif // _LIBCPP_STRING_VIEW
|
|
#endif // __has_include (<string_view>)
|
|
#endif // defined __has_include
|
|
#endif // ndef SIMDJSON_HAS_STRING_VIEW
|
|
// end of complicated but important routine to try to detect string_view.
|
|
|
|
//
|
|
// Backfill std::string_view using nonstd::string_view on systems where
|
|
// we expect that string_view is missing. Important: if we get this wrong,
|
|
// we will end up with two string_view definitions and potential trouble.
|
|
// That is why we work so hard above to avoid it.
|
|
//
|
|
#ifndef SIMDJSON_HAS_STRING_VIEW
|
|
SIMDJSON_PUSH_DISABLE_ALL_WARNINGS
|
|
/* including simdjson/nonstd/string_view.hpp: #include "simdjson/nonstd/string_view.hpp" */
|
|
/* begin file simdjson/nonstd/string_view.hpp */
|
|
// Copyright 2017-2020 by Martin Moene
|
|
//
|
|
// string-view lite, a C++17-like string_view for C++98 and later.
|
|
// For more information see https://github.com/martinmoene/string-view-lite
|
|
//
|
|
// Distributed under the Boost Software License, Version 1.0.
|
|
// (See accompanying file LICENSE.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
|
|
|
// #pragma once // We remove #pragma once here as it generates a warning in some cases. We rely on the include guard.
|
|
|
|
#ifndef NONSTD_SV_LITE_H_INCLUDED
|
|
#define NONSTD_SV_LITE_H_INCLUDED
|
|
|
|
#define string_view_lite_MAJOR 1
|
|
#define string_view_lite_MINOR 7
|
|
#define string_view_lite_PATCH 0
|
|
|
|
#define string_view_lite_VERSION nssv_STRINGIFY(string_view_lite_MAJOR) "." nssv_STRINGIFY(string_view_lite_MINOR) "." nssv_STRINGIFY(string_view_lite_PATCH)
|
|
|
|
#define nssv_STRINGIFY( x ) nssv_STRINGIFY_( x )
|
|
#define nssv_STRINGIFY_( x ) #x
|
|
|
|
// string-view lite configuration:
|
|
|
|
#define nssv_STRING_VIEW_DEFAULT 0
|
|
#define nssv_STRING_VIEW_NONSTD 1
|
|
#define nssv_STRING_VIEW_STD 2
|
|
|
|
// tweak header support:
|
|
|
|
#ifdef __has_include
|
|
# if __has_include(<nonstd/string_view.tweak.hpp>)
|
|
# include <nonstd/string_view.tweak.hpp>
|
|
# endif
|
|
#define nssv_HAVE_TWEAK_HEADER 1
|
|
#else
|
|
#define nssv_HAVE_TWEAK_HEADER 0
|
|
//# pragma message("string_view.hpp: Note: Tweak header not supported.")
|
|
#endif
|
|
|
|
// string_view selection and configuration:
|
|
|
|
#if !defined( nssv_CONFIG_SELECT_STRING_VIEW )
|
|
# define nssv_CONFIG_SELECT_STRING_VIEW ( nssv_HAVE_STD_STRING_VIEW ? nssv_STRING_VIEW_STD : nssv_STRING_VIEW_NONSTD )
|
|
#endif
|
|
|
|
#ifndef nssv_CONFIG_STD_SV_OPERATOR
|
|
# define nssv_CONFIG_STD_SV_OPERATOR 0
|
|
#endif
|
|
|
|
#ifndef nssv_CONFIG_USR_SV_OPERATOR
|
|
# define nssv_CONFIG_USR_SV_OPERATOR 1
|
|
#endif
|
|
|
|
#ifdef nssv_CONFIG_CONVERSION_STD_STRING
|
|
# define nssv_CONFIG_CONVERSION_STD_STRING_CLASS_METHODS nssv_CONFIG_CONVERSION_STD_STRING
|
|
# define nssv_CONFIG_CONVERSION_STD_STRING_FREE_FUNCTIONS nssv_CONFIG_CONVERSION_STD_STRING
|
|
#endif
|
|
|
|
#ifndef nssv_CONFIG_CONVERSION_STD_STRING_CLASS_METHODS
|
|
# define nssv_CONFIG_CONVERSION_STD_STRING_CLASS_METHODS 1
|
|
#endif
|
|
|
|
#ifndef nssv_CONFIG_CONVERSION_STD_STRING_FREE_FUNCTIONS
|
|
# define nssv_CONFIG_CONVERSION_STD_STRING_FREE_FUNCTIONS 1
|
|
#endif
|
|
|
|
#ifndef nssv_CONFIG_NO_STREAM_INSERTION
|
|
# define nssv_CONFIG_NO_STREAM_INSERTION 0
|
|
#endif
|
|
|
|
#ifndef nssv_CONFIG_CONSTEXPR11_STD_SEARCH
|
|
# define nssv_CONFIG_CONSTEXPR11_STD_SEARCH 1
|
|
#endif
|
|
|
|
// Control presence of exception handling (try and auto discover):
|
|
|
|
#ifndef nssv_CONFIG_NO_EXCEPTIONS
|
|
# if defined(_MSC_VER)
|
|
# include <cstddef> // for _HAS_EXCEPTIONS
|
|
# endif
|
|
# if defined(__cpp_exceptions) || defined(__EXCEPTIONS) || (_HAS_EXCEPTIONS)
|
|
# define nssv_CONFIG_NO_EXCEPTIONS 0
|
|
# else
|
|
# define nssv_CONFIG_NO_EXCEPTIONS 1
|
|
# endif
|
|
#endif
|
|
|
|
// C++ language version detection (C++23 is speculative):
|
|
// Note: VC14.0/1900 (VS2015) lacks too much from C++14.
|
|
|
|
#ifndef nssv_CPLUSPLUS
|
|
# if defined(_MSVC_LANG ) && !defined(__clang__)
|
|
# define nssv_CPLUSPLUS (_MSC_VER == 1900 ? 201103L : _MSVC_LANG )
|
|
# else
|
|
# define nssv_CPLUSPLUS __cplusplus
|
|
# endif
|
|
#endif
|
|
|
|
#define nssv_CPP98_OR_GREATER ( nssv_CPLUSPLUS >= 199711L )
|
|
#define nssv_CPP11_OR_GREATER ( nssv_CPLUSPLUS >= 201103L )
|
|
#define nssv_CPP11_OR_GREATER_ ( nssv_CPLUSPLUS >= 201103L )
|
|
#define nssv_CPP14_OR_GREATER ( nssv_CPLUSPLUS >= 201402L )
|
|
#define nssv_CPP17_OR_GREATER ( nssv_CPLUSPLUS >= 201703L )
|
|
#define nssv_CPP20_OR_GREATER ( nssv_CPLUSPLUS >= 202002L )
|
|
#define nssv_CPP23_OR_GREATER ( nssv_CPLUSPLUS >= 202300L )
|
|
|
|
// use C++17 std::string_view if available and requested:
|
|
|
|
#if nssv_CPP17_OR_GREATER && defined(__has_include )
|
|
# if __has_include( <string_view> )
|
|
# define nssv_HAVE_STD_STRING_VIEW 1
|
|
# else
|
|
# define nssv_HAVE_STD_STRING_VIEW 0
|
|
# endif
|
|
#else
|
|
# define nssv_HAVE_STD_STRING_VIEW 0
|
|
#endif
|
|
|
|
#define nssv_USES_STD_STRING_VIEW ( (nssv_CONFIG_SELECT_STRING_VIEW == nssv_STRING_VIEW_STD) || ((nssv_CONFIG_SELECT_STRING_VIEW == nssv_STRING_VIEW_DEFAULT) && nssv_HAVE_STD_STRING_VIEW) )
|
|
|
|
#define nssv_HAVE_STARTS_WITH ( nssv_CPP20_OR_GREATER || !nssv_USES_STD_STRING_VIEW )
|
|
#define nssv_HAVE_ENDS_WITH nssv_HAVE_STARTS_WITH
|
|
|
|
//
|
|
// Use C++17 std::string_view:
|
|
//
|
|
|
|
#if nssv_USES_STD_STRING_VIEW
|
|
|
|
#include <string_view>
|
|
|
|
// Extensions for std::string:
|
|
|
|
#if nssv_CONFIG_CONVERSION_STD_STRING_FREE_FUNCTIONS
|
|
|
|
namespace nonstd {
|
|
|
|
/// Builds an owning std::basic_string holding a copy of the characters of `v`.
///
/// @param v  the view whose characters are copied.
/// @param a  allocator handed to the resulting string (default-constructed
///           when omitted).
/// @return a string constructed from the range [v.begin(), v.end()).
template< class CharT, class Traits, class Allocator = std::allocator<CharT> >
std::basic_string<CharT, Traits, Allocator>
to_string( std::basic_string_view<CharT, Traits> v, Allocator const & a = Allocator() )
{
    return std::basic_string<CharT, Traits, Allocator>( v.begin(), v.end(), a );
}
|
|
|
|
/// Creates a non-owning view over the characters of `s`.
///
/// The view is built from s.data() and s.size(), so it refers to the string's
/// own storage and does not copy it.
///
/// @param s  the string to view.
/// @return a std::basic_string_view covering all of `s`.
template< class CharT, class Traits, class Allocator >
std::basic_string_view<CharT, Traits>
to_string_view( std::basic_string<CharT, Traits, Allocator> const & s )
{
    return std::basic_string_view<CharT, Traits>( s.data(), s.size() );
}
|
|
|
|
// Literal operators sv and _sv:
|
|
|
|
#if nssv_CONFIG_STD_SV_OPERATOR
|
|
|
|
using namespace std::literals::string_view_literals;
|
|
|
|
#endif
|
|
|
|
#if nssv_CONFIG_USR_SV_OPERATOR
|
|
|
|
// User-defined literal `_sv`: "text"_sv yields a std::basic_string_view over
// the literal's characters, one overload per character type.
// The operators are spelled `operator""_sv` (no whitespace before the suffix):
// the form with whitespace is deprecated and draws warnings from recent compilers.
inline namespace literals {
inline namespace string_view_literals {

constexpr std::string_view operator""_sv( const char* str, std::size_t len ) noexcept  // (1)
{
    return std::string_view{ str, len };
}

constexpr std::u16string_view operator""_sv( const char16_t* str, std::size_t len ) noexcept  // (2)
{
    return std::u16string_view{ str, len };
}

constexpr std::u32string_view operator""_sv( const char32_t* str, std::size_t len ) noexcept  // (3)
{
    return std::u32string_view{ str, len };
}

constexpr std::wstring_view operator""_sv( const wchar_t* str, std::size_t len ) noexcept  // (4)
{
    return std::wstring_view{ str, len };
}

}} // namespace literals::string_view_literals
|
|
|
|
#endif // nssv_CONFIG_USR_SV_OPERATOR
|
|
|
|
} // namespace nonstd
|
|
|
|
#endif // nssv_CONFIG_CONVERSION_STD_STRING_FREE_FUNCTIONS
|
|
|
|
// When the standard string_view is in use, namespace nonstd simply re-exports
// the std types and operators so that code written against nonstd:: names
// keeps compiling.
namespace nonstd {

using std::string_view;
using std::wstring_view;
using std::u16string_view;
using std::u32string_view;
using std::basic_string_view;

// literal "sv" and "_sv", see above

// Comparison operators.
using std::operator==;
using std::operator!=;
using std::operator<;
using std::operator<=;
using std::operator>;
using std::operator>=;

// Stream insertion.
using std::operator<<;

} // namespace nonstd
|
|
|
|
#else // nssv_USES_STD_STRING_VIEW
|
|
|
|
//
|
|
// Before C++17: use string_view lite:
|
|
//
|
|
|
|
// Compiler versions:
|
|
//
|
|
// MSVC++ 6.0 _MSC_VER == 1200 nssv_COMPILER_MSVC_VERSION == 60 (Visual Studio 6.0)
|
|
// MSVC++ 7.0 _MSC_VER == 1300 nssv_COMPILER_MSVC_VERSION == 70 (Visual Studio .NET 2002)
|
|
// MSVC++ 7.1 _MSC_VER == 1310 nssv_COMPILER_MSVC_VERSION == 71 (Visual Studio .NET 2003)
|
|
// MSVC++ 8.0 _MSC_VER == 1400 nssv_COMPILER_MSVC_VERSION == 80 (Visual Studio 2005)
|
|
// MSVC++ 9.0 _MSC_VER == 1500 nssv_COMPILER_MSVC_VERSION == 90 (Visual Studio 2008)
|
|
// MSVC++ 10.0 _MSC_VER == 1600 nssv_COMPILER_MSVC_VERSION == 100 (Visual Studio 2010)
|
|
// MSVC++ 11.0 _MSC_VER == 1700 nssv_COMPILER_MSVC_VERSION == 110 (Visual Studio 2012)
|
|
// MSVC++ 12.0 _MSC_VER == 1800 nssv_COMPILER_MSVC_VERSION == 120 (Visual Studio 2013)
|
|
// MSVC++ 14.0 _MSC_VER == 1900 nssv_COMPILER_MSVC_VERSION == 140 (Visual Studio 2015)
|
|
// MSVC++ 14.1 _MSC_VER >= 1910 nssv_COMPILER_MSVC_VERSION == 141 (Visual Studio 2017)
|
|
// MSVC++ 14.2 _MSC_VER >= 1920 nssv_COMPILER_MSVC_VERSION == 142 (Visual Studio 2019)
|
|
|
|
// Regular MSVC (not clang): expose the raw _MSC_VER and a compact version
// number derived from it. Both are 0 on every other compiler.
#if defined(_MSC_VER ) && !defined(__clang__)
# define nssv_COMPILER_MSVC_VER (_MSC_VER )
# define nssv_COMPILER_MSVC_VERSION (_MSC_VER / 10 - 10 * ( 5 + (_MSC_VER < 1900 ) ) )
#else
# define nssv_COMPILER_MSVC_VER 0
# define nssv_COMPILER_MSVC_VERSION 0
#endif

// Packs a version triple into one integer: 100 * major + 10 * minor + patch.
#define nssv_COMPILER_VERSION( major, minor, patch ) ( 10 * ( 10 * (major) + (minor) ) + (patch) )

// Apple clang and upstream clang are tracked separately; at most one of the
// two macros is non-zero.
#if defined( __apple_build_version__ )
# define nssv_COMPILER_APPLECLANG_VERSION nssv_COMPILER_VERSION(__clang_major__, __clang_minor__, __clang_patchlevel__)
# define nssv_COMPILER_CLANG_VERSION 0
#elif defined( __clang__ )
# define nssv_COMPILER_APPLECLANG_VERSION 0
# define nssv_COMPILER_CLANG_VERSION nssv_COMPILER_VERSION(__clang_major__, __clang_minor__, __clang_patchlevel__)
#else
# define nssv_COMPILER_APPLECLANG_VERSION 0
# define nssv_COMPILER_CLANG_VERSION 0
#endif

// GCC proper only: clang also defines __GNUC__, so it is excluded explicitly.
#if defined(__GNUC__) && !defined(__clang__)
# define nssv_COMPILER_GNUC_VERSION nssv_COMPILER_VERSION(__GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__)
#else
# define nssv_COMPILER_GNUC_VERSION 0
#endif

// half-open range [lo..hi):
#define nssv_BETWEEN( v, lo, hi ) ( (lo) <= (v) && (v) < (hi) )
|
|
|
|
// Presence of language and library features:
|
|
|
|
#ifdef _HAS_CPP0X
|
|
# define nssv_HAS_CPP0X _HAS_CPP0X
|
|
#else
|
|
# define nssv_HAS_CPP0X 0
|
|
#endif
|
|
|
|
// Unless defined otherwise below, consider VC14 as C++11 for string-view-lite:
|
|
|
|
#if nssv_COMPILER_MSVC_VER >= 1900
|
|
# undef nssv_CPP11_OR_GREATER
|
|
# define nssv_CPP11_OR_GREATER 1
|
|
#endif
|
|
|
|
#define nssv_CPP11_90 (nssv_CPP11_OR_GREATER_ || nssv_COMPILER_MSVC_VER >= 1500)
|
|
#define nssv_CPP11_100 (nssv_CPP11_OR_GREATER_ || nssv_COMPILER_MSVC_VER >= 1600)
|
|
#define nssv_CPP11_110 (nssv_CPP11_OR_GREATER_ || nssv_COMPILER_MSVC_VER >= 1700)
|
|
#define nssv_CPP11_120 (nssv_CPP11_OR_GREATER_ || nssv_COMPILER_MSVC_VER >= 1800)
|
|
#define nssv_CPP11_140 (nssv_CPP11_OR_GREATER_ || nssv_COMPILER_MSVC_VER >= 1900)
|
|
#define nssv_CPP11_141 (nssv_CPP11_OR_GREATER_ || nssv_COMPILER_MSVC_VER >= 1910)
|
|
|
|
#define nssv_CPP14_000 (nssv_CPP14_OR_GREATER)
|
|
#define nssv_CPP17_000 (nssv_CPP17_OR_GREATER)
|
|
|
|
// Presence of C++11 language features:
|
|
|
|
#define nssv_HAVE_CONSTEXPR_11 nssv_CPP11_140
|
|
#define nssv_HAVE_EXPLICIT_CONVERSION nssv_CPP11_140
|
|
#define nssv_HAVE_INLINE_NAMESPACE nssv_CPP11_140
|
|
#define nssv_HAVE_IS_DEFAULT nssv_CPP11_140
|
|
#define nssv_HAVE_IS_DELETE nssv_CPP11_140
|
|
#define nssv_HAVE_NOEXCEPT nssv_CPP11_140
|
|
#define nssv_HAVE_NULLPTR nssv_CPP11_100
|
|
#define nssv_HAVE_REF_QUALIFIER nssv_CPP11_140
|
|
#define nssv_HAVE_UNICODE_LITERALS nssv_CPP11_140
|
|
#define nssv_HAVE_USER_DEFINED_LITERALS nssv_CPP11_140
|
|
#define nssv_HAVE_WCHAR16_T nssv_CPP11_100
|
|
#define nssv_HAVE_WCHAR32_T nssv_CPP11_100
|
|
|
|
#if ! ( ( nssv_CPP11_OR_GREATER && nssv_COMPILER_CLANG_VERSION ) || nssv_BETWEEN( nssv_COMPILER_CLANG_VERSION, 300, 400 ) )
|
|
# define nssv_HAVE_STD_DEFINED_LITERALS nssv_CPP11_140
|
|
#else
|
|
# define nssv_HAVE_STD_DEFINED_LITERALS 0
|
|
#endif
|
|
|
|
// Presence of C++14 language features:
|
|
|
|
#define nssv_HAVE_CONSTEXPR_14 nssv_CPP14_000
|
|
|
|
// Presence of C++17 language features:
|
|
|
|
#define nssv_HAVE_NODISCARD nssv_CPP17_000
|
|
|
|
// Presence of C++ library features:
|
|
|
|
#define nssv_HAVE_STD_HASH nssv_CPP11_120
|
|
|
|
// Presence of compiler intrinsics:
|
|
|
|
// Providing char-type specializations for compare() and length() that
|
|
// use compiler intrinsics can improve compile- and run-time performance.
|
|
//
|
|
// The challenge is in using the right combinations of builtin availability
|
|
// and its constexpr-ness.
|
|
//
|
|
// | compiler | __builtin_memcmp (constexpr) | memcmp (constexpr) |
|
|
// |----------|------------------------------|---------------------|
|
|
// | clang | 4.0 (>= 4.0 ) | any (? ) |
|
|
// | clang-a | 9.0 (>= 9.0 ) | any (? ) |
|
|
// | gcc | any (constexpr) | any (? ) |
|
|
// | msvc | >= 14.2 C++17 (>= 14.2 ) | any (? ) |
|
|
|
|
// nssv_HAVE_BUILTIN_VER: the compiler/version combinations from the table
// above (MSVC >= 142 in C++17 mode, any GNUC, clang >= 400, AppleClang >= 900).
#define nssv_HAVE_BUILTIN_VER ( (nssv_CPP17_000 && nssv_COMPILER_MSVC_VERSION >= 142) || nssv_COMPILER_GNUC_VERSION > 0 || nssv_COMPILER_CLANG_VERSION >= 400 || nssv_COMPILER_APPLECLANG_VERSION >= 900 )
|
|
// nssv_HAVE_BUILTIN_CE: the builtins are usable in constant expressions;
// currently the same condition as nssv_HAVE_BUILTIN_VER.
#define nssv_HAVE_BUILTIN_CE ( nssv_HAVE_BUILTIN_VER )
|
|
|
|
// Enable the char overloads of detail::compare() / detail::length() either
// when the corresponding constexpr level is unavailable (constexpr-ness is
// then irrelevant) or when it is available and the builtin is constexpr-capable.
#define nssv_HAVE_BUILTIN_MEMCMP ( (nssv_HAVE_CONSTEXPR_14 && nssv_HAVE_BUILTIN_CE) || !nssv_HAVE_CONSTEXPR_14 )
|
|
#define nssv_HAVE_BUILTIN_STRLEN ( (nssv_HAVE_CONSTEXPR_11 && nssv_HAVE_BUILTIN_CE) || !nssv_HAVE_CONSTEXPR_11 )
|
|
|
|
// nssv_HAVE_BUILTIN( x ): forwards to __has_builtin( x ) when the compiler
// provides it, otherwise evaluates to 0 so it can still be used in #if.
#ifdef __has_builtin
|
|
# define nssv_HAVE_BUILTIN( x ) __has_builtin( x )
|
|
#else
|
|
# define nssv_HAVE_BUILTIN( x ) 0
|
|
#endif
|
|
|
|
// nssv_BUILTIN_MEMCMP: __builtin_memcmp when the compiler reports it or the
// version check passes, otherwise the plain memcmp function.
#if nssv_HAVE_BUILTIN(__builtin_memcmp) || nssv_HAVE_BUILTIN_VER
|
|
# define nssv_BUILTIN_MEMCMP __builtin_memcmp
|
|
#else
|
|
# define nssv_BUILTIN_MEMCMP memcmp
|
|
#endif
|
|
|
|
// nssv_BUILTIN_STRLEN: __builtin_strlen when the compiler reports it or the
// version check passes, otherwise the plain strlen function.
#if nssv_HAVE_BUILTIN(__builtin_strlen) || nssv_HAVE_BUILTIN_VER
|
|
# define nssv_BUILTIN_STRLEN __builtin_strlen
|
|
#else
|
|
# define nssv_BUILTIN_STRLEN strlen
|
|
#endif
|
|
|
|
// C++ feature usage:
|
|
|
|
// nssv_constexpr: expands to 'constexpr' when C++11 constexpr is available,
// otherwise to nothing (the keyword is left as a comment).
#if nssv_HAVE_CONSTEXPR_11
|
|
# define nssv_constexpr constexpr
|
|
#else
|
|
# define nssv_constexpr /*constexpr*/
|
|
#endif
|
|
|
|
// nssv_constexpr14: expands to 'constexpr' only when C++14 constexpr is
// available; used on functions whose bodies contain more than a single return.
#if nssv_HAVE_CONSTEXPR_14
|
|
# define nssv_constexpr14 constexpr
|
|
#else
|
|
# define nssv_constexpr14 /*constexpr*/
|
|
#endif
|
|
|
|
// nssv_explicit: expands to 'explicit' when explicit conversion operators are
// available, otherwise to nothing.
#if nssv_HAVE_EXPLICIT_CONVERSION
|
|
# define nssv_explicit explicit
|
|
#else
|
|
# define nssv_explicit /*explicit*/
|
|
#endif
|
|
|
|
// nssv_inline_ns: expands to 'inline' (for inline namespaces) when they are
// available, otherwise to nothing.
#if nssv_HAVE_INLINE_NAMESPACE
|
|
# define nssv_inline_ns inline
|
|
#else
|
|
# define nssv_inline_ns /*inline*/
|
|
#endif
|
|
|
|
// nssv_noexcept: expands to 'noexcept' when the specifier is available,
// otherwise to nothing.
#if nssv_HAVE_NOEXCEPT
|
|
# define nssv_noexcept noexcept
|
|
#else
|
|
# define nssv_noexcept /*noexcept*/
|
|
#endif
|
|
|
|
//#if nssv_HAVE_REF_QUALIFIER
|
|
//# define nssv_ref_qual &
|
|
//# define nssv_refref_qual &&
|
|
//#else
|
|
//# define nssv_ref_qual /*&*/
|
|
//# define nssv_refref_qual /*&&*/
|
|
//#endif
|
|
|
|
// nssv_nullptr: 'nullptr' when the keyword is available, otherwise NULL.
#if nssv_HAVE_NULLPTR
|
|
# define nssv_nullptr nullptr
|
|
#else
|
|
# define nssv_nullptr NULL
|
|
#endif
|
|
|
|
// nssv_nodiscard: the [[nodiscard]] attribute when available, otherwise nothing.
#if nssv_HAVE_NODISCARD
|
|
# define nssv_nodiscard [[nodiscard]]
|
|
#else
|
|
# define nssv_nodiscard /*[[nodiscard]]*/
|
|
#endif
|
|
|
|
// Additional includes:
|
|
|
|
#include <algorithm>
|
|
#include <cassert>
|
|
#include <iterator>
|
|
#include <limits>
|
|
#include <string> // std::char_traits<>
|
|
|
|
#if ! nssv_CONFIG_NO_STREAM_INSERTION
|
|
# include <ostream>
|
|
#endif
|
|
|
|
#if ! nssv_CONFIG_NO_EXCEPTIONS
|
|
# include <stdexcept>
|
|
#endif
|
|
|
|
#if nssv_CPP11_OR_GREATER
|
|
# include <type_traits>
|
|
#endif
|
|
|
|
// Clang, GNUC, MSVC warning suppression macros:
|
|
|
|
// Silence literal-suffix diagnostics for the code that follows.
// On clang, "-Wreserved-user-defined-literal" is ignored BEFORE the push, so a
// later 'diagnostic pop' does not re-enable it; "-Wuser-defined-literals" is
// ignored after the push and is therefore restored by the pop.
// On GCC (nssv_COMPILER_GNUC_VERSION >= 480), "-Wliteral-suffix" is ignored
// after a push.
#if defined(__clang__)
|
|
# pragma clang diagnostic ignored "-Wreserved-user-defined-literal"
|
|
# pragma clang diagnostic push
|
|
# pragma clang diagnostic ignored "-Wuser-defined-literals"
|
|
#elif nssv_COMPILER_GNUC_VERSION >= 480
|
|
# pragma GCC diagnostic push
|
|
# pragma GCC diagnostic ignored "-Wliteral-suffix"
|
|
#endif // __clang__
|
|
|
|
// MSVC warning helpers, active for nssv_COMPILER_MSVC_VERSION >= 140 and
// expanding to nothing otherwise:
// - nssv_SUPPRESS_MSGSL_WARNING(expr): [[gsl::suppress(expr)]] attribute.
// - nssv_SUPPRESS_MSVC_WARNING(code, descr): warning(suppress: code) pragma;
//   'descr' is not used in the expansion.
// - nssv_DISABLE_MSVC_WARNINGS(codes): warning(push) followed by
//   warning(disable: codes).
#if nssv_COMPILER_MSVC_VERSION >= 140
|
|
# define nssv_SUPPRESS_MSGSL_WARNING(expr) [[gsl::suppress(expr)]]
|
|
# define nssv_SUPPRESS_MSVC_WARNING(code, descr) __pragma(warning(suppress: code) )
|
|
# define nssv_DISABLE_MSVC_WARNINGS(codes) __pragma(warning(push)) __pragma(warning(disable: codes))
|
|
#else
|
|
# define nssv_SUPPRESS_MSGSL_WARNING(expr)
|
|
# define nssv_SUPPRESS_MSVC_WARNING(code, descr)
|
|
# define nssv_DISABLE_MSVC_WARNINGS(codes)
|
|
#endif
|
|
|
|
// nssv_RESTORE_WARNINGS(): pops the diagnostic state pushed earlier, using the
// pop pragma of whichever compiler is in use (clang, GCC >= 480, MSVC >= 140);
// expands to nothing on any other compiler.
#if defined(__clang__)
|
|
# define nssv_RESTORE_WARNINGS() _Pragma("clang diagnostic pop")
|
|
#elif nssv_COMPILER_GNUC_VERSION >= 480
|
|
# define nssv_RESTORE_WARNINGS() _Pragma("GCC diagnostic pop")
|
|
#elif nssv_COMPILER_MSVC_VERSION >= 140
|
|
# define nssv_RESTORE_WARNINGS() __pragma(warning(pop ))
|
|
#else
|
|
# define nssv_RESTORE_WARNINGS()
|
|
#endif
|
|
|
|
// Suppress the following MSVC (GSL) warnings:
|
|
// - C4455, non-gsl : 'operator ""sv': literal suffix identifiers that do not
|
|
// start with an underscore are reserved
|
|
// - C26472, gsl::t.1 : don't use a static_cast for arithmetic conversions;
|
|
// use brace initialization, gsl::narrow_cast or gsl::narrow
|
|
// - C26481: gsl::b.1 : don't use pointer arithmetic. Use span instead
|
|
|
|
// Pushes the MSVC warning state and disables the three warnings described
// above (no-op on other compilers). NOTE(review): the matching pop is expected
// to come from nssv_RESTORE_WARNINGS() later in the file - confirm.
nssv_DISABLE_MSVC_WARNINGS( 4455 26481 26472 )
|
|
//nssv_DISABLE_CLANG_WARNINGS( "-Wuser-defined-literals" )
|
|
//nssv_DISABLE_GNUC_WARNINGS( -Wliteral-suffix )
|
|
|
|
namespace nonstd { namespace sv_lite {
|
|
|
|
//
|
|
// basic_string_view declaration:
|
|
//
|
|
|
|
// Forward declaration; this is the only place where the default for Traits
// (std::char_traits<CharT>) is specified.
template< class CharT, class Traits = std::char_traits<CharT> >
class basic_string_view;
|
|
|
|
namespace detail {
|
|
|
|
// support constexpr comparison in C++14;
|
|
// for C++17 and later, use provided traits:
|
|
|
|
template< typename CharT >
|
|
inline nssv_constexpr14 int compare( CharT const * s1, CharT const * s2, std::size_t count )
|
|
{
|
|
while ( count-- != 0 )
|
|
{
|
|
if ( *s1 < *s2 ) return -1;
|
|
if ( *s1 > *s2 ) return +1;
|
|
++s1; ++s2;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
#if nssv_HAVE_BUILTIN_MEMCMP
|
|
|
|
// specialization of compare() for char, see also generic compare() above:
|
|
|
|
// Non-template overload of compare() for plain char: forwards to
// nssv_BUILTIN_MEMCMP (either __builtin_memcmp or memcmp) for the first count
// bytes. The result is whatever that function returns: negative, zero or
// positive, not necessarily -1/0/+1 as in the generic template.
inline nssv_constexpr14 int compare( char const * s1, char const * s2, std::size_t count )
|
|
{
|
|
return nssv_BUILTIN_MEMCMP( s1, s2, count );
|
|
}
|
|
|
|
#endif
|
|
|
|
#if nssv_HAVE_BUILTIN_STRLEN
|
|
|
|
// specialization of length() for char, see also generic length() further below:
|
|
|
|
// Non-template overload of length() for plain char: forwards to
// nssv_BUILTIN_STRLEN (either __builtin_strlen or strlen), so s must point to
// a zero-terminated string.
inline nssv_constexpr std::size_t length( char const * s )
|
|
{
|
|
return nssv_BUILTIN_STRLEN( s );
|
|
}
|
|
|
|
#endif
|
|
|
|
#if defined(__OPTIMIZE__)
|
|
|
|
// gcc, clang provide __OPTIMIZE__
|
|
// Expect tail call optimization to make length() non-recursive:
|
|
|
|
template< typename CharT >
|
|
inline nssv_constexpr std::size_t length( CharT * s, std::size_t result = 0 )
|
|
{
|
|
return *s == '\0' ? result : length( s + 1, result + 1 );
|
|
}
|
|
|
|
#else // OPTIMIZE
|
|
|
|
// non-recursive:
|
|
|
|
template< typename CharT >
|
|
inline nssv_constexpr14 std::size_t length( CharT * s )
|
|
{
|
|
std::size_t result = 0;
|
|
while ( *s++ != '\0' )
|
|
{
|
|
++result;
|
|
}
|
|
return result;
|
|
}
|
|
|
|
#endif // OPTIMIZE
|
|
|
|
#if nssv_CPP11_OR_GREATER && ! nssv_CPP17_OR_GREATER
|
|
#if defined(__OPTIMIZE__)
|
|
|
|
// gcc, clang provide __OPTIMIZE__
|
|
// Expect tail call optimization to make search() non-recursive:
|
|
|
|
template< class CharT, class Traits = std::char_traits<CharT> >
|
|
constexpr const CharT* search( basic_string_view<CharT, Traits> haystack, basic_string_view<CharT, Traits> needle )
|
|
{
|
|
return haystack.starts_with( needle ) ? haystack.begin() :
|
|
haystack.empty() ? haystack.end() : search( haystack.substr(1), needle );
|
|
}
|
|
|
|
#else // OPTIMIZE
|
|
|
|
// non-recursive:
|
|
|
|
#if nssv_CONFIG_CONSTEXPR11_STD_SEARCH
|
|
|
|
template< class CharT, class Traits = std::char_traits<CharT> >
|
|
constexpr const CharT* search( basic_string_view<CharT, Traits> haystack, basic_string_view<CharT, Traits> needle )
|
|
{
|
|
return std::search( haystack.begin(), haystack.end(), needle.begin(), needle.end() );
|
|
}
|
|
|
|
#else // nssv_CONFIG_CONSTEXPR11_STD_SEARCH
|
|
|
|
template< class CharT, class Traits = std::char_traits<CharT> >
|
|
nssv_constexpr14 const CharT* search( basic_string_view<CharT, Traits> haystack, basic_string_view<CharT, Traits> needle )
|
|
{
|
|
while ( needle.size() <= haystack.size() )
|
|
{
|
|
if ( haystack.starts_with(needle) )
|
|
{
|
|
return haystack.cbegin();
|
|
}
|
|
haystack = basic_string_view<CharT, Traits>{ haystack.begin() + 1, haystack.size() - 1U };
|
|
}
|
|
return haystack.cend();
|
|
}
|
|
#endif // nssv_CONFIG_CONSTEXPR11_STD_SEARCH
|
|
|
|
#endif // OPTIMIZE
|
|
#endif // nssv_CPP11_OR_GREATER && ! nssv_CPP17_OR_GREATER
|
|
|
|
} // namespace detail
|
|
|
|
//
|
|
// basic_string_view:
|
|
//
|
|
|
|
template
|
|
<
|
|
class CharT,
|
|
class Traits /* = std::char_traits<CharT> */
|
|
>
|
|
class basic_string_view
|
|
{
|
|
public:
|
|
// Member types:
|
|
|
|
typedef Traits traits_type;
|
|
typedef CharT value_type;
|
|
|
|
typedef CharT * pointer;
|
|
typedef CharT const * const_pointer;
|
|
typedef CharT & reference;
|
|
typedef CharT const & const_reference;
|
|
|
|
typedef const_pointer iterator;
|
|
typedef const_pointer const_iterator;
|
|
typedef std::reverse_iterator< const_iterator > reverse_iterator;
|
|
typedef std::reverse_iterator< const_iterator > const_reverse_iterator;
|
|
|
|
typedef std::size_t size_type;
|
|
typedef std::ptrdiff_t difference_type;
|
|
|
|
// 24.4.2.1 Construction and assignment:
|
|
|
|
nssv_constexpr basic_string_view() nssv_noexcept
|
|
: data_( nssv_nullptr )
|
|
, size_( 0 )
|
|
{}
|
|
|
|
#if nssv_CPP11_OR_GREATER
|
|
nssv_constexpr basic_string_view( basic_string_view const & other ) nssv_noexcept = default;
|
|
#else
|
|
nssv_constexpr basic_string_view( basic_string_view const & other ) nssv_noexcept
|
|
: data_( other.data_)
|
|
, size_( other.size_)
|
|
{}
|
|
#endif
|
|
|
|
nssv_constexpr basic_string_view( CharT const * s, size_type count ) nssv_noexcept // non-standard noexcept
|
|
: data_( s )
|
|
, size_( count )
|
|
{}
|
|
|
|
nssv_constexpr basic_string_view( CharT const * s) nssv_noexcept // non-standard noexcept
|
|
: data_( s )
|
|
#if nssv_CPP17_OR_GREATER
|
|
, size_( Traits::length(s) )
|
|
#elif nssv_CPP11_OR_GREATER
|
|
, size_( detail::length(s) )
|
|
#else
|
|
, size_( Traits::length(s) )
|
|
#endif
|
|
{}
|
|
|
|
#if nssv_HAVE_NULLPTR
|
|
# if nssv_HAVE_IS_DELETE
|
|
nssv_constexpr basic_string_view( std::nullptr_t ) nssv_noexcept = delete;
|
|
# else
|
|
private: nssv_constexpr basic_string_view( std::nullptr_t ) nssv_noexcept; public:
|
|
# endif
|
|
#endif
|
|
|
|
// Assignment:
|
|
|
|
#if nssv_CPP11_OR_GREATER
|
|
nssv_constexpr14 basic_string_view & operator=( basic_string_view const & other ) nssv_noexcept = default;
|
|
#else
|
|
nssv_constexpr14 basic_string_view & operator=( basic_string_view const & other ) nssv_noexcept
|
|
{
|
|
data_ = other.data_;
|
|
size_ = other.size_;
|
|
return *this;
|
|
}
|
|
#endif
|
|
|
|
// 24.4.2.2 Iterator support:
|
|
|
|
nssv_constexpr const_iterator begin() const nssv_noexcept { return data_; }
|
|
nssv_constexpr const_iterator end() const nssv_noexcept { return data_ + size_; }
|
|
|
|
nssv_constexpr const_iterator cbegin() const nssv_noexcept { return begin(); }
|
|
nssv_constexpr const_iterator cend() const nssv_noexcept { return end(); }
|
|
|
|
nssv_constexpr const_reverse_iterator rbegin() const nssv_noexcept { return const_reverse_iterator( end() ); }
|
|
nssv_constexpr const_reverse_iterator rend() const nssv_noexcept { return const_reverse_iterator( begin() ); }
|
|
|
|
nssv_constexpr const_reverse_iterator crbegin() const nssv_noexcept { return rbegin(); }
|
|
nssv_constexpr const_reverse_iterator crend() const nssv_noexcept { return rend(); }
|
|
|
|
// 24.4.2.3 Capacity:
|
|
|
|
nssv_constexpr size_type size() const nssv_noexcept { return size_; }
|
|
nssv_constexpr size_type length() const nssv_noexcept { return size_; }
|
|
nssv_constexpr size_type max_size() const nssv_noexcept { return (std::numeric_limits< size_type >::max)(); }
|
|
|
|
// since C++20
|
|
nssv_nodiscard nssv_constexpr bool empty() const nssv_noexcept
|
|
{
|
|
return 0 == size_;
|
|
}
|
|
|
|
// 24.4.2.4 Element access:
|
|
|
|
nssv_constexpr const_reference operator[]( size_type pos ) const
|
|
{
|
|
return data_at( pos );
|
|
}
|
|
|
|
nssv_constexpr14 const_reference at( size_type pos ) const
|
|
{
|
|
#if nssv_CONFIG_NO_EXCEPTIONS
|
|
assert( pos < size() );
|
|
#else
|
|
if ( pos >= size() )
|
|
{
|
|
throw std::out_of_range("nonstd::string_view::at()");
|
|
}
|
|
#endif
|
|
return data_at( pos );
|
|
}
|
|
|
|
nssv_constexpr const_reference front() const { return data_at( 0 ); }
|
|
nssv_constexpr const_reference back() const { return data_at( size() - 1 ); }
|
|
|
|
nssv_constexpr const_pointer data() const nssv_noexcept { return data_; }
|
|
|
|
// 24.4.2.5 Modifiers:
|
|
|
|
nssv_constexpr14 void remove_prefix( size_type n )
|
|
{
|
|
assert( n <= size() );
|
|
data_ += n;
|
|
size_ -= n;
|
|
}
|
|
|
|
nssv_constexpr14 void remove_suffix( size_type n )
|
|
{
|
|
assert( n <= size() );
|
|
size_ -= n;
|
|
}
|
|
|
|
nssv_constexpr14 void swap( basic_string_view & other ) nssv_noexcept
|
|
{
|
|
const basic_string_view tmp(other);
|
|
other = *this;
|
|
*this = tmp;
|
|
}
|
|
|
|
// 24.4.2.6 String operations:
|
|
|
|
size_type copy( CharT * dest, size_type n, size_type pos = 0 ) const
|
|
{
|
|
#if nssv_CONFIG_NO_EXCEPTIONS
|
|
assert( pos <= size() );
|
|
#else
|
|
if ( pos > size() )
|
|
{
|
|
throw std::out_of_range("nonstd::string_view::copy()");
|
|
}
|
|
#endif
|
|
const size_type rlen = (std::min)( n, size() - pos );
|
|
|
|
(void) Traits::copy( dest, data() + pos, rlen );
|
|
|
|
return rlen;
|
|
}
|
|
|
|
nssv_constexpr14 basic_string_view substr( size_type pos = 0, size_type n = npos ) const
|
|
{
|
|
#if nssv_CONFIG_NO_EXCEPTIONS
|
|
assert( pos <= size() );
|
|
#else
|
|
if ( pos > size() )
|
|
{
|
|
throw std::out_of_range("nonstd::string_view::substr()");
|
|
}
|
|
#endif
|
|
return basic_string_view( data() + pos, (std::min)( n, size() - pos ) );
|
|
}
|
|
|
|
// compare(), 6x:
|
|
|
|
nssv_constexpr14 int compare( basic_string_view other ) const nssv_noexcept // (1)
|
|
{
|
|
#if nssv_CPP17_OR_GREATER
|
|
if ( const int result = Traits::compare( data(), other.data(), (std::min)( size(), other.size() ) ) )
|
|
#else
|
|
if ( const int result = detail::compare( data(), other.data(), (std::min)( size(), other.size() ) ) )
|
|
#endif
|
|
{
|
|
return result;
|
|
}
|
|
|
|
return size() == other.size() ? 0 : size() < other.size() ? -1 : 1;
|
|
}
|
|
|
|
nssv_constexpr int compare( size_type pos1, size_type n1, basic_string_view other ) const // (2)
|
|
{
|
|
return substr( pos1, n1 ).compare( other );
|
|
}
|
|
|
|
nssv_constexpr int compare( size_type pos1, size_type n1, basic_string_view other, size_type pos2, size_type n2 ) const // (3)
|
|
{
|
|
return substr( pos1, n1 ).compare( other.substr( pos2, n2 ) );
|
|
}
|
|
|
|
nssv_constexpr int compare( CharT const * s ) const // (4)
|
|
{
|
|
return compare( basic_string_view( s ) );
|
|
}
|
|
|
|
nssv_constexpr int compare( size_type pos1, size_type n1, CharT const * s ) const // (5)
|
|
{
|
|
return substr( pos1, n1 ).compare( basic_string_view( s ) );
|
|
}
|
|
|
|
nssv_constexpr int compare( size_type pos1, size_type n1, CharT const * s, size_type n2 ) const // (6)
|
|
{
|
|
return substr( pos1, n1 ).compare( basic_string_view( s, n2 ) );
|
|
}
|
|
|
|
// 24.4.2.7 Searching:
|
|
|
|
// starts_with(), 3x, since C++20:
|
|
|
|
nssv_constexpr bool starts_with( basic_string_view v ) const nssv_noexcept // (1)
|
|
{
|
|
return size() >= v.size() && compare( 0, v.size(), v ) == 0;
|
|
}
|
|
|
|
nssv_constexpr bool starts_with( CharT c ) const nssv_noexcept // (2)
|
|
{
|
|
return starts_with( basic_string_view( &c, 1 ) );
|
|
}
|
|
|
|
nssv_constexpr bool starts_with( CharT const * s ) const // (3)
|
|
{
|
|
return starts_with( basic_string_view( s ) );
|
|
}
|
|
|
|
// ends_with(), 3x, since C++20:
|
|
|
|
nssv_constexpr bool ends_with( basic_string_view v ) const nssv_noexcept // (1)
|
|
{
|
|
return size() >= v.size() && compare( size() - v.size(), npos, v ) == 0;
|
|
}
|
|
|
|
nssv_constexpr bool ends_with( CharT c ) const nssv_noexcept // (2)
|
|
{
|
|
return ends_with( basic_string_view( &c, 1 ) );
|
|
}
|
|
|
|
nssv_constexpr bool ends_with( CharT const * s ) const // (3)
|
|
{
|
|
return ends_with( basic_string_view( s ) );
|
|
}
|
|
|
|
// find(), 4x:
|
|
|
|
nssv_constexpr14 size_type find( basic_string_view v, size_type pos = 0 ) const nssv_noexcept // (1)
|
|
{
|
|
return assert( v.size() == 0 || v.data() != nssv_nullptr )
|
|
, pos >= size()
|
|
? npos : to_pos(
|
|
#if nssv_CPP11_OR_GREATER && ! nssv_CPP17_OR_GREATER
|
|
detail::search( substr(pos), v )
|
|
#else
|
|
std::search( cbegin() + pos, cend(), v.cbegin(), v.cend(), Traits::eq )
|
|
#endif
|
|
);
|
|
}
|
|
|
|
nssv_constexpr size_type find( CharT c, size_type pos = 0 ) const nssv_noexcept // (2)
|
|
{
|
|
return find( basic_string_view( &c, 1 ), pos );
|
|
}
|
|
|
|
nssv_constexpr size_type find( CharT const * s, size_type pos, size_type n ) const // (3)
|
|
{
|
|
return find( basic_string_view( s, n ), pos );
|
|
}
|
|
|
|
nssv_constexpr size_type find( CharT const * s, size_type pos = 0 ) const // (4)
|
|
{
|
|
return find( basic_string_view( s ), pos );
|
|
}
|
|
|
|
// rfind(), 4x:
|
|
|
|
nssv_constexpr14 size_type rfind( basic_string_view v, size_type pos = npos ) const nssv_noexcept // (1)
|
|
{
|
|
if ( size() < v.size() )
|
|
{
|
|
return npos;
|
|
}
|
|
|
|
if ( v.empty() )
|
|
{
|
|
return (std::min)( size(), pos );
|
|
}
|
|
|
|
const_iterator last = cbegin() + (std::min)( size() - v.size(), pos ) + v.size();
|
|
const_iterator result = std::find_end( cbegin(), last, v.cbegin(), v.cend(), Traits::eq );
|
|
|
|
return result != last ? size_type( result - cbegin() ) : npos;
|
|
}
|
|
|
|
nssv_constexpr14 size_type rfind( CharT c, size_type pos = npos ) const nssv_noexcept // (2)
|
|
{
|
|
return rfind( basic_string_view( &c, 1 ), pos );
|
|
}
|
|
|
|
nssv_constexpr14 size_type rfind( CharT const * s, size_type pos, size_type n ) const // (3)
|
|
{
|
|
return rfind( basic_string_view( s, n ), pos );
|
|
}
|
|
|
|
nssv_constexpr14 size_type rfind( CharT const * s, size_type pos = npos ) const // (4)
|
|
{
|
|
return rfind( basic_string_view( s ), pos );
|
|
}
|
|
|
|
// find_first_of(), 4x:
|
|
|
|
nssv_constexpr size_type find_first_of( basic_string_view v, size_type pos = 0 ) const nssv_noexcept // (1)
|
|
{
|
|
return pos >= size()
|
|
? npos
|
|
: to_pos( std::find_first_of( cbegin() + pos, cend(), v.cbegin(), v.cend(), Traits::eq ) );
|
|
}
|
|
|
|
nssv_constexpr size_type find_first_of( CharT c, size_type pos = 0 ) const nssv_noexcept // (2)
|
|
{
|
|
return find_first_of( basic_string_view( &c, 1 ), pos );
|
|
}
|
|
|
|
nssv_constexpr size_type find_first_of( CharT const * s, size_type pos, size_type n ) const // (3)
|
|
{
|
|
return find_first_of( basic_string_view( s, n ), pos );
|
|
}
|
|
|
|
nssv_constexpr size_type find_first_of( CharT const * s, size_type pos = 0 ) const // (4)
|
|
{
|
|
return find_first_of( basic_string_view( s ), pos );
|
|
}
|
|
|
|
// find_last_of(), 4x:
|
|
|
|
nssv_constexpr size_type find_last_of( basic_string_view v, size_type pos = npos ) const nssv_noexcept // (1)
|
|
{
|
|
return empty()
|
|
? npos
|
|
: pos >= size()
|
|
? find_last_of( v, size() - 1 )
|
|
: to_pos( std::find_first_of( const_reverse_iterator( cbegin() + pos + 1 ), crend(), v.cbegin(), v.cend(), Traits::eq ) );
|
|
}
|
|
|
|
nssv_constexpr size_type find_last_of( CharT c, size_type pos = npos ) const nssv_noexcept // (2)
|
|
{
|
|
return find_last_of( basic_string_view( &c, 1 ), pos );
|
|
}
|
|
|
|
nssv_constexpr size_type find_last_of( CharT const * s, size_type pos, size_type count ) const // (3)
|
|
{
|
|
return find_last_of( basic_string_view( s, count ), pos );
|
|
}
|
|
|
|
nssv_constexpr size_type find_last_of( CharT const * s, size_type pos = npos ) const // (4)
|
|
{
|
|
return find_last_of( basic_string_view( s ), pos );
|
|
}
|
|
|
|
// find_first_not_of(), 4x:
|
|
|
|
nssv_constexpr size_type find_first_not_of( basic_string_view v, size_type pos = 0 ) const nssv_noexcept // (1)
|
|
{
|
|
return pos >= size()
|
|
? npos
|
|
: to_pos( std::find_if( cbegin() + pos, cend(), not_in_view( v ) ) );
|
|
}
|
|
|
|
nssv_constexpr size_type find_first_not_of( CharT c, size_type pos = 0 ) const nssv_noexcept // (2)
|
|
{
|
|
return find_first_not_of( basic_string_view( &c, 1 ), pos );
|
|
}
|
|
|
|
nssv_constexpr size_type find_first_not_of( CharT const * s, size_type pos, size_type count ) const // (3)
|
|
{
|
|
return find_first_not_of( basic_string_view( s, count ), pos );
|
|
}
|
|
|
|
nssv_constexpr size_type find_first_not_of( CharT const * s, size_type pos = 0 ) const // (4)
|
|
{
|
|
return find_first_not_of( basic_string_view( s ), pos );
|
|
}
|
|
|
|
// find_last_not_of(), 4x:
|
|
|
|
nssv_constexpr size_type find_last_not_of( basic_string_view v, size_type pos = npos ) const nssv_noexcept // (1)
|
|
{
|
|
return empty()
|
|
? npos
|
|
: pos >= size()
|
|
? find_last_not_of( v, size() - 1 )
|
|
: to_pos( std::find_if( const_reverse_iterator( cbegin() + pos + 1 ), crend(), not_in_view( v ) ) );
|
|
}
|
|
|
|
nssv_constexpr size_type find_last_not_of( CharT c, size_type pos = npos ) const nssv_noexcept // (2)
|
|
{
|
|
return find_last_not_of( basic_string_view( &c, 1 ), pos );
|
|
}
|
|
|
|
nssv_constexpr size_type find_last_not_of( CharT const * s, size_type pos, size_type count ) const // (3)
|
|
{
|
|
return find_last_not_of( basic_string_view( s, count ), pos );
|
|
}
|
|
|
|
nssv_constexpr size_type find_last_not_of( CharT const * s, size_type pos = npos ) const // (4)
|
|
{
|
|
return find_last_not_of( basic_string_view( s ), pos );
|
|
}
|
|
|
|
// Constants:
|
|
|
|
#if nssv_CPP17_OR_GREATER
|
|
static nssv_constexpr size_type npos = size_type(-1);
|
|
#elif nssv_CPP11_OR_GREATER
|
|
enum : size_type { npos = size_type(-1) };
|
|
#else
|
|
enum { npos = size_type(-1) };
|
|
#endif
|
|
|
|
private:
|
|
struct not_in_view
|
|
{
|
|
const basic_string_view v;
|
|
|
|
nssv_constexpr explicit not_in_view( basic_string_view v_ ) : v( v_ ) {}
|
|
|
|
nssv_constexpr bool operator()( CharT c ) const
|
|
{
|
|
return npos == v.find_first_of( c );
|
|
}
|
|
};
|
|
|
|
nssv_constexpr size_type to_pos( const_iterator it ) const
|
|
{
|
|
return it == cend() ? npos : size_type( it - cbegin() );
|
|
}
|
|
|
|
nssv_constexpr size_type to_pos( const_reverse_iterator it ) const
|
|
{
|
|
return it == crend() ? npos : size_type( crend() - it - 1 );
|
|
}
|
|
|
|
nssv_constexpr const_reference data_at( size_type pos ) const
|
|
{
|
|
#if nssv_BETWEEN( nssv_COMPILER_GNUC_VERSION, 1, 500 )
|
|
return data_[pos];
|
|
#else
|
|
return assert( pos < size() ), data_[pos];
|
|
#endif
|
|
}
|
|
|
|
private:
|
|
const_pointer data_;
|
|
size_type size_;
|
|
|
|
public:
|
|
#if nssv_CONFIG_CONVERSION_STD_STRING_CLASS_METHODS
|
|
|
|
template< class Allocator >
|
|
basic_string_view( std::basic_string<CharT, Traits, Allocator> const & s ) nssv_noexcept
|
|
: data_( s.data() )
|
|
, size_( s.size() )
|
|
{}
|
|
|
|
#if nssv_HAVE_EXPLICIT_CONVERSION
|
|
|
|
template< class Allocator >
|
|
explicit operator std::basic_string<CharT, Traits, Allocator>() const
|
|
{
|
|
return to_string( Allocator() );
|
|
}
|
|
|
|
#endif // nssv_HAVE_EXPLICIT_CONVERSION
|
|
|
|
#if nssv_CPP11_OR_GREATER
|
|
|
|
template< class Allocator = std::allocator<CharT> >
|
|
std::basic_string<CharT, Traits, Allocator>
|
|
to_string( Allocator const & a = Allocator() ) const
|
|
{
|
|
return std::basic_string<CharT, Traits, Allocator>( begin(), end(), a );
|
|
}
|
|
|
|
#else
|
|
|
|
std::basic_string<CharT, Traits>
|
|
to_string() const
|
|
{
|
|
return std::basic_string<CharT, Traits>( begin(), end() );
|
|
}
|
|
|
|
template< class Allocator >
|
|
std::basic_string<CharT, Traits, Allocator>
|
|
to_string( Allocator const & a ) const
|
|
{
|
|
return std::basic_string<CharT, Traits, Allocator>( begin(), end(), a );
|
|
}
|
|
|
|
#endif // nssv_CPP11_OR_GREATER
|
|
|
|
#endif // nssv_CONFIG_CONVERSION_STD_STRING_CLASS_METHODS
|
|
};
|
|
|
|
//
|
|
// Non-member functions:
|
|
//
|
|
|
|
// 24.4.3 Non-member comparison functions:
|
|
// lexicographically compare two string views (function template):
|
|
|
|
template< class CharT, class Traits >
|
|
nssv_constexpr bool operator== (
|
|
basic_string_view <CharT, Traits> lhs,
|
|
basic_string_view <CharT, Traits> rhs ) nssv_noexcept
|
|
{ return lhs.size() == rhs.size() && lhs.compare( rhs ) == 0; }
|
|
|
|
template< class CharT, class Traits >
|
|
nssv_constexpr bool operator!= (
|
|
basic_string_view <CharT, Traits> lhs,
|
|
basic_string_view <CharT, Traits> rhs ) nssv_noexcept
|
|
{ return !( lhs == rhs ); }
|
|
|
|
template< class CharT, class Traits >
|
|
nssv_constexpr bool operator< (
|
|
basic_string_view <CharT, Traits> lhs,
|
|
basic_string_view <CharT, Traits> rhs ) nssv_noexcept
|
|
{ return lhs.compare( rhs ) < 0; }
|
|
|
|
template< class CharT, class Traits >
|
|
nssv_constexpr bool operator<= (
|
|
basic_string_view <CharT, Traits> lhs,
|
|
basic_string_view <CharT, Traits> rhs ) nssv_noexcept
|
|
{ return lhs.compare( rhs ) <= 0; }
|
|
|
|
template< class CharT, class Traits >
|
|
nssv_constexpr bool operator> (
|
|
basic_string_view <CharT, Traits> lhs,
|
|
basic_string_view <CharT, Traits> rhs ) nssv_noexcept
|
|
{ return lhs.compare( rhs ) > 0; }
|
|
|
|
template< class CharT, class Traits >
|
|
nssv_constexpr bool operator>= (
|
|
basic_string_view <CharT, Traits> lhs,
|
|
basic_string_view <CharT, Traits> rhs ) nssv_noexcept
|
|
{ return lhs.compare( rhs ) >= 0; }
|
|
|
|
// Let S be basic_string_view<CharT, Traits>, and sv be an instance of S.
|
|
// Implementations shall provide sufficient additional overloads marked
|
|
// constexpr and noexcept so that an object t with an implicit conversion
|
|
// to S can be compared according to Table 67.
|
|
|
|
#if ! nssv_CPP11_OR_GREATER || nssv_BETWEEN( nssv_COMPILER_MSVC_VERSION, 100, 141 )
|
|
|
|
// accommodate for older compilers:
|
|
|
|
// ==
|
|
|
|
template< class CharT, class Traits>
|
|
nssv_constexpr bool operator==(
|
|
basic_string_view<CharT, Traits> lhs,
|
|
CharT const * rhs ) nssv_noexcept
|
|
{ return lhs.size() == detail::length( rhs ) && lhs.compare( rhs ) == 0; }
|
|
|
|
template< class CharT, class Traits>
|
|
nssv_constexpr bool operator==(
|
|
CharT const * lhs,
|
|
basic_string_view<CharT, Traits> rhs ) nssv_noexcept
|
|
{ return detail::length( lhs ) == rhs.size() && rhs.compare( lhs ) == 0; }
|
|
|
|
template< class CharT, class Traits>
|
|
nssv_constexpr bool operator==(
|
|
basic_string_view<CharT, Traits> lhs,
|
|
std::basic_string<CharT, Traits> rhs ) nssv_noexcept
|
|
{ return lhs.size() == rhs.size() && lhs.compare( rhs ) == 0; }
|
|
|
|
template< class CharT, class Traits>
|
|
nssv_constexpr bool operator==(
|
|
std::basic_string<CharT, Traits> rhs,
|
|
basic_string_view<CharT, Traits> lhs ) nssv_noexcept
|
|
{ return lhs.size() == rhs.size() && lhs.compare( rhs ) == 0; }
|
|
|
|
// !=
|
|
|
|
template< class CharT, class Traits>
|
|
nssv_constexpr bool operator!=(
|
|
basic_string_view<CharT, Traits> lhs,
|
|
CharT const * rhs ) nssv_noexcept
|
|
{ return !( lhs == rhs ); }
|
|
|
|
template< class CharT, class Traits>
|
|
nssv_constexpr bool operator!=(
|
|
CharT const * lhs,
|
|
basic_string_view<CharT, Traits> rhs ) nssv_noexcept
|
|
{ return !( lhs == rhs ); }
|
|
|
|
template< class CharT, class Traits>
|
|
nssv_constexpr bool operator!=(
|
|
basic_string_view<CharT, Traits> lhs,
|
|
std::basic_string<CharT, Traits> rhs ) nssv_noexcept
|
|
{ return !( lhs == rhs ); }
|
|
|
|
template< class CharT, class Traits>
|
|
nssv_constexpr bool operator!=(
|
|
std::basic_string<CharT, Traits> rhs,
|
|
basic_string_view<CharT, Traits> lhs ) nssv_noexcept
|
|
{ return !( lhs == rhs ); }
|
|
|
|
// <
|
|
|
|
template< class CharT, class Traits>
|
|
nssv_constexpr bool operator<(
|
|
basic_string_view<CharT, Traits> lhs,
|
|
CharT const * rhs ) nssv_noexcept
|
|
{ return lhs.compare( rhs ) < 0; }
|
|
|
|
template< class CharT, class Traits>
|
|
nssv_constexpr bool operator<(
|
|
CharT const * lhs,
|
|
basic_string_view<CharT, Traits> rhs ) nssv_noexcept
|
|
{ return rhs.compare( lhs ) > 0; }
|
|
|
|
template< class CharT, class Traits>
|
|
nssv_constexpr bool operator<(
|
|
basic_string_view<CharT, Traits> lhs,
|
|
std::basic_string<CharT, Traits> rhs ) nssv_noexcept
|
|
{ return lhs.compare( rhs ) < 0; }
|
|
|
|
template< class CharT, class Traits>
|
|
nssv_constexpr bool operator<(
|
|
std::basic_string<CharT, Traits> rhs,
|
|
basic_string_view<CharT, Traits> lhs ) nssv_noexcept
|
|
{ return rhs.compare( lhs ) > 0; }
|
|
|
|
// <=
|
|
|
|
template< class CharT, class Traits>
|
|
nssv_constexpr bool operator<=(
|
|
basic_string_view<CharT, Traits> lhs,
|
|
CharT const * rhs ) nssv_noexcept
|
|
{ return lhs.compare( rhs ) <= 0; }
|
|
|
|
template< class CharT, class Traits>
|
|
nssv_constexpr bool operator<=(
|
|
CharT const * lhs,
|
|
basic_string_view<CharT, Traits> rhs ) nssv_noexcept
|
|
{ return rhs.compare( lhs ) >= 0; }
|
|
|
|
template< class CharT, class Traits>
|
|
nssv_constexpr bool operator<=(
|
|
basic_string_view<CharT, Traits> lhs,
|
|
std::basic_string<CharT, Traits> rhs ) nssv_noexcept
|
|
{ return lhs.compare( rhs ) <= 0; }
|
|
|
|
template< class CharT, class Traits>
|
|
nssv_constexpr bool operator<=(
|
|
std::basic_string<CharT, Traits> rhs,
|
|
basic_string_view<CharT, Traits> lhs ) nssv_noexcept
|
|
{ return rhs.compare( lhs ) >= 0; }
|
|
|
|
// >
|
|
|
|
template< class CharT, class Traits>
|
|
nssv_constexpr bool operator>(
|
|
basic_string_view<CharT, Traits> lhs,
|
|
CharT const * rhs ) nssv_noexcept
|
|
{ return lhs.compare( rhs ) > 0; }
|
|
|
|
template< class CharT, class Traits>
|
|
nssv_constexpr bool operator>(
|
|
CharT const * lhs,
|
|
basic_string_view<CharT, Traits> rhs ) nssv_noexcept
|
|
{ return rhs.compare( lhs ) < 0; }
|
|
|
|
template< class CharT, class Traits>
|
|
nssv_constexpr bool operator>(
|
|
basic_string_view<CharT, Traits> lhs,
|
|
std::basic_string<CharT, Traits> rhs ) nssv_noexcept
|
|
{ return lhs.compare( rhs ) > 0; }
|
|
|
|
template< class CharT, class Traits>
|
|
nssv_constexpr bool operator>(
|
|
std::basic_string<CharT, Traits> rhs,
|
|
basic_string_view<CharT, Traits> lhs ) nssv_noexcept
|
|
{ return rhs.compare( lhs ) < 0; }
|
|
|
|
// >=
|
|
|
|
template< class CharT, class Traits>
|
|
nssv_constexpr bool operator>=(
|
|
basic_string_view<CharT, Traits> lhs,
|
|
CharT const * rhs ) nssv_noexcept
|
|
{ return lhs.compare( rhs ) >= 0; }
|
|
|
|
template< class CharT, class Traits>
|
|
nssv_constexpr bool operator>=(
|
|
CharT const * lhs,
|
|
basic_string_view<CharT, Traits> rhs ) nssv_noexcept
|
|
{ return rhs.compare( lhs ) <= 0; }
|
|
|
|
template< class CharT, class Traits>
|
|
nssv_constexpr bool operator>=(
|
|
basic_string_view<CharT, Traits> lhs,
|
|
std::basic_string<CharT, Traits> rhs ) nssv_noexcept
|
|
{ return lhs.compare( rhs ) >= 0; }
|
|
|
|
template< class CharT, class Traits>
|
|
nssv_constexpr bool operator>=(
|
|
std::basic_string<CharT, Traits> rhs,
|
|
basic_string_view<CharT, Traits> lhs ) nssv_noexcept
|
|
{ return rhs.compare( lhs ) <= 0; }
|
|
|
|
#else // newer compilers:
|
|
|
|
// Spells basic_string_view<T,U> through std::decay<>::type. As a nested
// 'typename ...::type' it is a non-deduced context, so T and U are deduced
// from the other operand only and this operand may be implicitly converted.
#define nssv_BASIC_STRING_VIEW_I(T,U) typename std::decay< basic_string_view<T,U> >::type
|
|
|
|
// nssv_MSVC_ORDER(x): with _MSC_VER defined, appends an unnamed defaulted
// template parameter ', int=x' to a template parameter list; elsewhere it
// expands to nothing. NOTE(review): presumably this gives the two overloads of
// each operator distinct template signatures on MSVC ("issue 40") - confirm.
#if defined(_MSC_VER) // issue 40
|
|
# define nssv_MSVC_ORDER(x) , int=x
|
|
#else
|
|
# define nssv_MSVC_ORDER(x) /*, int=x*/
|
|
#endif
|
|
|
|
// ==
|
|
|
|
template< class CharT, class Traits nssv_MSVC_ORDER(1) >
|
|
nssv_constexpr bool operator==(
|
|
basic_string_view <CharT, Traits> lhs,
|
|
nssv_BASIC_STRING_VIEW_I(CharT, Traits) rhs ) nssv_noexcept
|
|
{ return lhs.size() == rhs.size() && lhs.compare( rhs ) == 0; }
|
|
|
|
template< class CharT, class Traits nssv_MSVC_ORDER(2) >
|
|
nssv_constexpr bool operator==(
|
|
nssv_BASIC_STRING_VIEW_I(CharT, Traits) lhs,
|
|
basic_string_view <CharT, Traits> rhs ) nssv_noexcept
|
|
{ return lhs.size() == rhs.size() && lhs.compare( rhs ) == 0; }
|
|
|
|
// !=
|
|
|
|
template< class CharT, class Traits nssv_MSVC_ORDER(1) >
|
|
nssv_constexpr bool operator!= (
|
|
basic_string_view < CharT, Traits > lhs,
|
|
nssv_BASIC_STRING_VIEW_I( CharT, Traits ) rhs ) nssv_noexcept
|
|
{ return !( lhs == rhs ); }
|
|
|
|
template< class CharT, class Traits nssv_MSVC_ORDER(2) >
|
|
nssv_constexpr bool operator!= (
|
|
nssv_BASIC_STRING_VIEW_I( CharT, Traits ) lhs,
|
|
basic_string_view < CharT, Traits > rhs ) nssv_noexcept
|
|
{ return !( lhs == rhs ); }
|
|
|
|
// <
|
|
|
|
template< class CharT, class Traits nssv_MSVC_ORDER(1) >
|
|
nssv_constexpr bool operator< (
|
|
basic_string_view < CharT, Traits > lhs,
|
|
nssv_BASIC_STRING_VIEW_I( CharT, Traits ) rhs ) nssv_noexcept
|
|
{ return lhs.compare( rhs ) < 0; }
|
|
|
|
template< class CharT, class Traits nssv_MSVC_ORDER(2) >
|
|
nssv_constexpr bool operator< (
|
|
nssv_BASIC_STRING_VIEW_I( CharT, Traits ) lhs,
|
|
basic_string_view < CharT, Traits > rhs ) nssv_noexcept
|
|
{ return lhs.compare( rhs ) < 0; }
|
|
|
|
// <=
|
|
|
|
template< class CharT, class Traits nssv_MSVC_ORDER(1) >
|
|
nssv_constexpr bool operator<= (
|
|
basic_string_view < CharT, Traits > lhs,
|
|
nssv_BASIC_STRING_VIEW_I( CharT, Traits ) rhs ) nssv_noexcept
|
|
{ return lhs.compare( rhs ) <= 0; }
|
|
|
|
template< class CharT, class Traits nssv_MSVC_ORDER(2) >
|
|
nssv_constexpr bool operator<= (
|
|
nssv_BASIC_STRING_VIEW_I( CharT, Traits ) lhs,
|
|
basic_string_view < CharT, Traits > rhs ) nssv_noexcept
|
|
{ return lhs.compare( rhs ) <= 0; }
|
|
|
|
// >
|
|
|
|
template< class CharT, class Traits nssv_MSVC_ORDER(1) >
|
|
nssv_constexpr bool operator> (
|
|
basic_string_view < CharT, Traits > lhs,
|
|
nssv_BASIC_STRING_VIEW_I( CharT, Traits ) rhs ) nssv_noexcept
|
|
{ return lhs.compare( rhs ) > 0; }
|
|
|
|
template< class CharT, class Traits nssv_MSVC_ORDER(2) >
|
|
nssv_constexpr bool operator> (
|
|
nssv_BASIC_STRING_VIEW_I( CharT, Traits ) lhs,
|
|
basic_string_view < CharT, Traits > rhs ) nssv_noexcept
|
|
{ return lhs.compare( rhs ) > 0; }
|
|
|
|
// >=
|
|
|
|
template< class CharT, class Traits nssv_MSVC_ORDER(1) >
|
|
nssv_constexpr bool operator>= (
|
|
basic_string_view < CharT, Traits > lhs,
|
|
nssv_BASIC_STRING_VIEW_I( CharT, Traits ) rhs ) nssv_noexcept
|
|
{ return lhs.compare( rhs ) >= 0; }
|
|
|
|
template< class CharT, class Traits nssv_MSVC_ORDER(2) >
|
|
nssv_constexpr bool operator>= (
|
|
nssv_BASIC_STRING_VIEW_I( CharT, Traits ) lhs,
|
|
basic_string_view < CharT, Traits > rhs ) nssv_noexcept
|
|
{ return lhs.compare( rhs ) >= 0; }
|
|
|
|
#undef nssv_MSVC_ORDER
|
|
#undef nssv_BASIC_STRING_VIEW_I
|
|
|
|
#endif // compiler-dependent approach to comparisons
|
|
|
|
// 24.4.4 Inserters and extractors:
|
|
|
|
#if ! nssv_CONFIG_NO_STREAM_INSERTION
|
|
|
|
namespace detail {

// Write `n` copies of the stream's current fill character directly into its
// stream buffer. Nothing is written when `n` is zero or negative.
template< class Stream >
void write_padding( Stream & os, std::streamsize n )
{
    for ( std::streamsize i = 0; i < n; ++i )
        os.rdbuf()->sputc( os.fill() );
}

// Insert the characters of `sv` into `os`, honouring os.width() and os.fill().
//
// View must provide length() and a begin() usable as the pointer argument of
// the stream buffer's sputn().
//
// Padding placement follows the rule the standard library uses for
// std::basic_string / std::basic_string_view insertion: the fill characters go
// AFTER the text only when adjustfield is `left`; in every other case
// (`right`, `internal`, or no adjustfield bit set, which is the default state
// of a stream) they go BEFORE the text. The previous code padded in front only
// when adjustfield was exactly `right`, so a stream with default flags
// produced left-justified output, unlike std::string_view.
//
// Returns `os`. The field width is reset to 0 after a write; if the sentry
// fails, nothing is written and the width is left untouched.
template< class Stream, class View >
Stream & write_to_stream( Stream & os, View const & sv )
{
    typename Stream::sentry sentry( os );

    if ( !sentry )
        return os;

    const std::streamsize length = static_cast<std::streamsize>( sv.length() );

    // Whether, and how, to pad:
    const bool      pad = ( length < os.width() );
    const bool left_pad = pad && ( os.flags() & std::ios_base::adjustfield ) != std::ios_base::left;

    if ( left_pad )
        write_padding( os, os.width() - length );

    // Write span characters:
    os.rdbuf()->sputn( sv.begin(), length );

    if ( pad && !left_pad )
        write_padding( os, os.width() - length );

    // Reset output stream width:
    os.width( 0 );

    return os;
}

} // namespace detail
|
|
|
|
template< class CharT, class Traits >
|
|
std::basic_ostream<CharT, Traits> &
|
|
operator<<(
|
|
std::basic_ostream<CharT, Traits>& os,
|
|
basic_string_view <CharT, Traits> sv )
|
|
{
|
|
return detail::write_to_stream( os, sv );
|
|
}
|
|
|
|
#endif // nssv_CONFIG_NO_STREAM_INSERTION
|
|
|
|
// Several typedefs for common character types are provided:
|
|
|
|
typedef basic_string_view<char> string_view;
|
|
typedef basic_string_view<wchar_t> wstring_view;
|
|
#if nssv_HAVE_WCHAR16_T
|
|
typedef basic_string_view<char16_t> u16string_view;
|
|
typedef basic_string_view<char32_t> u32string_view;
|
|
#endif
|
|
|
|
}} // namespace nonstd::sv_lite
|
|
|
|
//
|
|
// 24.4.6 Suffix for basic_string_view literals:
|
|
//
|
|
|
|
#if nssv_HAVE_USER_DEFINED_LITERALS
|
|
|
|
namespace nonstd {
|
|
nssv_inline_ns namespace literals {
|
|
nssv_inline_ns namespace string_view_literals {
|
|
|
|
#if nssv_CONFIG_STD_SV_OPERATOR && nssv_HAVE_STD_DEFINED_LITERALS
|
|
|
|
// (1) "sv" suffix on a narrow literal: a string_view over `len` chars starting at `str`.
nssv_constexpr nonstd::sv_lite::string_view operator "" sv( const char* str, size_t len ) nssv_noexcept
{
    return nonstd::sv_lite::string_view( str, len );
}
|
|
|
|
// (2) "sv" suffix on a char16_t literal: a u16string_view over `len` units starting at `str`.
nssv_constexpr nonstd::sv_lite::u16string_view operator "" sv( const char16_t* str, size_t len ) nssv_noexcept
{
    return nonstd::sv_lite::u16string_view( str, len );
}
|
|
|
|
// (3) "sv" suffix on a char32_t literal: a u32string_view over `len` units starting at `str`.
nssv_constexpr nonstd::sv_lite::u32string_view operator "" sv( const char32_t* str, size_t len ) nssv_noexcept
{
    return nonstd::sv_lite::u32string_view( str, len );
}
|
|
|
|
// (4) "sv" suffix on a wide literal: a wstring_view over `len` wchar_t starting at `str`.
nssv_constexpr nonstd::sv_lite::wstring_view operator "" sv( const wchar_t* str, size_t len ) nssv_noexcept
{
    return nonstd::sv_lite::wstring_view( str, len );
}
|
|
|
|
#endif // nssv_CONFIG_STD_SV_OPERATOR && nssv_HAVE_STD_DEFINED_LITERALS
|
|
|
|
#if nssv_CONFIG_USR_SV_OPERATOR
|
|
|
|
// (1) "_sv" suffix on a narrow literal: a string_view over `len` chars starting at `str`.
nssv_constexpr nonstd::sv_lite::string_view operator "" _sv( const char* str, size_t len ) nssv_noexcept
{
    return nonstd::sv_lite::string_view( str, len );
}
|
|
|
|
// (2) "_sv" suffix on a char16_t literal: a u16string_view over `len` units starting at `str`.
nssv_constexpr nonstd::sv_lite::u16string_view operator "" _sv( const char16_t* str, size_t len ) nssv_noexcept
{
    return nonstd::sv_lite::u16string_view( str, len );
}
|
|
|
|
// (3) "_sv" suffix on a char32_t literal: a u32string_view over `len` units starting at `str`.
nssv_constexpr nonstd::sv_lite::u32string_view operator "" _sv( const char32_t* str, size_t len ) nssv_noexcept
{
    return nonstd::sv_lite::u32string_view( str, len );
}
|
|
|
|
// (4) "_sv" suffix on a wide literal: a wstring_view over `len` wchar_t starting at `str`.
nssv_constexpr nonstd::sv_lite::wstring_view operator "" _sv( const wchar_t* str, size_t len ) nssv_noexcept
{
    return nonstd::sv_lite::wstring_view( str, len );
}
|
|
|
|
#endif // nssv_CONFIG_USR_SV_OPERATOR
|
|
|
|
}}} // namespace nonstd::literals::string_view_literals
|
|
|
|
#endif
|
|
|
|
//
|
|
// Extensions for std::string:
|
|
//
|
|
|
|
#if nssv_CONFIG_CONVERSION_STD_STRING_FREE_FUNCTIONS
|
|
|
|
namespace nonstd {
|
|
namespace sv_lite {
|
|
|
|
// Exclude MSVC 14 (19.00): it yields ambiguous to_string():
|
|
|
|
#if nssv_CPP11_OR_GREATER && nssv_COMPILER_MSVC_VERSION != 140
|
|
|
|
template< class CharT, class Traits, class Allocator = std::allocator<CharT> >
|
|
std::basic_string<CharT, Traits, Allocator>
|
|
to_string( basic_string_view<CharT, Traits> v, Allocator const & a = Allocator() )
|
|
{
|
|
return std::basic_string<CharT,Traits, Allocator>( v.begin(), v.end(), a );
|
|
}
|
|
|
|
#else
|
|
|
|
template< class CharT, class Traits >
|
|
std::basic_string<CharT, Traits>
|
|
to_string( basic_string_view<CharT, Traits> v )
|
|
{
|
|
return std::basic_string<CharT, Traits>( v.begin(), v.end() );
|
|
}
|
|
|
|
template< class CharT, class Traits, class Allocator >
|
|
std::basic_string<CharT, Traits, Allocator>
|
|
to_string( basic_string_view<CharT, Traits> v, Allocator const & a )
|
|
{
|
|
return std::basic_string<CharT, Traits, Allocator>( v.begin(), v.end(), a );
|
|
}
|
|
|
|
#endif // nssv_CPP11_OR_GREATER
|
|
|
|
template< class CharT, class Traits, class Allocator >
|
|
basic_string_view<CharT, Traits>
|
|
to_string_view( std::basic_string<CharT, Traits, Allocator> const & s )
|
|
{
|
|
return basic_string_view<CharT, Traits>( s.data(), s.size() );
|
|
}
|
|
|
|
}} // namespace nonstd::sv_lite
|
|
|
|
#endif // nssv_CONFIG_CONVERSION_STD_STRING_FREE_FUNCTIONS
|
|
|
|
//
|
|
// make types and algorithms available in namespace nonstd:
|
|
//
|
|
|
|
namespace nonstd {
|
|
|
|
using sv_lite::basic_string_view;
|
|
using sv_lite::string_view;
|
|
using sv_lite::wstring_view;
|
|
|
|
#if nssv_HAVE_WCHAR16_T
|
|
using sv_lite::u16string_view;
|
|
#endif
|
|
#if nssv_HAVE_WCHAR32_T
|
|
using sv_lite::u32string_view;
|
|
#endif
|
|
|
|
// literal "sv"
|
|
|
|
using sv_lite::operator==;
|
|
using sv_lite::operator!=;
|
|
using sv_lite::operator<;
|
|
using sv_lite::operator<=;
|
|
using sv_lite::operator>;
|
|
using sv_lite::operator>=;
|
|
|
|
#if ! nssv_CONFIG_NO_STREAM_INSERTION
|
|
using sv_lite::operator<<;
|
|
#endif
|
|
|
|
#if nssv_CONFIG_CONVERSION_STD_STRING_FREE_FUNCTIONS
|
|
using sv_lite::to_string;
|
|
using sv_lite::to_string_view;
|
|
#endif
|
|
|
|
} // namespace nonstd
|
|
|
|
// 24.4.5 Hash support (C++11):
|
|
|
|
// Note: The hash value of a string view object is equal to the hash value of
|
|
// the corresponding string object.
|
|
|
|
#if nssv_HAVE_STD_HASH
|
|
|
|
#include <functional>
|
|
|
|
namespace std {
|
|
|
|
template<>
|
|
struct hash< nonstd::string_view >
|
|
{
|
|
public:
|
|
std::size_t operator()( nonstd::string_view v ) const nssv_noexcept
|
|
{
|
|
return std::hash<std::string>()( std::string( v.data(), v.size() ) );
|
|
}
|
|
};
|
|
|
|
template<>
|
|
struct hash< nonstd::wstring_view >
|
|
{
|
|
public:
|
|
std::size_t operator()( nonstd::wstring_view v ) const nssv_noexcept
|
|
{
|
|
return std::hash<std::wstring>()( std::wstring( v.data(), v.size() ) );
|
|
}
|
|
};
|
|
|
|
template<>
|
|
struct hash< nonstd::u16string_view >
|
|
{
|
|
public:
|
|
std::size_t operator()( nonstd::u16string_view v ) const nssv_noexcept
|
|
{
|
|
return std::hash<std::u16string>()( std::u16string( v.data(), v.size() ) );
|
|
}
|
|
};
|
|
|
|
template<>
|
|
struct hash< nonstd::u32string_view >
|
|
{
|
|
public:
|
|
std::size_t operator()( nonstd::u32string_view v ) const nssv_noexcept
|
|
{
|
|
return std::hash<std::u32string>()( std::u32string( v.data(), v.size() ) );
|
|
}
|
|
};
|
|
|
|
} // namespace std
|
|
|
|
#endif // nssv_HAVE_STD_HASH
|
|
|
|
nssv_RESTORE_WARNINGS()
|
|
|
|
#endif // nssv_HAVE_STD_STRING_VIEW
|
|
#endif // NONSTD_SV_LITE_H_INCLUDED
|
|
/* end file simdjson/nonstd/string_view.hpp */
|
|
SIMDJSON_POP_DISABLE_WARNINGS
|
|
|
|
namespace std {
|
|
using string_view = nonstd::string_view;
|
|
}
|
|
#endif // SIMDJSON_HAS_STRING_VIEW
|
|
#undef SIMDJSON_HAS_STRING_VIEW // We are not going to need this macro anymore.
|
|
|
|
/// If EXPR is an error, returns it.
|
|
#define SIMDJSON_TRY(EXPR) { auto _err = (EXPR); if (_err) { return _err; } }
|
|
|
|
// Unless the programmer has already set SIMDJSON_DEVELOPMENT_CHECKS,
|
|
// we want to set it under debug builds. We detect a debug build
|
|
// under Visual Studio when the _DEBUG macro is set. Under the other
|
|
// compilers, we use the fact that they define __OPTIMIZE__ whenever
|
|
// they allow optimizations.
|
|
// It is possible that this could miss some cases where SIMDJSON_DEVELOPMENT_CHECKS
|
|
// is helpful, but the programmer can set the macro SIMDJSON_DEVELOPMENT_CHECKS.
|
|
// It could also wrongly set SIMDJSON_DEVELOPMENT_CHECKS (e.g., if the programmer
|
|
// sets _DEBUG in a release build under Visual Studio, or if some compiler fails to
|
|
// set the __OPTIMIZE__ macro).
|
|
#ifndef SIMDJSON_DEVELOPMENT_CHECKS
|
|
#ifdef _MSC_VER
|
|
// Visual Studio seems to set _DEBUG for debug builds.
|
|
#ifdef _DEBUG
|
|
#define SIMDJSON_DEVELOPMENT_CHECKS 1
|
|
#endif // _DEBUG
|
|
#else // _MSC_VER
|
|
// All other compilers appear to set __OPTIMIZE__ to a positive integer
|
|
// when the compiler is optimizing.
|
|
#ifndef __OPTIMIZE__
|
|
#define SIMDJSON_DEVELOPMENT_CHECKS 1
|
|
#endif // __OPTIMIZE__
|
|
#endif // _MSC_VER
|
|
#endif // SIMDJSON_DEVELOPMENT_CHECKS
|
|
|
|
// The SIMDJSON_CHECK_EOF macro is a feature flag for the "don't require padding"
|
|
// feature.
|
|
|
|
#if SIMDJSON_CPLUSPLUS17
|
|
// if we have C++17, then [[fallthrough]] is a standard attribute
|
|
# define simdjson_fallthrough [[fallthrough]]
|
|
// check if we have __attribute__ support
|
|
#elif defined(__has_attribute)
|
|
// check if we have the __fallthrough__ attribute
|
|
#if __has_attribute(__fallthrough__)
|
|
// we are good to go:
|
|
# define simdjson_fallthrough __attribute__((__fallthrough__))
|
|
#endif // __has_attribute(__fallthrough__)
|
|
#endif // SIMDJSON_CPLUSPLUS17
|
|
// on some systems, we simply do not have support for fallthrough, so use a default:
|
|
#ifndef simdjson_fallthrough
|
|
# define simdjson_fallthrough do {} while (0) /* fallthrough */
|
|
#endif // simdjson_fallthrough
|
|
|
|
#if SIMDJSON_DEVELOPMENT_CHECKS
|
|
#define SIMDJSON_DEVELOPMENT_ASSERT(expr) do { assert ((expr)); } while (0)
|
|
#else
|
|
#define SIMDJSON_DEVELOPMENT_ASSERT(expr) do { } while (0)
|
|
#endif
|
|
|
|
#ifndef SIMDJSON_UTF8VALIDATION
|
|
#define SIMDJSON_UTF8VALIDATION 1
|
|
#endif
|
|
|
|
#ifdef __has_include
|
|
// How do we detect that a compiler supports vbmi2?
|
|
// For sure if the following header is found, we are ok?
|
|
#if __has_include(<avx512vbmi2intrin.h>)
|
|
#define SIMDJSON_COMPILER_SUPPORTS_VBMI2 1
|
|
#endif
|
|
#endif
|
|
|
|
#ifdef _MSC_VER
|
|
#if _MSC_VER >= 1920
|
|
// Visual Studio 2019 and up support VBMI2 under x64 even if the header
|
|
// avx512vbmi2intrin.h is not found.
|
|
#define SIMDJSON_COMPILER_SUPPORTS_VBMI2 1
|
|
#endif
|
|
#endif
|
|
|
|
// By default, we allow AVX512.
|
|
#ifndef SIMDJSON_AVX512_ALLOWED
|
|
#define SIMDJSON_AVX512_ALLOWED 1
|
|
#endif
|
|
|
|
#endif // SIMDJSON_COMMON_DEFS_H
|
|
/* end file simdjson/common_defs.h */
|
|
/* skipped duplicate #include "simdjson/compiler_check.h" */
|
|
/* including simdjson/error.h: #include "simdjson/error.h" */
|
|
/* begin file simdjson/error.h */
|
|
#ifndef SIMDJSON_ERROR_H
|
|
#define SIMDJSON_ERROR_H
|
|
|
|
/* skipped duplicate #include "simdjson/base.h" */
|
|
|
|
#include <string>
|
|
#include <ostream>
|
|
|
|
namespace simdjson {
|
|
|
|
/**
 * Every error that simdjson can report. The set of codes may change between
 * releases, and different simdjson kernels do not necessarily report the same
 * code for the same input: which error a given input produces is not well
 * defined.
 *
 * SUCCESS is the only code that converts to false as a Boolean; every other
 * code converts to true.
 */
enum error_code {
  SUCCESS = 0,                ///< No error occurred
  CAPACITY,                   ///< The document is too big for this parser
  MEMALLOC,                   ///< Memory allocation failed, most likely because memory ran out
  TAPE_ERROR,                 ///< Generic error: something went wrong
  DEPTH_ERROR,                ///< The document is nested deeper than the user-specified depth limit
  STRING_ERROR,               ///< A string could not be parsed
  T_ATOM_ERROR,               ///< An atom beginning with the letter 't' could not be parsed
  F_ATOM_ERROR,               ///< An atom beginning with the letter 'f' could not be parsed
  N_ATOM_ERROR,               ///< An atom beginning with the letter 'n' could not be parsed
  NUMBER_ERROR,               ///< A number could not be parsed
  BIGINT_ERROR,               ///< An integer value exceeds 64 bits
  UTF8_ERROR,                 ///< The input is not valid UTF-8
  UNINITIALIZED,              ///< Unknown error, or the document is uninitialized
  EMPTY,                      ///< No structural element was found
  UNESCAPED_CHARS,            ///< A string contains unescaped characters
  UNCLOSED_STRING,            ///< The closing quote of a string is missing
  UNSUPPORTED_ARCHITECTURE,   ///< The architecture is not supported
  INCORRECT_TYPE,             ///< The JSON element's type differs from the one the user expected
  NUMBER_OUT_OF_RANGE,        ///< The JSON number does not fit in 64 bits
  INDEX_OUT_OF_BOUNDS,        ///< The JSON array index is too large
  NO_SUCH_FIELD,              ///< The JSON field was not found in the object
  IO_ERROR,                   ///< A file could not be read
  INVALID_JSON_POINTER,       ///< The JSON pointer reference is invalid
  INVALID_URI_FRAGMENT,       ///< The URI fragment is invalid
  UNEXPECTED_ERROR,           ///< Indicates a bug in simdjson
  PARSER_IN_USE,              ///< The parser is already in use
  OUT_OF_ORDER_ITERATION,     ///< An array or object was iterated out of order (checked when SIMDJSON_DEVELOPMENT_CHECKS=1)
  INSUFFICIENT_PADDING,       ///< The JSON lacks the padding simdjson needs to parse it safely
  INCOMPLETE_ARRAY_OR_OBJECT, ///< The document ends early
  SCALAR_DOCUMENT_AS_VALUE,   ///< A scalar document is treated as a value
  OUT_OF_BOUNDS,              ///< A location outside of the document was accessed
  TRAILING_CONTENT,           ///< The JSON input has unexpected trailing content
  NUM_ERROR_CODES
};
|
|
|
|
/**
|
|
* It is the convention throughout the code that the macro SIMDJSON_DEVELOPMENT_CHECKS determines whether
|
|
* we check for OUT_OF_ORDER_ITERATION. The logic behind it is that these errors only occur when code
* was written in a way that breaks some simdjson::ondemand requirement. They should not occur in released
|
|
* code after these issues were fixed.
|
|
*/
|
|
|
|
/**
|
|
* Get the error message for the given error code.
|
|
*
|
|
* dom::parser parser;
|
|
* dom::element doc;
|
|
* auto error = parser.parse("foo",3).get(doc);
|
|
* if (error) { printf("Error: %s\n", error_message(error)); }
|
|
*
|
|
* @return The error message.
|
|
*/
|
|
inline const char *error_message(error_code error) noexcept;
|
|
|
|
/**
|
|
* Write the error message to the output stream
|
|
*/
|
|
inline std::ostream& operator<<(std::ostream& out, error_code error) noexcept;
|
|
|
|
/**
|
|
* Exception thrown when an exception-supporting simdjson method is called
|
|
*/
|
|
struct simdjson_error : public std::exception {
|
|
/**
|
|
* Create an exception from a simdjson error code.
|
|
* @param error The error code
|
|
*/
|
|
simdjson_error(error_code error) noexcept : _error{error} { }
|
|
/** The error message */
|
|
const char *what() const noexcept { return error_message(error()); }
|
|
/** The error code */
|
|
error_code error() const noexcept { return _error; }
|
|
private:
|
|
/** The error code that was used */
|
|
error_code _error;
|
|
};
|
|
|
|
namespace internal {
|
|
|
|
/**
|
|
* The result of a simdjson operation that could fail.
|
|
*
|
|
* Gives the option of reading error codes, or throwing an exception by casting to the desired result.
|
|
*
|
|
* This is a base class for implementations that want to add functions to the result type for
|
|
* chaining.
|
|
*
|
|
* Override like:
|
|
*
|
|
* struct simdjson_result<T> : public internal::simdjson_result_base<T> {
|
|
* simdjson_result() noexcept : internal::simdjson_result_base<T>() {}
|
|
* simdjson_result(error_code error) noexcept : internal::simdjson_result_base<T>(error) {}
|
|
* simdjson_result(T &&value) noexcept : internal::simdjson_result_base<T>(std::forward<T>(value)) {}
|
|
* simdjson_result(T &&value, error_code error) noexcept : internal::simdjson_result_base<T>(value, error) {}
|
|
* // Your extra methods here
|
|
* }
|
|
*
|
|
* Then any method returning simdjson_result<T> will be chainable with your methods.
|
|
*/
|
|
template<typename T>
|
|
struct simdjson_result_base : protected std::pair<T, error_code> {
|
|
|
|
/**
|
|
* Create a new empty result with error = UNINITIALIZED.
|
|
*/
|
|
simdjson_inline simdjson_result_base() noexcept;
|
|
|
|
/**
|
|
* Create a new error result.
|
|
*/
|
|
simdjson_inline simdjson_result_base(error_code error) noexcept;
|
|
|
|
/**
|
|
* Create a new successful result.
|
|
*/
|
|
simdjson_inline simdjson_result_base(T &&value) noexcept;
|
|
|
|
/**
|
|
* Create a new result with both things (use if you don't want to branch when creating the result).
|
|
*/
|
|
simdjson_inline simdjson_result_base(T &&value, error_code error) noexcept;
|
|
|
|
/**
|
|
* Move the value and the error to the provided variables.
|
|
*
|
|
* @param value The variable to assign the value to. May not be set if there is an error.
|
|
* @param error The variable to assign the error to. Set to SUCCESS if there is no error.
|
|
*/
|
|
simdjson_inline void tie(T &value, error_code &error) && noexcept;
|
|
|
|
/**
|
|
* Move the value to the provided variable.
|
|
*
|
|
* @param value The variable to assign the value to. May not be set if there is an error.
|
|
*/
|
|
simdjson_inline error_code get(T &value) && noexcept;
|
|
|
|
/**
|
|
* The error.
|
|
*/
|
|
simdjson_inline error_code error() const noexcept;
|
|
|
|
#if SIMDJSON_EXCEPTIONS
|
|
|
|
/**
|
|
* Get the result value.
|
|
*
|
|
* @throw simdjson_error if there was an error.
|
|
*/
|
|
simdjson_inline T& value() & noexcept(false);
|
|
|
|
/**
|
|
* Take the result value (move it).
|
|
*
|
|
* @throw simdjson_error if there was an error.
|
|
*/
|
|
simdjson_inline T&& value() && noexcept(false);
|
|
|
|
/**
|
|
* Take the result value (move it).
|
|
*
|
|
* @throw simdjson_error if there was an error.
|
|
*/
|
|
simdjson_inline T&& take_value() && noexcept(false);
|
|
|
|
/**
|
|
* Cast to the value (will throw on error).
|
|
*
|
|
* @throw simdjson_error if there was an error.
|
|
*/
|
|
simdjson_inline operator T&&() && noexcept(false);
|
|
#endif // SIMDJSON_EXCEPTIONS
|
|
|
|
/**
|
|
* Get the result value. This function is safe if and only if
|
|
* the error() method returns a value that evaluates to false.
|
|
*/
|
|
simdjson_inline const T& value_unsafe() const& noexcept;
|
|
|
|
/**
|
|
* Take the result value (move it). This function is safe if and only if
|
|
* the error() method returns a value that evaluates to false.
|
|
*/
|
|
simdjson_inline T&& value_unsafe() && noexcept;
|
|
|
|
}; // struct simdjson_result_base
|
|
|
|
} // namespace internal
|
|
|
|
/**
|
|
* The result of a simdjson operation that could fail.
|
|
*
|
|
* Gives the option of reading error codes, or throwing an exception by casting to the desired result.
|
|
*/
|
|
template<typename T>
|
|
struct simdjson_result : public internal::simdjson_result_base<T> {
|
|
/**
|
|
* @private Create a new empty result with error = UNINITIALIZED.
|
|
*/
|
|
simdjson_inline simdjson_result() noexcept;
|
|
/**
|
|
* @private Create a new successful result.
|
|
*/
|
|
simdjson_inline simdjson_result(T &&value) noexcept;
|
|
/**
|
|
* @private Create a new error result.
|
|
*/
|
|
simdjson_inline simdjson_result(error_code error_code) noexcept;
|
|
/**
|
|
* @private Create a new result with both things (use if you don't want to branch when creating the result).
|
|
*/
|
|
simdjson_inline simdjson_result(T &&value, error_code error) noexcept;
|
|
|
|
/**
|
|
* Move the value and the error to the provided variables.
|
|
*
|
|
* @param value The variable to assign the value to. May not be set if there is an error.
|
|
* @param error The variable to assign the error to. Set to SUCCESS if there is no error.
|
|
*/
|
|
simdjson_inline void tie(T &value, error_code &error) && noexcept;
|
|
|
|
/**
|
|
* Move the value to the provided variable.
|
|
*
|
|
* @param value The variable to assign the value to. May not be set if there is an error.
|
|
*/
|
|
simdjson_warn_unused simdjson_inline error_code get(T &value) && noexcept;
|
|
|
|
/**
|
|
* The error.
|
|
*/
|
|
simdjson_inline error_code error() const noexcept;
|
|
|
|
#if SIMDJSON_EXCEPTIONS
|
|
|
|
/**
|
|
* Get the result value.
|
|
*
|
|
* @throw simdjson_error if there was an error.
|
|
*/
|
|
simdjson_inline T& value() & noexcept(false);
|
|
|
|
/**
|
|
* Take the result value (move it).
|
|
*
|
|
* @throw simdjson_error if there was an error.
|
|
*/
|
|
simdjson_inline T&& value() && noexcept(false);
|
|
|
|
/**
|
|
* Take the result value (move it).
|
|
*
|
|
* @throw simdjson_error if there was an error.
|
|
*/
|
|
simdjson_inline T&& take_value() && noexcept(false);
|
|
|
|
/**
|
|
* Cast to the value (will throw on error).
|
|
*
|
|
* @throw simdjson_error if there was an error.
|
|
*/
|
|
simdjson_inline operator T&&() && noexcept(false);
|
|
#endif // SIMDJSON_EXCEPTIONS
|
|
|
|
/**
|
|
* Get the result value. This function is safe if and only if
|
|
* the error() method returns a value that evaluates to false.
|
|
*/
|
|
simdjson_inline const T& value_unsafe() const& noexcept;
|
|
|
|
/**
|
|
* Take the result value (move it). This function is safe if and only if
|
|
* the error() method returns a value that evaluates to false.
|
|
*/
|
|
simdjson_inline T&& value_unsafe() && noexcept;
|
|
|
|
}; // struct simdjson_result
|
|
|
|
#if SIMDJSON_EXCEPTIONS
|
|
|
|
template<typename T>
|
|
inline std::ostream& operator<<(std::ostream& out, simdjson_result<T> value) { return out << value.value(); }
|
|
#endif // SIMDJSON_EXCEPTIONS
|
|
|
|
#ifndef SIMDJSON_DISABLE_DEPRECATED_API
|
|
/**
|
|
* @deprecated This is an alias and will be removed, use error_code instead
|
|
*/
|
|
using ErrorValues [[deprecated("This is an alias and will be removed, use error_code instead")]] = error_code;
|
|
|
|
/**
|
|
* @deprecated Error codes should be stored and returned as `error_code`, use `error_message()` instead.
|
|
*/
|
|
[[deprecated("Error codes should be stored and returned as `error_code`, use `error_message()` instead.")]]
|
|
inline const std::string error_message(int error) noexcept;
|
|
#endif // SIMDJSON_DISABLE_DEPRECATED_API
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_ERROR_H
|
|
/* end file simdjson/error.h */
|
|
/* skipped duplicate #include "simdjson/portability.h" */
|
|
|
|
/**
|
|
* @brief The top level simdjson namespace, containing everything the library provides.
|
|
*/
|
|
namespace simdjson {
|
|
|
|
SIMDJSON_PUSH_DISABLE_UNUSED_WARNINGS
|
|
|
|
/** The maximum document size supported by simdjson. */
|
|
constexpr size_t SIMDJSON_MAXSIZE_BYTES = 0xFFFFFFFF;
|
|
|
|
/**
|
|
* The amount of padding needed in a buffer to parse JSON.
|
|
*
|
|
* The input buf should be readable up to buf + SIMDJSON_PADDING
|
|
* this is a stopgap; there should be a better description of the
|
|
* main loop and its behavior that abstracts over this
|
|
* See https://github.com/simdjson/simdjson/issues/174
|
|
*/
|
|
constexpr size_t SIMDJSON_PADDING = 64;
|
|
|
|
/**
|
|
* By default, simdjson supports this many nested objects and arrays.
|
|
*
|
|
* This is the default for parser::max_depth().
|
|
*/
|
|
constexpr size_t DEFAULT_MAX_DEPTH = 1024;
|
|
|
|
SIMDJSON_POP_DISABLE_UNUSED_WARNINGS
|
|
|
|
class implementation;
|
|
struct padded_string;
|
|
class padded_string_view;
|
|
enum class stage1_mode;
|
|
|
|
namespace internal {
|
|
|
|
template<typename T>
|
|
class atomic_ptr;
|
|
class dom_parser_implementation;
|
|
class escape_json_string;
|
|
class tape_ref;
|
|
struct value128;
|
|
enum class tape_type;
|
|
|
|
} // namespace internal
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_BASE_H
|
|
/* end file simdjson/base.h */
|
|
|
|
#endif // SIMDJSON_SRC_BASE_H
|
|
/* end file base.h */
|
|
|
|
SIMDJSON_PUSH_DISABLE_UNUSED_WARNINGS
|
|
|
|
/* including to_chars.cpp: #include <to_chars.cpp> */
|
|
/* begin file to_chars.cpp */
|
|
#ifndef SIMDJSON_SRC_TO_CHARS_CPP
|
|
#define SIMDJSON_SRC_TO_CHARS_CPP
|
|
|
|
/* skipped duplicate #include <base.h> */
|
|
|
|
#include <cstring>
|
|
#include <cstdint>
|
|
#include <array>
|
|
#include <cmath>
|
|
|
|
namespace simdjson {
|
|
namespace internal {
|
|
/*!
|
|
implements the Grisu2 algorithm for binary to decimal floating-point
|
|
conversion.
|
|
Adapted from JSON for Modern C++
|
|
|
|
This implementation is a slightly modified version of the reference
|
|
implementation which may be obtained from
|
|
http://florian.loitsch.com/publications (bench.tar.gz).
|
|
The code is distributed under the MIT license, Copyright (c) 2009 Florian
|
|
Loitsch. For a detailed description of the algorithm see: [1] Loitsch, "Printing
|
|
Floating-Point Numbers Quickly and Accurately with Integers", Proceedings of the
|
|
ACM SIGPLAN 2010 Conference on Programming Language Design and Implementation,
|
|
PLDI 2010 [2] Burger, Dybvig, "Printing Floating-Point Numbers Quickly and
|
|
Accurately", Proceedings of the ACM SIGPLAN 1996 Conference on Programming
|
|
Language Design and Implementation, PLDI 1996
|
|
*/
|
|
namespace dtoa_impl {
|
|
|
|
/*!
@brief copies the object representation of @a source into a value of type Target

Both types must have the same size (enforced at compile time). The bytes are
transferred with std::memcpy.
*/
template <typename Target, typename Source>
Target reinterpret_bits(const Source source) {
  static_assert(sizeof(Target) == sizeof(Source), "size mismatch");

  Target result;
  std::memcpy(&result, &source, sizeof(result));
  return result;
}
|
|
|
|
struct diyfp // f * 2^e
{
  static constexpr int kPrecision = 64; // = q

  std::uint64_t f = 0; // significand
  int e = 0;           // binary exponent

  constexpr diyfp(std::uint64_t f_, int e_) noexcept : f(f_), e(e_) {}

  /*!
  @brief returns x - y
  @pre x.e == y.e and x.f >= y.f
  */
  static diyfp sub(const diyfp &x, const diyfp &y) noexcept {
    return diyfp(x.f - y.f, x.e);
  }

  /*!
  @brief returns x * y
  @note The result is rounded. (Only the upper q bits are returned.)
  */
  static diyfp mul(const diyfp &x, const diyfp &y) noexcept {
    static_assert(kPrecision == 64, "internal error");

    // Goal:
    //   f = round((x.f * y.f) / 2^q)
    //   e = x.e + y.e + q
    //
    // The 64x64 -> 128 bit product is emulated with four 32x32 -> 64 bit
    // partial products. With x.f = x_lo + 2^32 x_hi and
    // y.f = y_lo + 2^32 y_hi:
    //
    //   x.f * y.f = lo_lo + 2^32 (lo_hi + hi_lo) + 2^64 hi_hi
    //
    // Only the upper 64 bits are wanted, so the lower 32 bits of lo_lo are
    // never needed; what matters from the lower half is the carry out of the
    // "middle" 32-bit column, plus one rounding bit.
    const std::uint64_t low_mask = 0xFFFFFFFFu;

    const std::uint64_t x_lo = x.f & low_mask;
    const std::uint64_t x_hi = x.f >> 32u;
    const std::uint64_t y_lo = y.f & low_mask;
    const std::uint64_t y_hi = y.f >> 32u;

    const std::uint64_t lo_lo = x_lo * y_lo;
    const std::uint64_t lo_hi = x_lo * y_hi;
    const std::uint64_t hi_lo = x_hi * y_lo;
    const std::uint64_t hi_hi = x_hi * y_hi;

    // Middle column: bits 32..63 of the full product (may exceed 2^32 - 1).
    std::uint64_t middle = (lo_lo >> 32u) + (lo_hi & low_mask) + (hi_lo & low_mask);

    // Adding 2^31 here adds the highest bit of the lower 64-bit half to the
    // upper half once the column is shifted down: round, ties up.
    middle += std::uint64_t{1} << (64u - 32u - 1u);

    const std::uint64_t upper =
        hi_hi + (hi_lo >> 32u) + (lo_hi >> 32u) + (middle >> 32u);

    return diyfp(upper, x.e + y.e + 64);
  }

  /*!
  @brief normalize x such that the significand is >= 2^(q-1)
  @pre x.f != 0
  */
  static diyfp normalize(diyfp x) noexcept {
    // Shift left until the top bit is set, lowering the exponent once per shift.
    for (; (x.f >> 63u) == 0; x.e--) {
      x.f <<= 1u;
    }
    return x;
  }

  /*!
  @brief normalize x such that the result has the exponent E
  @pre e >= x.e and the upper e - x.e bits of x.f must be zero.
  */
  static diyfp normalize_to(const diyfp &x,
                            const int target_exponent) noexcept {
    return diyfp(x.f << (x.e - target_exponent), target_exponent);
  }
};
|
|
|
|
struct boundaries {
|
|
diyfp w;
|
|
diyfp minus;
|
|
diyfp plus;
|
|
};
|
|
|
|
/*!
|
|
Compute the (normalized) diyfp representing the input number 'value' and its
|
|
boundaries.
|
|
@pre value must be finite and positive
|
|
*/
|
|
template <typename FloatType> boundaries compute_boundaries(FloatType value) {
|
|
|
|
// Convert the IEEE representation into a diyfp.
|
|
//
|
|
// If v is denormal:
|
|
// value = 0.F * 2^(1 - bias) = ( F) * 2^(1 - bias - (p-1))
|
|
// If v is normalized:
|
|
// value = 1.F * 2^(E - bias) = (2^(p-1) + F) * 2^(E - bias - (p-1))
|
|
|
|
static_assert(std::numeric_limits<FloatType>::is_iec559,
|
|
"internal error: dtoa_short requires an IEEE-754 "
|
|
"floating-point implementation");
|
|
|
|
constexpr int kPrecision =
|
|
std::numeric_limits<FloatType>::digits; // = p (includes the hidden bit)
|
|
constexpr int kBias =
|
|
std::numeric_limits<FloatType>::max_exponent - 1 + (kPrecision - 1);
|
|
constexpr int kMinExp = 1 - kBias;
|
|
constexpr std::uint64_t kHiddenBit = std::uint64_t{1}
|
|
<< (kPrecision - 1); // = 2^(p-1)
|
|
|
|
using bits_type = typename std::conditional<kPrecision == 24, std::uint32_t,
|
|
std::uint64_t>::type;
|
|
|
|
const std::uint64_t bits = reinterpret_bits<bits_type>(value);
|
|
const std::uint64_t E = bits >> (kPrecision - 1);
|
|
const std::uint64_t F = bits & (kHiddenBit - 1);
|
|
|
|
const bool is_denormal = E == 0;
|
|
const diyfp v = is_denormal
|
|
? diyfp(F, kMinExp)
|
|
: diyfp(F + kHiddenBit, static_cast<int>(E) - kBias);
|
|
|
|
// Compute the boundaries m- and m+ of the floating-point value
|
|
// v = f * 2^e.
|
|
//
|
|
// Determine v- and v+, the floating-point predecessor and successor if v,
|
|
// respectively.
|
|
//
|
|
// v- = v - 2^e if f != 2^(p-1) or e == e_min (A)
|
|
// = v - 2^(e-1) if f == 2^(p-1) and e > e_min (B)
|
|
//
|
|
// v+ = v + 2^e
|
|
//
|
|
// Let m- = (v- + v) / 2 and m+ = (v + v+) / 2. All real numbers _strictly_
|
|
// between m- and m+ round to v, regardless of how the input rounding
|
|
// algorithm breaks ties.
|
|
//
|
|
// ---+-------------+-------------+-------------+-------------+--- (A)
|
|
// v- m- v m+ v+
|
|
//
|
|
// -----------------+------+------+-------------+-------------+--- (B)
|
|
// v- m- v m+ v+
|
|
|
|
const bool lower_boundary_is_closer = F == 0 && E > 1;
|
|
const diyfp m_plus = diyfp(2 * v.f + 1, v.e - 1);
|
|
const diyfp m_minus = lower_boundary_is_closer
|
|
? diyfp(4 * v.f - 1, v.e - 2) // (B)
|
|
: diyfp(2 * v.f - 1, v.e - 1); // (A)
|
|
|
|
// Determine the normalized w+ = m+.
|
|
const diyfp w_plus = diyfp::normalize(m_plus);
|
|
|
|
// Determine w- = m- such that e_(w-) = e_(w+).
|
|
const diyfp w_minus = diyfp::normalize_to(m_minus, w_plus.e);
|
|
|
|
return {diyfp::normalize(v), w_minus, w_plus};
|
|
}
|
|
|
|
// Given normalized diyfp w, Grisu needs to find a (normalized) cached
|
|
// power-of-ten c, such that the exponent of the product c * w = f * 2^e lies
|
|
// within a certain range [alpha, gamma] (Definition 3.2 from [1])
|
|
//
|
|
// alpha <= e = e_c + e_w + q <= gamma
|
|
//
|
|
// or
|
|
//
|
|
// f_c * f_w * 2^alpha <= f_c 2^(e_c) * f_w 2^(e_w) * 2^q
|
|
// <= f_c * f_w * 2^gamma
|
|
//
|
|
// Since c and w are normalized, i.e. 2^(q-1) <= f < 2^q, this implies
|
|
//
|
|
// 2^(q-1) * 2^(q-1) * 2^alpha <= c * w * 2^q < 2^q * 2^q * 2^gamma
|
|
//
|
|
// or
|
|
//
|
|
// 2^(q - 2 + alpha) <= c * w < 2^(q + gamma)
|
|
//
|
|
// The choice of (alpha,gamma) determines the size of the table and the form of
|
|
// the digit generation procedure. Using (alpha,gamma)=(-60,-32) works out well
|
|
// in practice:
|
|
//
|
|
// The idea is to cut the number c * w = f * 2^e into two parts, which can be
|
|
// processed independently: An integral part p1, and a fractional part p2:
|
|
//
|
|
// f * 2^e = ( (f div 2^-e) * 2^-e + (f mod 2^-e) ) * 2^e
|
|
// = (f div 2^-e) + (f mod 2^-e) * 2^e
|
|
// = p1 + p2 * 2^e
|
|
//
|
|
// The conversion of p1 into decimal form requires a series of divisions and
|
|
// modulos by (a power of) 10. These operations are faster for 32-bit than for
|
|
// 64-bit integers, so p1 should ideally fit into a 32-bit integer. This can be
|
|
// achieved by choosing
|
|
//
|
|
// -e >= 32 or e <= -32 := gamma
|
|
//
|
|
// In order to convert the fractional part
|
|
//
|
|
// p2 * 2^e = p2 / 2^-e = d[-1] / 10^1 + d[-2] / 10^2 + ...
|
|
//
|
|
// into decimal form, the fraction is repeatedly multiplied by 10 and the digits
|
|
// d[-i] are extracted in order:
|
|
//
|
|
// (10 * p2) div 2^-e = d[-1]
|
|
// (10 * p2) mod 2^-e = d[-2] / 10^1 + ...
|
|
//
|
|
// The multiplication by 10 must not overflow. It is sufficient to choose
|
|
//
|
|
// 10 * p2 < 16 * p2 = 2^4 * p2 <= 2^64.
|
|
//
|
|
// Since p2 = f mod 2^-e < 2^-e,
|
|
//
|
|
// -e <= 60 or e >= -60 := alpha
|
|
|
|
// Bounds [alpha, gamma] for the binary exponent of the scaled product c * w.
constexpr int kAlpha = -60; // lower bound: keeps 10 * p2 from overflowing 64 bits
constexpr int kGamma = -32; // upper bound: keeps the integral part p1 within 32 bits
|
|
|
|
// A precomputed power of ten in binary form: c = f * 2^e ~= 10^k.
struct cached_power {
  std::uint64_t f; // 64-bit significand
  int e;           // binary exponent
  int k;           // decimal exponent being approximated
};
|
|
|
|
/*!
|
|
For a normalized diyfp w = f * 2^e, this function returns a (normalized) cached
|
|
power-of-ten c = f_c * 2^e_c, such that the exponent of the product w * c
|
|
satisfies (Definition 3.2 from [1])
|
|
alpha <= e_c + e + q <= gamma.
|
|
*/
|
|
inline cached_power get_cached_power_for_binary_exponent(int e) {
|
|
// Now
|
|
//
|
|
// alpha <= e_c + e + q <= gamma (1)
|
|
// ==> f_c * 2^alpha <= c * 2^e * 2^q
|
|
//
|
|
// and since the c's are normalized, 2^(q-1) <= f_c,
|
|
//
|
|
// ==> 2^(q - 1 + alpha) <= c * 2^(e + q)
|
|
// ==> 2^(alpha - e - 1) <= c
|
|
//
|
|
// If c were an exact power of ten, i.e. c = 10^k, one may determine k as
|
|
//
|
|
// k = ceil( log_10( 2^(alpha - e - 1) ) )
|
|
// = ceil( (alpha - e - 1) * log_10(2) )
|
|
//
|
|
// From the paper:
|
|
// "In theory the result of the procedure could be wrong since c is rounded,
|
|
// and the computation itself is approximated [...]. In practice, however,
|
|
// this simple function is sufficient."
|
|
//
|
|
// For IEEE double precision floating-point numbers converted into
|
|
// normalized diyfp's w = f * 2^e, with q = 64,
|
|
//
|
|
// e >= -1022 (min IEEE exponent)
|
|
// -52 (p - 1)
|
|
// -52 (p - 1, possibly normalize denormal IEEE numbers)
|
|
// -11 (normalize the diyfp)
|
|
// = -1137
|
|
//
|
|
// and
|
|
//
|
|
// e <= +1023 (max IEEE exponent)
|
|
// -52 (p - 1)
|
|
// -11 (normalize the diyfp)
|
|
// = 960
|
|
//
|
|
// This binary exponent range [-1137,960] results in a decimal exponent
|
|
// range [-307,324]. One does not need to store a cached power for each
|
|
// k in this range. For each such k it suffices to find a cached power
|
|
// such that the exponent of the product lies in [alpha,gamma].
|
|
// This implies that the difference of the decimal exponents of adjacent
|
|
// table entries must be less than or equal to
|
|
//
|
|
// floor( (gamma - alpha) * log_10(2) ) = 8.
|
|
//
|
|
// (A smaller distance gamma-alpha would require a larger table.)
|
|
|
|
// NB:
|
|
// Actually this function returns c, such that -60 <= e_c + e + 64 <= -34.
|
|
|
|
constexpr int kCachedPowersMinDecExp = -300;
|
|
constexpr int kCachedPowersDecStep = 8;
|
|
|
|
static constexpr std::array<cached_power, 79> kCachedPowers = {{
|
|
{0xAB70FE17C79AC6CA, -1060, -300}, {0xFF77B1FCBEBCDC4F, -1034, -292},
|
|
{0xBE5691EF416BD60C, -1007, -284}, {0x8DD01FAD907FFC3C, -980, -276},
|
|
{0xD3515C2831559A83, -954, -268}, {0x9D71AC8FADA6C9B5, -927, -260},
|
|
{0xEA9C227723EE8BCB, -901, -252}, {0xAECC49914078536D, -874, -244},
|
|
{0x823C12795DB6CE57, -847, -236}, {0xC21094364DFB5637, -821, -228},
|
|
{0x9096EA6F3848984F, -794, -220}, {0xD77485CB25823AC7, -768, -212},
|
|
{0xA086CFCD97BF97F4, -741, -204}, {0xEF340A98172AACE5, -715, -196},
|
|
{0xB23867FB2A35B28E, -688, -188}, {0x84C8D4DFD2C63F3B, -661, -180},
|
|
{0xC5DD44271AD3CDBA, -635, -172}, {0x936B9FCEBB25C996, -608, -164},
|
|
{0xDBAC6C247D62A584, -582, -156}, {0xA3AB66580D5FDAF6, -555, -148},
|
|
{0xF3E2F893DEC3F126, -529, -140}, {0xB5B5ADA8AAFF80B8, -502, -132},
|
|
{0x87625F056C7C4A8B, -475, -124}, {0xC9BCFF6034C13053, -449, -116},
|
|
{0x964E858C91BA2655, -422, -108}, {0xDFF9772470297EBD, -396, -100},
|
|
{0xA6DFBD9FB8E5B88F, -369, -92}, {0xF8A95FCF88747D94, -343, -84},
|
|
{0xB94470938FA89BCF, -316, -76}, {0x8A08F0F8BF0F156B, -289, -68},
|
|
{0xCDB02555653131B6, -263, -60}, {0x993FE2C6D07B7FAC, -236, -52},
|
|
{0xE45C10C42A2B3B06, -210, -44}, {0xAA242499697392D3, -183, -36},
|
|
{0xFD87B5F28300CA0E, -157, -28}, {0xBCE5086492111AEB, -130, -20},
|
|
{0x8CBCCC096F5088CC, -103, -12}, {0xD1B71758E219652C, -77, -4},
|
|
{0x9C40000000000000, -50, 4}, {0xE8D4A51000000000, -24, 12},
|
|
{0xAD78EBC5AC620000, 3, 20}, {0x813F3978F8940984, 30, 28},
|
|
{0xC097CE7BC90715B3, 56, 36}, {0x8F7E32CE7BEA5C70, 83, 44},
|
|
{0xD5D238A4ABE98068, 109, 52}, {0x9F4F2726179A2245, 136, 60},
|
|
{0xED63A231D4C4FB27, 162, 68}, {0xB0DE65388CC8ADA8, 189, 76},
|
|
{0x83C7088E1AAB65DB, 216, 84}, {0xC45D1DF942711D9A, 242, 92},
|
|
{0x924D692CA61BE758, 269, 100}, {0xDA01EE641A708DEA, 295, 108},
|
|
{0xA26DA3999AEF774A, 322, 116}, {0xF209787BB47D6B85, 348, 124},
|
|
{0xB454E4A179DD1877, 375, 132}, {0x865B86925B9BC5C2, 402, 140},
|
|
{0xC83553C5C8965D3D, 428, 148}, {0x952AB45CFA97A0B3, 455, 156},
|
|
{0xDE469FBD99A05FE3, 481, 164}, {0xA59BC234DB398C25, 508, 172},
|
|
{0xF6C69A72A3989F5C, 534, 180}, {0xB7DCBF5354E9BECE, 561, 188},
|
|
{0x88FCF317F22241E2, 588, 196}, {0xCC20CE9BD35C78A5, 614, 204},
|
|
{0x98165AF37B2153DF, 641, 212}, {0xE2A0B5DC971F303A, 667, 220},
|
|
{0xA8D9D1535CE3B396, 694, 228}, {0xFB9B7CD9A4A7443C, 720, 236},
|
|
{0xBB764C4CA7A44410, 747, 244}, {0x8BAB8EEFB6409C1A, 774, 252},
|
|
{0xD01FEF10A657842C, 800, 260}, {0x9B10A4E5E9913129, 827, 268},
|
|
{0xE7109BFBA19C0C9D, 853, 276}, {0xAC2820D9623BF429, 880, 284},
|
|
{0x80444B5E7AA7CF85, 907, 292}, {0xBF21E44003ACDD2D, 933, 300},
|
|
{0x8E679C2F5E44FF8F, 960, 308}, {0xD433179D9C8CB841, 986, 316},
|
|
{0x9E19DB92B4E31BA9, 1013, 324},
|
|
}};
|
|
|
|
// This computation gives exactly the same results for k as
|
|
// k = ceil((kAlpha - e - 1) * 0.30102999566398114)
|
|
// for |e| <= 1500, but doesn't require floating-point operations.
|
|
// NB: log_10(2) ~= 78913 / 2^18
|
|
const int f = kAlpha - e - 1;
|
|
const int k = (f * 78913) / (1 << 18) + static_cast<int>(f > 0);
|
|
|
|
const int index = (-kCachedPowersMinDecExp + k + (kCachedPowersDecStep - 1)) /
|
|
kCachedPowersDecStep;
|
|
|
|
const cached_power cached = kCachedPowers[static_cast<std::size_t>(index)];
|
|
|
|
return cached;
|
|
}
|
|
|
|
/*!
For n != 0, returns k, such that pow10 := 10^(k-1) <= n < 10^k.
For n == 0, returns 1 and sets pow10 := 1.
*/
inline int find_largest_pow10(const std::uint32_t n, std::uint32_t &pow10) {
  // kPowersOfTen[i] == 10^i; 10^9 is the largest power of ten below 2^32.
  static const std::uint32_t kPowersOfTen[10] = {
      1u,      10u,      100u,      1000u,      10000u,
      100000u, 1000000u, 10000000u, 100000000u, 1000000000u};

  // Walk down from the largest candidate until it no longer exceeds n.
  int digit_count = 10;
  while (digit_count > 1 && n < kPowersOfTen[digit_count - 1]) {
    --digit_count;
  }

  pow10 = kPowersOfTen[digit_count - 1];
  return digit_count;
}
|
|
|
|
// Nudges the last generated digit downwards while that moves the decimal
// value V = buf * 10^k closer to w without leaving [M-, M+].
//
//               <--------------------------- delta ---->
//                                   <---- dist --------->
// --------------[------------------+-------------------]--------------
//               M-                 w                   M+
//
//                                            ten_k
//                                          <------>
//                                   <---- rest ---->
// --------------[------------------+----+--------------]--------------
//                                  w    V
//
// All quantities share one implicit binary exponent:
//   dist  = M+ - w,   delta = M+ - M-,   rest = M+ - V,
//   ten_k = one unit in the last decimal place stored in buf.
inline void grisu2_round(char *buf, int len, std::uint64_t dist,
                         std::uint64_t delta, std::uint64_t rest,
                         std::uint64_t ten_k) {
  // The checks run in this order to avoid overflow in unsigned arithmetic.
  for (;;) {
    if (rest >= dist) {
      return; // V is already at or below w
    }
    if (delta - rest < ten_k) {
      return; // one more step would drop V below M-
    }
    const std::uint64_t stepped = rest + ten_k;
    if (stepped >= dist && dist - rest <= stepped - dist) {
      return; // stepping past w would not bring V any closer
    }
    buf[len - 1]--;
    rest = stepped;
  }
}
|
|
|
|
/*!
|
|
Generates V = buffer * 10^decimal_exponent, such that M- <= V <= M+.
|
|
M- and M+ must be normalized and share the same exponent -60 <= e <= -32.
|
|
*/
|
|
inline void grisu2_digit_gen(char *buffer, int &length, int &decimal_exponent,
|
|
diyfp M_minus, diyfp w, diyfp M_plus) {
|
|
static_assert(kAlpha >= -60, "internal error");
|
|
static_assert(kGamma <= -32, "internal error");
|
|
|
|
// Generates the digits (and the exponent) of a decimal floating-point
|
|
// number V = buffer * 10^decimal_exponent in the range [M-, M+]. The diyfp's
|
|
// w, M- and M+ share the same exponent e, which satisfies alpha <= e <=
|
|
// gamma.
|
|
//
|
|
// <--------------------------- delta ---->
|
|
// <---- dist --------->
|
|
// --------------[------------------+-------------------]--------------
|
|
// M- w M+
|
|
//
|
|
// Grisu2 generates the digits of M+ from left to right and stops as soon as
|
|
// V is in [M-,M+].
|
|
|
|
std::uint64_t delta =
|
|
diyfp::sub(M_plus, M_minus)
|
|
.f; // (significand of (M+ - M-), implicit exponent is e)
|
|
std::uint64_t dist =
|
|
diyfp::sub(M_plus, w)
|
|
.f; // (significand of (M+ - w ), implicit exponent is e)
|
|
|
|
// Split M+ = f * 2^e into two parts p1 and p2 (note: e < 0):
|
|
//
|
|
// M+ = f * 2^e
|
|
// = ((f div 2^-e) * 2^-e + (f mod 2^-e)) * 2^e
|
|
// = ((p1 ) * 2^-e + (p2 )) * 2^e
|
|
// = p1 + p2 * 2^e
|
|
|
|
const diyfp one(std::uint64_t{1} << -M_plus.e, M_plus.e);
|
|
|
|
auto p1 = static_cast<std::uint32_t>(
|
|
M_plus.f >>
|
|
-one.e); // p1 = f div 2^-e (Since -e >= 32, p1 fits into a 32-bit int.)
|
|
std::uint64_t p2 = M_plus.f & (one.f - 1); // p2 = f mod 2^-e
|
|
|
|
// 1)
|
|
//
|
|
// Generate the digits of the integral part p1 = d[n-1]...d[1]d[0]
|
|
|
|
std::uint32_t pow10;
|
|
const int k = find_largest_pow10(p1, pow10);
|
|
|
|
// 10^(k-1) <= p1 < 10^k, pow10 = 10^(k-1)
|
|
//
|
|
// p1 = (p1 div 10^(k-1)) * 10^(k-1) + (p1 mod 10^(k-1))
|
|
// = (d[k-1] ) * 10^(k-1) + (p1 mod 10^(k-1))
|
|
//
|
|
// M+ = p1 + p2 * 2^e
|
|
// = d[k-1] * 10^(k-1) + (p1 mod 10^(k-1)) + p2 * 2^e
|
|
// = d[k-1] * 10^(k-1) + ((p1 mod 10^(k-1)) * 2^-e + p2) * 2^e
|
|
// = d[k-1] * 10^(k-1) + ( rest) * 2^e
|
|
//
|
|
// Now generate the digits d[n] of p1 from left to right (n = k-1,...,0)
|
|
//
|
|
// p1 = d[k-1]...d[n] * 10^n + d[n-1]...d[0]
|
|
//
|
|
// but stop as soon as
|
|
//
|
|
// rest * 2^e = (d[n-1]...d[0] * 2^-e + p2) * 2^e <= delta * 2^e
|
|
|
|
int n = k;
|
|
while (n > 0) {
|
|
// Invariants:
|
|
// M+ = buffer * 10^n + (p1 + p2 * 2^e) (buffer = 0 for n = k)
|
|
// pow10 = 10^(n-1) <= p1 < 10^n
|
|
//
|
|
const std::uint32_t d = p1 / pow10; // d = p1 div 10^(n-1)
|
|
const std::uint32_t r = p1 % pow10; // r = p1 mod 10^(n-1)
|
|
//
|
|
// M+ = buffer * 10^n + (d * 10^(n-1) + r) + p2 * 2^e
|
|
// = (buffer * 10 + d) * 10^(n-1) + (r + p2 * 2^e)
|
|
//
|
|
buffer[length++] = static_cast<char>('0' + d); // buffer := buffer * 10 + d
|
|
//
|
|
// M+ = buffer * 10^(n-1) + (r + p2 * 2^e)
|
|
//
|
|
p1 = r;
|
|
n--;
|
|
//
|
|
// M+ = buffer * 10^n + (p1 + p2 * 2^e)
|
|
// pow10 = 10^n
|
|
//
|
|
|
|
// Now check if enough digits have been generated.
|
|
// Compute
|
|
//
|
|
// p1 + p2 * 2^e = (p1 * 2^-e + p2) * 2^e = rest * 2^e
|
|
//
|
|
// Note:
|
|
// Since rest and delta share the same exponent e, it suffices to
|
|
// compare the significands.
|
|
const std::uint64_t rest = (std::uint64_t{p1} << -one.e) + p2;
|
|
if (rest <= delta) {
|
|
// V = buffer * 10^n, with M- <= V <= M+.
|
|
|
|
decimal_exponent += n;
|
|
|
|
// We may now just stop. But instead look if the buffer could be
|
|
// decremented to bring V closer to w.
|
|
//
|
|
// pow10 = 10^n is now 1 ulp in the decimal representation V.
|
|
// The rounding procedure works with diyfp's with an implicit
|
|
// exponent of e.
|
|
//
|
|
// 10^n = (10^n * 2^-e) * 2^e = ulp * 2^e
|
|
//
|
|
const std::uint64_t ten_n = std::uint64_t{pow10} << -one.e;
|
|
grisu2_round(buffer, length, dist, delta, rest, ten_n);
|
|
|
|
return;
|
|
}
|
|
|
|
pow10 /= 10;
|
|
//
|
|
// pow10 = 10^(n-1) <= p1 < 10^n
|
|
// Invariants restored.
|
|
}
|
|
|
|
// 2)
|
|
//
|
|
// The digits of the integral part have been generated:
|
|
//
|
|
// M+ = d[k-1]...d[1]d[0] + p2 * 2^e
|
|
// = buffer + p2 * 2^e
|
|
//
|
|
// Now generate the digits of the fractional part p2 * 2^e.
|
|
//
|
|
// Note:
|
|
// No decimal point is generated: the exponent is adjusted instead.
|
|
//
|
|
// p2 actually represents the fraction
|
|
//
|
|
// p2 * 2^e
|
|
// = p2 / 2^-e
|
|
// = d[-1] / 10^1 + d[-2] / 10^2 + ...
|
|
//
|
|
// Now generate the digits d[-m] of p1 from left to right (m = 1,2,...)
|
|
//
|
|
// p2 * 2^e = d[-1]d[-2]...d[-m] * 10^-m
|
|
// + 10^-m * (d[-m-1] / 10^1 + d[-m-2] / 10^2 + ...)
|
|
//
|
|
// using
|
|
//
|
|
// 10^m * p2 = ((10^m * p2) div 2^-e) * 2^-e + ((10^m * p2) mod 2^-e)
|
|
// = ( d) * 2^-e + ( r)
|
|
//
|
|
// or
|
|
// 10^m * p2 * 2^e = d + r * 2^e
|
|
//
|
|
// i.e.
|
|
//
|
|
// M+ = buffer + p2 * 2^e
|
|
// = buffer + 10^-m * (d + r * 2^e)
|
|
// = (buffer * 10^m + d) * 10^-m + 10^-m * r * 2^e
|
|
//
|
|
// and stop as soon as 10^-m * r * 2^e <= delta * 2^e
|
|
|
|
int m = 0;
|
|
for (;;) {
|
|
// Invariant:
|
|
// M+ = buffer * 10^-m + 10^-m * (d[-m-1] / 10 + d[-m-2] / 10^2 + ...)
|
|
// * 2^e
|
|
// = buffer * 10^-m + 10^-m * (p2 )
|
|
// * 2^e = buffer * 10^-m + 10^-m * (1/10 * (10 * p2) ) * 2^e =
|
|
// buffer * 10^-m + 10^-m * (1/10 * ((10*p2 div 2^-e) * 2^-e +
|
|
// (10*p2 mod 2^-e)) * 2^e
|
|
//
|
|
p2 *= 10;
|
|
const std::uint64_t d = p2 >> -one.e; // d = (10 * p2) div 2^-e
|
|
const std::uint64_t r = p2 & (one.f - 1); // r = (10 * p2) mod 2^-e
|
|
//
|
|
// M+ = buffer * 10^-m + 10^-m * (1/10 * (d * 2^-e + r) * 2^e
|
|
// = buffer * 10^-m + 10^-m * (1/10 * (d + r * 2^e))
|
|
// = (buffer * 10 + d) * 10^(-m-1) + 10^(-m-1) * r * 2^e
|
|
//
|
|
buffer[length++] = static_cast<char>('0' + d); // buffer := buffer * 10 + d
|
|
//
|
|
// M+ = buffer * 10^(-m-1) + 10^(-m-1) * r * 2^e
|
|
//
|
|
p2 = r;
|
|
m++;
|
|
//
|
|
// M+ = buffer * 10^-m + 10^-m * p2 * 2^e
|
|
// Invariant restored.
|
|
|
|
// Check if enough digits have been generated.
|
|
//
|
|
// 10^-m * p2 * 2^e <= delta * 2^e
|
|
// p2 * 2^e <= 10^m * delta * 2^e
|
|
// p2 <= 10^m * delta
|
|
delta *= 10;
|
|
dist *= 10;
|
|
if (p2 <= delta) {
|
|
break;
|
|
}
|
|
}
|
|
|
|
// V = buffer * 10^-m, with M- <= V <= M+.
|
|
|
|
decimal_exponent -= m;
|
|
|
|
// 1 ulp in the decimal representation is now 10^-m.
|
|
// Since delta and dist are now scaled by 10^m, we need to do the
|
|
// same with ulp in order to keep the units in sync.
|
|
//
|
|
// 10^m * 10^-m = 1 = 2^-e * 2^e = ten_m * 2^e
|
|
//
|
|
const std::uint64_t ten_m = one.f;
|
|
grisu2_round(buffer, length, dist, delta, p2, ten_m);
|
|
|
|
// By construction this algorithm generates the shortest possible decimal
|
|
// number (Loitsch, Theorem 6.2) which rounds back to w.
|
|
// For an input number of precision p, at least
|
|
//
|
|
// N = 1 + ceil(p * log_10(2))
|
|
//
|
|
// decimal digits are sufficient to identify all binary floating-point
|
|
// numbers (Matula, "In-and-Out conversions").
|
|
// This implies that the algorithm does not produce more than N decimal
|
|
// digits.
|
|
//
|
|
// N = 17 for p = 53 (IEEE double precision)
|
|
// N = 9 for p = 24 (IEEE single precision)
|
|
}
|
|
|
|
/*!
|
|
v = buf * 10^decimal_exponent
|
|
len is the length of the buffer (number of decimal digits)
|
|
The buffer must be large enough, i.e. >= max_digits10.
|
|
*/
|
|
inline void grisu2(char *buf, int &len, int &decimal_exponent, diyfp m_minus,
|
|
diyfp v, diyfp m_plus) {
|
|
|
|
// --------(-----------------------+-----------------------)-------- (A)
|
|
// m- v m+
|
|
//
|
|
// --------------------(-----------+-----------------------)-------- (B)
|
|
// m- v m+
|
|
//
|
|
// First scale v (and m- and m+) such that the exponent is in the range
|
|
// [alpha, gamma].
|
|
|
|
const cached_power cached = get_cached_power_for_binary_exponent(m_plus.e);
|
|
|
|
const diyfp c_minus_k(cached.f, cached.e); // = c ~= 10^-k
|
|
|
|
// The exponent of the products is = v.e + c_minus_k.e + q and is in the range
|
|
// [alpha,gamma]
|
|
const diyfp w = diyfp::mul(v, c_minus_k);
|
|
const diyfp w_minus = diyfp::mul(m_minus, c_minus_k);
|
|
const diyfp w_plus = diyfp::mul(m_plus, c_minus_k);
|
|
|
|
// ----(---+---)---------------(---+---)---------------(---+---)----
|
|
// w- w w+
|
|
// = c*m- = c*v = c*m+
|
|
//
|
|
// diyfp::mul rounds its result and c_minus_k is approximated too. w, w- and
|
|
// w+ are now off by a small amount.
|
|
// In fact:
|
|
//
|
|
// w - v * 10^k < 1 ulp
|
|
//
|
|
// To account for this inaccuracy, add resp. subtract 1 ulp.
|
|
//
|
|
// --------+---[---------------(---+---)---------------]---+--------
|
|
// w- M- w M+ w+
|
|
//
|
|
// Now any number in [M-, M+] (bounds included) will round to w when input,
|
|
// regardless of how the input rounding algorithm breaks ties.
|
|
//
|
|
// And digit_gen generates the shortest possible such number in [M-, M+].
|
|
// Note that this does not mean that Grisu2 always generates the shortest
|
|
// possible number in the interval (m-, m+).
|
|
const diyfp M_minus(w_minus.f + 1, w_minus.e);
|
|
const diyfp M_plus(w_plus.f - 1, w_plus.e);
|
|
|
|
decimal_exponent = -cached.k; // = -(-k) = k
|
|
|
|
grisu2_digit_gen(buf, len, decimal_exponent, M_minus, w, M_plus);
|
|
}
|
|
|
|
/*!
|
|
v = buf * 10^decimal_exponent
|
|
len is the length of the buffer (number of decimal digits)
|
|
The buffer must be large enough, i.e. >= max_digits10.
|
|
*/
|
|
template <typename FloatType>
|
|
void grisu2(char *buf, int &len, int &decimal_exponent, FloatType value) {
|
|
static_assert(diyfp::kPrecision >= std::numeric_limits<FloatType>::digits + 3,
|
|
"internal error: not enough precision");
|
|
|
|
// If the neighbors (and boundaries) of 'value' are always computed for
|
|
// double-precision numbers, all float's can be recovered using strtod (and
|
|
// strtof). However, the resulting decimal representations are not exactly
|
|
// "short".
|
|
//
|
|
// The documentation for 'std::to_chars'
|
|
// (https://en.cppreference.com/w/cpp/utility/to_chars) says "value is
|
|
// converted to a string as if by std::sprintf in the default ("C") locale"
|
|
// and since sprintf promotes float's to double's, I think this is exactly
|
|
// what 'std::to_chars' does. On the other hand, the documentation for
|
|
// 'std::to_chars' requires that "parsing the representation using the
|
|
// corresponding std::from_chars function recovers value exactly". That
|
|
// indicates that single precision floating-point numbers should be recovered
|
|
// using 'std::strtof'.
|
|
//
|
|
// NB: If the neighbors are computed for single-precision numbers, there is a
|
|
// single float
|
|
// (7.0385307e-26f) which can't be recovered using strtod. The resulting
|
|
// double precision value is off by 1 ulp.
|
|
#if 0
|
|
const boundaries w = compute_boundaries(static_cast<double>(value));
|
|
#else
|
|
const boundaries w = compute_boundaries(value);
|
|
#endif
|
|
|
|
grisu2(buf, len, decimal_exponent, w.minus, w.w, w.plus);
|
|
}
|
|
|
|
/*!
@brief appends a decimal representation of e to buf

Writes an explicit sign followed by at least two digits (for compatibility
with printf("%g")).

@return a pointer to the element following the exponent.
@pre -1000 < e < 1000
*/
inline char *append_exponent(char *buf, int e) {
  const bool is_negative = e < 0;
  *buf++ = is_negative ? '-' : '+';

  const std::uint32_t magnitude =
      static_cast<std::uint32_t>(is_negative ? -e : e);

  // The hundreds digit only appears for three-digit exponents.
  if (magnitude >= 100) {
    *buf++ = static_cast<char>('0' + magnitude / 100);
  }
  // Tens and ones are always written, which pads single digits to "0d".
  *buf++ = static_cast<char>('0' + (magnitude / 10) % 10);
  *buf++ = static_cast<char>('0' + magnitude % 10);

  return buf;
}
|
|
|
|
/*!
|
|
@brief prettify v = buf * 10^decimal_exponent
|
|
If v is in the range [10^min_exp, 10^max_exp) it will be printed in fixed-point
|
|
notation. Otherwise it will be printed in exponential notation.
|
|
@pre min_exp < 0
|
|
@pre max_exp > 0
|
|
*/
|
|
inline char *format_buffer(char *buf, int len, int decimal_exponent,
|
|
int min_exp, int max_exp) {
|
|
|
|
const int k = len;
|
|
const int n = len + decimal_exponent;
|
|
|
|
// v = buf * 10^(n-k)
|
|
// k is the length of the buffer (number of decimal digits)
|
|
// n is the position of the decimal point relative to the start of the buffer.
|
|
|
|
if (k <= n && n <= max_exp) {
|
|
// digits[000]
|
|
// len <= max_exp + 2
|
|
|
|
std::memset(buf + k, '0', static_cast<size_t>(n) - static_cast<size_t>(k));
|
|
// Make it look like a floating-point number (#362, #378)
|
|
buf[n + 0] = '.';
|
|
buf[n + 1] = '0';
|
|
return buf + (static_cast<size_t>(n)) + 2;
|
|
}
|
|
|
|
if (0 < n && n <= max_exp) {
|
|
// dig.its
|
|
// len <= max_digits10 + 1
|
|
std::memmove(buf + (static_cast<size_t>(n) + 1), buf + n,
|
|
static_cast<size_t>(k) - static_cast<size_t>(n));
|
|
buf[n] = '.';
|
|
return buf + (static_cast<size_t>(k) + 1U);
|
|
}
|
|
|
|
if (min_exp < n && n <= 0) {
|
|
// 0.[000]digits
|
|
// len <= 2 + (-min_exp - 1) + max_digits10
|
|
|
|
std::memmove(buf + (2 + static_cast<size_t>(-n)), buf,
|
|
static_cast<size_t>(k));
|
|
buf[0] = '0';
|
|
buf[1] = '.';
|
|
std::memset(buf + 2, '0', static_cast<size_t>(-n));
|
|
return buf + (2U + static_cast<size_t>(-n) + static_cast<size_t>(k));
|
|
}
|
|
|
|
if (k == 1) {
|
|
// dE+123
|
|
// len <= 1 + 5
|
|
|
|
buf += 1;
|
|
} else {
|
|
// d.igitsE+123
|
|
// len <= max_digits10 + 1 + 5
|
|
|
|
std::memmove(buf + 2, buf + 1, static_cast<size_t>(k) - 1);
|
|
buf[1] = '.';
|
|
buf += 1 + static_cast<size_t>(k);
|
|
}
|
|
|
|
*buf++ = 'e';
|
|
return append_exponent(buf, n - 1);
|
|
}
|
|
|
|
} // namespace dtoa_impl
|
|
|
|
/*!
|
|
The format of the resulting decimal representation is similar to printf's %g
|
|
format. Returns an iterator pointing past-the-end of the decimal representation.
|
|
@note The input number must be finite, i.e. NaN's and Inf's are not supported.
|
|
@note The buffer must be large enough.
|
|
@note The result is NOT null-terminated.
|
|
*/
|
|
char *to_chars(char *first, const char *last, double value) {
|
|
static_cast<void>(last); // maybe unused - fix warning
|
|
bool negative = std::signbit(value);
|
|
if (negative) {
|
|
value = -value;
|
|
*first++ = '-';
|
|
}
|
|
|
|
if (value == 0) // +-0
|
|
{
|
|
*first++ = '0';
|
|
// Make it look like a floating-point number (#362, #378)
|
|
*first++ = '.';
|
|
*first++ = '0';
|
|
return first;
|
|
}
|
|
// Compute v = buffer * 10^decimal_exponent.
|
|
// The decimal digits are stored in the buffer, which needs to be interpreted
|
|
// as an unsigned decimal integer.
|
|
// len is the length of the buffer, i.e. the number of decimal digits.
|
|
int len = 0;
|
|
int decimal_exponent = 0;
|
|
dtoa_impl::grisu2(first, len, decimal_exponent, value);
|
|
// Format the buffer like printf("%.*g", prec, value)
|
|
constexpr int kMinExp = -4;
|
|
constexpr int kMaxExp = std::numeric_limits<double>::digits10;
|
|
|
|
return dtoa_impl::format_buffer(first, len, decimal_exponent, kMinExp,
|
|
kMaxExp);
|
|
}
|
|
} // namespace internal
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_TO_CHARS_CPP
|
|
/* end file to_chars.cpp */
|
|
/* including from_chars.cpp: #include <from_chars.cpp> */
|
|
/* begin file from_chars.cpp */
|
|
#ifndef SIMDJSON_SRC_FROM_CHARS_CPP
|
|
#define SIMDJSON_SRC_FROM_CHARS_CPP
|
|
|
|
/* skipped duplicate #include <base.h> */
|
|
|
|
#include <cstdint>
|
|
#include <cstring>
|
|
#include <limits>
|
|
|
|
namespace simdjson {
|
|
namespace internal {
|
|
|
|
/**
|
|
* The code in the internal::from_chars function is meant to handle the floating-point number parsing
|
|
* when we have more than 19 digits in the decimal mantissa. This should only be seen
|
|
* in adversarial scenarios: we do not expect production systems to even produce
|
|
* such floating-point numbers.
|
|
*
|
|
* The parser is based on work by Nigel Tao (at https://github.com/google/wuffs/)
|
|
* who credits Ken Thompson for the design (via a reference to the Go source
|
|
* code). See
|
|
* https://github.com/google/wuffs/blob/aa46859ea40c72516deffa1b146121952d6dfd3b/internal/cgen/base/floatconv-submodule-data.c
|
|
* https://github.com/google/wuffs/blob/46cd8105f47ca07ae2ba8e6a7818ef9c0df6c152/internal/cgen/base/floatconv-submodule-code.c
|
|
* It is probably not very fast but it is a fallback that should almost never be
|
|
 * called in real life. Google Wuffs is published under the Apache License 2.0.
|
|
**/
|
|
|
|
namespace {
// Capacity of decimal::digits; digits beyond this are counted but not stored.
constexpr uint32_t max_digits = 768;
// Limit on the magnitude of a decimal point position.
constexpr int32_t decimal_point_range = 2047;
} // namespace
|
|
|
|
// A mantissa together with a power-of-two exponent; starts out as all zeros.
struct adjusted_mantissa {
  uint64_t mantissa{0};
  int power2{0};
  adjusted_mantissa() {}
};
|
|
|
|
struct decimal {
|
|
uint32_t num_digits;
|
|
int32_t decimal_point;
|
|
bool negative;
|
|
bool truncated;
|
|
uint8_t digits[max_digits];
|
|
};
|
|
|
|
// Layout constants of a binary floating-point format. Only declared here;
// each supported type supplies the values through explicit specializations.
template <typename T> struct binary_format {
  static constexpr int mantissa_explicit_bits();
  static constexpr int minimum_exponent();
  static constexpr int infinite_power();
  static constexpr int sign_index();
};

// --- double (64-bit) ---

// Number of explicitly stored mantissa bits.
template <>
constexpr int binary_format<double>::mantissa_explicit_bits() {
  return 52;
}

template <>
constexpr int binary_format<double>::minimum_exponent() {
  return -1023;
}

// Exponent field with all eleven bits set.
template <>
constexpr int binary_format<double>::infinite_power() {
  return 0x7FF;
}

// Bit position of the sign bit.
template <>
constexpr int binary_format<double>::sign_index() {
  return 63;
}
|
|
|
|
// True when c is an ASCII decimal digit. Subtracting '0' and comparing as
// unsigned folds both range checks into one: anything below '0' wraps around
// to a large value.
bool is_integer(char c) noexcept {
  return static_cast<unsigned>(c - '0') < 10u;
}
|
|
|
|
// This should always succeed since it follows a call to parse_number.
|
|
decimal parse_decimal(const char *&p) noexcept {
|
|
decimal answer;
|
|
answer.num_digits = 0;
|
|
answer.decimal_point = 0;
|
|
answer.truncated = false;
|
|
answer.negative = (*p == '-');
|
|
if ((*p == '-') || (*p == '+')) {
|
|
++p;
|
|
}
|
|
|
|
while (*p == '0') {
|
|
++p;
|
|
}
|
|
while (is_integer(*p)) {
|
|
if (answer.num_digits < max_digits) {
|
|
answer.digits[answer.num_digits] = uint8_t(*p - '0');
|
|
}
|
|
answer.num_digits++;
|
|
++p;
|
|
}
|
|
if (*p == '.') {
|
|
++p;
|
|
const char *first_after_period = p;
|
|
// if we have not yet encountered a zero, we have to skip it as well
|
|
if (answer.num_digits == 0) {
|
|
// skip zeros
|
|
while (*p == '0') {
|
|
++p;
|
|
}
|
|
}
|
|
while (is_integer(*p)) {
|
|
if (answer.num_digits < max_digits) {
|
|
answer.digits[answer.num_digits] = uint8_t(*p - '0');
|
|
}
|
|
answer.num_digits++;
|
|
++p;
|
|
}
|
|
answer.decimal_point = int32_t(first_after_period - p);
|
|
}
|
|
if(answer.num_digits > 0) {
|
|
const char *preverse = p - 1;
|
|
int32_t trailing_zeros = 0;
|
|
while ((*preverse == '0') || (*preverse == '.')) {
|
|
if(*preverse == '0') { trailing_zeros++; };
|
|
--preverse;
|
|
}
|
|
answer.decimal_point += int32_t(answer.num_digits);
|
|
answer.num_digits -= uint32_t(trailing_zeros);
|
|
}
|
|
if(answer.num_digits > max_digits ) {
|
|
answer.num_digits = max_digits;
|
|
answer.truncated = true;
|
|
}
|
|
if (('e' == *p) || ('E' == *p)) {
|
|
++p;
|
|
bool neg_exp = false;
|
|
if ('-' == *p) {
|
|
neg_exp = true;
|
|
++p;
|
|
} else if ('+' == *p) {
|
|
++p;
|
|
}
|
|
int32_t exp_number = 0; // exponential part
|
|
while (is_integer(*p)) {
|
|
uint8_t digit = uint8_t(*p - '0');
|
|
if (exp_number < 0x10000) {
|
|
exp_number = 10 * exp_number + digit;
|
|
}
|
|
++p;
|
|
}
|
|
answer.decimal_point += (neg_exp ? -exp_number : exp_number);
|
|
}
|
|
return answer;
|
|
}
|
|
|
|
// This should always succeed since it follows a call to parse_number.
|
|
// Will not read at or beyond the "end" pointer.
|
|
decimal parse_decimal(const char *&p, const char * end) noexcept {
|
|
decimal answer;
|
|
answer.num_digits = 0;
|
|
answer.decimal_point = 0;
|
|
answer.truncated = false;
|
|
if(p == end) { return answer; } // should never happen
|
|
answer.negative = (*p == '-');
|
|
if ((*p == '-') || (*p == '+')) {
|
|
++p;
|
|
}
|
|
|
|
while ((p != end) && (*p == '0')) {
|
|
++p;
|
|
}
|
|
while ((p != end) && is_integer(*p)) {
|
|
if (answer.num_digits < max_digits) {
|
|
answer.digits[answer.num_digits] = uint8_t(*p - '0');
|
|
}
|
|
answer.num_digits++;
|
|
++p;
|
|
}
|
|
if ((p != end) && (*p == '.')) {
|
|
++p;
|
|
if(p == end) { return answer; } // should never happen
|
|
const char *first_after_period = p;
|
|
// if we have not yet encountered a zero, we have to skip it as well
|
|
if (answer.num_digits == 0) {
|
|
// skip zeros
|
|
while (*p == '0') {
|
|
++p;
|
|
}
|
|
}
|
|
while ((p != end) && is_integer(*p)) {
|
|
if (answer.num_digits < max_digits) {
|
|
answer.digits[answer.num_digits] = uint8_t(*p - '0');
|
|
}
|
|
answer.num_digits++;
|
|
++p;
|
|
}
|
|
answer.decimal_point = int32_t(first_after_period - p);
|
|
}
|
|
if(answer.num_digits > 0) {
|
|
const char *preverse = p - 1;
|
|
int32_t trailing_zeros = 0;
|
|
while ((*preverse == '0') || (*preverse == '.')) {
|
|
if(*preverse == '0') { trailing_zeros++; };
|
|
--preverse;
|
|
}
|
|
answer.decimal_point += int32_t(answer.num_digits);
|
|
answer.num_digits -= uint32_t(trailing_zeros);
|
|
}
|
|
if(answer.num_digits > max_digits ) {
|
|
answer.num_digits = max_digits;
|
|
answer.truncated = true;
|
|
}
|
|
if ((p != end) && (('e' == *p) || ('E' == *p))) {
|
|
++p;
|
|
if(p == end) { return answer; } // should never happen
|
|
bool neg_exp = false;
|
|
if ('-' == *p) {
|
|
neg_exp = true;
|
|
++p;
|
|
} else if ('+' == *p) {
|
|
++p;
|
|
}
|
|
int32_t exp_number = 0; // exponential part
|
|
while ((p != end) && is_integer(*p)) {
|
|
uint8_t digit = uint8_t(*p - '0');
|
|
if (exp_number < 0x10000) {
|
|
exp_number = 10 * exp_number + digit;
|
|
}
|
|
++p;
|
|
}
|
|
answer.decimal_point += (neg_exp ? -exp_number : exp_number);
|
|
}
|
|
return answer;
|
|
}
|
|
|
|
namespace {
|
|
|
|
// remove all final zeroes
|
|
inline void trim(decimal &h) {
|
|
while ((h.num_digits > 0) && (h.digits[h.num_digits - 1] == 0)) {
|
|
h.num_digits--;
|
|
}
|
|
}
|
|
|
|
// Returns how many digits h gains when it is multiplied by 2^shift
// (shift is reduced modulo 64 first).
//
// Each entry of the first table packs two values: the upper bits (>> 11) hold
// a digit count and the low 11 bits hold an offset into the second table,
// which stores the decimal digits of successive powers of five back to back.
// The digits of h are compared against the slice [offset(shift),
// offset(shift + 1)): if h is lexicographically smaller than that slice (or
// runs out of digits first) the result is one less than the packed count.
uint32_t number_of_digits_decimal_left_shift(decimal &h, uint32_t shift) {
  shift &= 63;
  const static uint16_t number_of_digits_decimal_left_shift_table[65] = {
      0x0000, 0x0800, 0x0801, 0x0803, 0x1006, 0x1009, 0x100D, 0x1812, 0x1817,
      0x181D, 0x2024, 0x202B, 0x2033, 0x203C, 0x2846, 0x2850, 0x285B, 0x3067,
      0x3073, 0x3080, 0x388E, 0x389C, 0x38AB, 0x38BB, 0x40CC, 0x40DD, 0x40EF,
      0x4902, 0x4915, 0x4929, 0x513E, 0x5153, 0x5169, 0x5180, 0x5998, 0x59B0,
      0x59C9, 0x61E3, 0x61FD, 0x6218, 0x6A34, 0x6A50, 0x6A6D, 0x6A8B, 0x72AA,
      0x72C9, 0x72E9, 0x7B0A, 0x7B2B, 0x7B4D, 0x8370, 0x8393, 0x83B7, 0x83DC,
      0x8C02, 0x8C28, 0x8C4F, 0x9477, 0x949F, 0x94C8, 0x9CF2, 0x051C, 0x051C,
      0x051C, 0x051C,
  };
  const static uint8_t
      number_of_digits_decimal_left_shift_table_powers_of_5[0x051C] = {
          5, 2, 5, 1, 2, 5, 6, 2, 5, 3, 1, 2, 5, 1, 5, 6, 2, 5, 7, 8, 1, 2, 5,
          3, 9, 0, 6, 2, 5, 1, 9, 5, 3, 1, 2, 5, 9, 7, 6, 5, 6, 2, 5, 4, 8, 8,
          2, 8, 1, 2, 5, 2, 4, 4, 1, 4, 0, 6, 2, 5, 1, 2, 2, 0, 7, 0, 3, 1, 2,
          5, 6, 1, 0, 3, 5, 1, 5, 6, 2, 5, 3, 0, 5, 1, 7, 5, 7, 8, 1, 2, 5, 1,
          5, 2, 5, 8, 7, 8, 9, 0, 6, 2, 5, 7, 6, 2, 9, 3, 9, 4, 5, 3, 1, 2, 5,
          3, 8, 1, 4, 6, 9, 7, 2, 6, 5, 6, 2, 5, 1, 9, 0, 7, 3, 4, 8, 6, 3, 2,
          8, 1, 2, 5, 9, 5, 3, 6, 7, 4, 3, 1, 6, 4, 0, 6, 2, 5, 4, 7, 6, 8, 3,
          7, 1, 5, 8, 2, 0, 3, 1, 2, 5, 2, 3, 8, 4, 1, 8, 5, 7, 9, 1, 0, 1, 5,
          6, 2, 5, 1, 1, 9, 2, 0, 9, 2, 8, 9, 5, 5, 0, 7, 8, 1, 2, 5, 5, 9, 6,
          0, 4, 6, 4, 4, 7, 7, 5, 3, 9, 0, 6, 2, 5, 2, 9, 8, 0, 2, 3, 2, 2, 3,
          8, 7, 6, 9, 5, 3, 1, 2, 5, 1, 4, 9, 0, 1, 1, 6, 1, 1, 9, 3, 8, 4, 7,
          6, 5, 6, 2, 5, 7, 4, 5, 0, 5, 8, 0, 5, 9, 6, 9, 2, 3, 8, 2, 8, 1, 2,
          5, 3, 7, 2, 5, 2, 9, 0, 2, 9, 8, 4, 6, 1, 9, 1, 4, 0, 6, 2, 5, 1, 8,
          6, 2, 6, 4, 5, 1, 4, 9, 2, 3, 0, 9, 5, 7, 0, 3, 1, 2, 5, 9, 3, 1, 3,
          2, 2, 5, 7, 4, 6, 1, 5, 4, 7, 8, 5, 1, 5, 6, 2, 5, 4, 6, 5, 6, 6, 1,
          2, 8, 7, 3, 0, 7, 7, 3, 9, 2, 5, 7, 8, 1, 2, 5, 2, 3, 2, 8, 3, 0, 6,
          4, 3, 6, 5, 3, 8, 6, 9, 6, 2, 8, 9, 0, 6, 2, 5, 1, 1, 6, 4, 1, 5, 3,
          2, 1, 8, 2, 6, 9, 3, 4, 8, 1, 4, 4, 5, 3, 1, 2, 5, 5, 8, 2, 0, 7, 6,
          6, 0, 9, 1, 3, 4, 6, 7, 4, 0, 7, 2, 2, 6, 5, 6, 2, 5, 2, 9, 1, 0, 3,
          8, 3, 0, 4, 5, 6, 7, 3, 3, 7, 0, 3, 6, 1, 3, 2, 8, 1, 2, 5, 1, 4, 5,
          5, 1, 9, 1, 5, 2, 2, 8, 3, 6, 6, 8, 5, 1, 8, 0, 6, 6, 4, 0, 6, 2, 5,
          7, 2, 7, 5, 9, 5, 7, 6, 1, 4, 1, 8, 3, 4, 2, 5, 9, 0, 3, 3, 2, 0, 3,
          1, 2, 5, 3, 6, 3, 7, 9, 7, 8, 8, 0, 7, 0, 9, 1, 7, 1, 2, 9, 5, 1, 6,
          6, 0, 1, 5, 6, 2, 5, 1, 8, 1, 8, 9, 8, 9, 4, 0, 3, 5, 4, 5, 8, 5, 6,
          4, 7, 5, 8, 3, 0, 0, 7, 8, 1, 2, 5, 9, 0, 9, 4, 9, 4, 7, 0, 1, 7, 7,
          2, 9, 2, 8, 2, 3, 7, 9, 1, 5, 0, 3, 9, 0, 6, 2, 5, 4, 5, 4, 7, 4, 7,
          3, 5, 0, 8, 8, 6, 4, 6, 4, 1, 1, 8, 9, 5, 7, 5, 1, 9, 5, 3, 1, 2, 5,
          2, 2, 7, 3, 7, 3, 6, 7, 5, 4, 4, 3, 2, 3, 2, 0, 5, 9, 4, 7, 8, 7, 5,
          9, 7, 6, 5, 6, 2, 5, 1, 1, 3, 6, 8, 6, 8, 3, 7, 7, 2, 1, 6, 1, 6, 0,
          2, 9, 7, 3, 9, 3, 7, 9, 8, 8, 2, 8, 1, 2, 5, 5, 6, 8, 4, 3, 4, 1, 8,
          8, 6, 0, 8, 0, 8, 0, 1, 4, 8, 6, 9, 6, 8, 9, 9, 4, 1, 4, 0, 6, 2, 5,
          2, 8, 4, 2, 1, 7, 0, 9, 4, 3, 0, 4, 0, 4, 0, 0, 7, 4, 3, 4, 8, 4, 4,
          9, 7, 0, 7, 0, 3, 1, 2, 5, 1, 4, 2, 1, 0, 8, 5, 4, 7, 1, 5, 2, 0, 2,
          0, 0, 3, 7, 1, 7, 4, 2, 2, 4, 8, 5, 3, 5, 1, 5, 6, 2, 5, 7, 1, 0, 5,
          4, 2, 7, 3, 5, 7, 6, 0, 1, 0, 0, 1, 8, 5, 8, 7, 1, 1, 2, 4, 2, 6, 7,
          5, 7, 8, 1, 2, 5, 3, 5, 5, 2, 7, 1, 3, 6, 7, 8, 8, 0, 0, 5, 0, 0, 9,
          2, 9, 3, 5, 5, 6, 2, 1, 3, 3, 7, 8, 9, 0, 6, 2, 5, 1, 7, 7, 6, 3, 5,
          6, 8, 3, 9, 4, 0, 0, 2, 5, 0, 4, 6, 4, 6, 7, 7, 8, 1, 0, 6, 6, 8, 9,
          4, 5, 3, 1, 2, 5, 8, 8, 8, 1, 7, 8, 4, 1, 9, 7, 0, 0, 1, 2, 5, 2, 3,
          2, 3, 3, 8, 9, 0, 5, 3, 3, 4, 4, 7, 2, 6, 5, 6, 2, 5, 4, 4, 4, 0, 8,
          9, 2, 0, 9, 8, 5, 0, 0, 6, 2, 6, 1, 6, 1, 6, 9, 4, 5, 2, 6, 6, 7, 2,
          3, 6, 3, 2, 8, 1, 2, 5, 2, 2, 2, 0, 4, 4, 6, 0, 4, 9, 2, 5, 0, 3, 1,
          3, 0, 8, 0, 8, 4, 7, 2, 6, 3, 3, 3, 6, 1, 8, 1, 6, 4, 0, 6, 2, 5, 1,
          1, 1, 0, 2, 2, 3, 0, 2, 4, 6, 2, 5, 1, 5, 6, 5, 4, 0, 4, 2, 3, 6, 3,
          1, 6, 6, 8, 0, 9, 0, 8, 2, 0, 3, 1, 2, 5, 5, 5, 5, 1, 1, 1, 5, 1, 2,
          3, 1, 2, 5, 7, 8, 2, 7, 0, 2, 1, 1, 8, 1, 5, 8, 3, 4, 0, 4, 5, 4, 1,
          0, 1, 5, 6, 2, 5, 2, 7, 7, 5, 5, 5, 7, 5, 6, 1, 5, 6, 2, 8, 9, 1, 3,
          5, 1, 0, 5, 9, 0, 7, 9, 1, 7, 0, 2, 2, 7, 0, 5, 0, 7, 8, 1, 2, 5, 1,
          3, 8, 7, 7, 7, 8, 7, 8, 0, 7, 8, 1, 4, 4, 5, 6, 7, 5, 5, 2, 9, 5, 3,
          9, 5, 8, 5, 1, 1, 3, 5, 2, 5, 3, 9, 0, 6, 2, 5, 6, 9, 3, 8, 8, 9, 3,
          9, 0, 3, 9, 0, 7, 2, 2, 8, 3, 7, 7, 6, 4, 7, 6, 9, 7, 9, 2, 5, 5, 6,
          7, 6, 2, 6, 9, 5, 3, 1, 2, 5, 3, 4, 6, 9, 4, 4, 6, 9, 5, 1, 9, 5, 3,
          6, 1, 4, 1, 8, 8, 8, 2, 3, 8, 4, 8, 9, 6, 2, 7, 8, 3, 8, 1, 3, 4, 7,
          6, 5, 6, 2, 5, 1, 7, 3, 4, 7, 2, 3, 4, 7, 5, 9, 7, 6, 8, 0, 7, 0, 9,
          4, 4, 1, 1, 9, 2, 4, 4, 8, 1, 3, 9, 1, 9, 0, 6, 7, 3, 8, 2, 8, 1, 2,
          5, 8, 6, 7, 3, 6, 1, 7, 3, 7, 9, 8, 8, 4, 0, 3, 5, 4, 7, 2, 0, 5, 9,
          6, 2, 2, 4, 0, 6, 9, 5, 9, 5, 3, 3, 6, 9, 1, 4, 0, 6, 2, 5,
      };

  // Unpack the entry for this shift and the offset stored in the next one.
  const uint32_t entry = number_of_digits_decimal_left_shift_table[shift];
  const uint32_t next_entry = number_of_digits_decimal_left_shift_table[shift + 1];
  const uint32_t num_new_digits = entry >> 11;
  const uint32_t pow5_begin = 0x7FF & entry;
  const uint32_t pow5_end = 0x7FF & next_entry;
  const uint8_t *pow5 =
      &number_of_digits_decimal_left_shift_table_powers_of_5[pow5_begin];
  const uint32_t pow5_len = pow5_end - pow5_begin;

  // Lexicographic comparison of h's leading digits with the table slice.
  for (uint32_t i = 0; i < pow5_len; i++) {
    if (i >= h.num_digits) {
      return num_new_digits - 1; // h ran out of digits first
    }
    if (h.digits[i] != pow5[i]) {
      return (h.digits[i] < pow5[i]) ? (num_new_digits - 1) : num_new_digits;
    }
  }
  return num_new_digits;
}
|
|
|
|
} // end of anonymous namespace
|
|
|
|
// Rounds h to the nearest integer and returns it.
//
// Returns 0 when h has no digits or a negative decimal point, and UINT64_MAX
// when the decimal point is beyond 18 digits. A value exactly halfway between
// two integers is rounded to even, unless h.truncated is set, in which case
// it is rounded up.
uint64_t round(decimal &h) {
  if ((h.num_digits == 0) || (h.decimal_point < 0)) {
    return 0;
  }
  if (h.decimal_point > 18) {
    return UINT64_MAX;
  }
  // From here on h.decimal_point is known to be in [0, 18].
  const uint32_t int_digits = uint32_t(h.decimal_point);

  // Accumulate the integer part, padding with zeros past the stored digits.
  uint64_t value = 0;
  for (uint32_t i = 0; i < int_digits; i++) {
    value = (10 * value) + ((i < h.num_digits) ? h.digits[i] : 0);
  }

  // No fractional digit stored: nothing to round.
  if (int_digits >= h.num_digits) {
    return value;
  }

  bool round_up;
  if ((h.digits[int_digits] == 5) && (int_digits + 1 == h.num_digits)) {
    // The fraction is exactly "5": round to even unless digits were dropped.
    round_up = h.truncated || ((int_digits > 0) && (1 & h.digits[int_digits - 1]));
  } else {
    round_up = h.digits[int_digits] >= 5;
  }
  return round_up ? (value + 1) : value;
}
|
|
|
|
// computes h * 2^-shift
|
|
void decimal_left_shift(decimal &h, uint32_t shift) {
|
|
if (h.num_digits == 0) {
|
|
return;
|
|
}
|
|
uint32_t num_new_digits = number_of_digits_decimal_left_shift(h, shift);
|
|
int32_t read_index = int32_t(h.num_digits - 1);
|
|
uint32_t write_index = h.num_digits - 1 + num_new_digits;
|
|
uint64_t n = 0;
|
|
|
|
while (read_index >= 0) {
|
|
n += uint64_t(h.digits[read_index]) << shift;
|
|
uint64_t quotient = n / 10;
|
|
uint64_t remainder = n - (10 * quotient);
|
|
if (write_index < max_digits) {
|
|
h.digits[write_index] = uint8_t(remainder);
|
|
} else if (remainder > 0) {
|
|
h.truncated = true;
|
|
}
|
|
n = quotient;
|
|
write_index--;
|
|
read_index--;
|
|
}
|
|
while (n > 0) {
|
|
uint64_t quotient = n / 10;
|
|
uint64_t remainder = n - (10 * quotient);
|
|
if (write_index < max_digits) {
|
|
h.digits[write_index] = uint8_t(remainder);
|
|
} else if (remainder > 0) {
|
|
h.truncated = true;
|
|
}
|
|
n = quotient;
|
|
write_index--;
|
|
}
|
|
h.num_digits += num_new_digits;
|
|
if (h.num_digits > max_digits) {
|
|
h.num_digits = max_digits;
|
|
}
|
|
h.decimal_point += int32_t(num_new_digits);
|
|
trim(h);
|
|
}
|
|
|
|
// computes h * 2^shift
|
|
void decimal_right_shift(decimal &h, uint32_t shift) {
|
|
uint32_t read_index = 0;
|
|
uint32_t write_index = 0;
|
|
|
|
uint64_t n = 0;
|
|
|
|
while ((n >> shift) == 0) {
|
|
if (read_index < h.num_digits) {
|
|
n = (10 * n) + h.digits[read_index++];
|
|
} else if (n == 0) {
|
|
return;
|
|
} else {
|
|
while ((n >> shift) == 0) {
|
|
n = 10 * n;
|
|
read_index++;
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
h.decimal_point -= int32_t(read_index - 1);
|
|
if (h.decimal_point < -decimal_point_range) { // it is zero
|
|
h.num_digits = 0;
|
|
h.decimal_point = 0;
|
|
h.negative = false;
|
|
h.truncated = false;
|
|
return;
|
|
}
|
|
uint64_t mask = (uint64_t(1) << shift) - 1;
|
|
while (read_index < h.num_digits) {
|
|
uint8_t new_digit = uint8_t(n >> shift);
|
|
n = (10 * (n & mask)) + h.digits[read_index++];
|
|
h.digits[write_index++] = new_digit;
|
|
}
|
|
while (n > 0) {
|
|
uint8_t new_digit = uint8_t(n >> shift);
|
|
n = 10 * (n & mask);
|
|
if (write_index < max_digits) {
|
|
h.digits[write_index++] = new_digit;
|
|
} else if (new_digit > 0) {
|
|
h.truncated = true;
|
|
}
|
|
}
|
|
h.num_digits = write_index;
|
|
trim(h);
|
|
}
|
|
|
|
// Converts the decimal d into the biased exponent (power2) and explicit
// mantissa bits of the floating-point format described by `binary`.
//
// d is modified in the process. Zero is returned as power2 == 0 and
// mantissa == 0; infinity as power2 == binary::infinite_power() and
// mantissa == 0.
//
// Outline:
//  1. Scale d by powers of two until its value lies in [1/2, 1), tracking the
//     binary exponent in exp2.
//  2. Shift right further if the exponent is below the minimum of the format.
//  3. Shift left by the mantissa width, round to an integer and assemble the
//     result.
template <typename binary> adjusted_mantissa compute_float(decimal &d) {
  adjusted_mantissa answer;
  if (d.num_digits == 0) {
    // should be zero
    answer.power2 = 0;
    answer.mantissa = 0;
    return answer;
  }
  // At this point, going further, we can assume that d.num_digits > 0.
  // We want to guard against excessive decimal point values because
  // they can result in long running times. Indeed, we do
  // shifts by at most 60 bits. We have that log(10**400)/log(2**60) ~= 22
  // which is fine, but log(10**299995)/log(2**60) ~= 16609 which is not
  // fine (runs for a long time).
  //
  if(d.decimal_point < -324) {
    // We have something smaller than 1e-324 which is always zero
    // in binary64 and binary32.
    // It should be zero.
    answer.power2 = 0;
    answer.mantissa = 0;
    return answer;
  } else if(d.decimal_point >= 310) {
    // We have something at least as large as 0.1e310 which is
    // always infinite.
    answer.power2 = binary::infinite_power();
    answer.mantissa = 0;
    return answer;
  }

  // powers[n] is the shift applied when the decimal point is n digits away
  // from zero; beyond the table, max_shift is used.
  static const uint32_t max_shift = 60;
  static const uint32_t num_powers = 19;
  static const uint8_t powers[19] = {
      0,  3,  6,  9,  13, 16, 19, 23, 26, 29, //
      33, 36, 39, 43, 46, 49, 53, 56, 59,     //
  };
  int32_t exp2 = 0;
  // Shift right while the value has digits before the decimal point.
  while (d.decimal_point > 0) {
    uint32_t n = uint32_t(d.decimal_point);
    uint32_t shift = (n < num_powers) ? powers[n] : max_shift;
    decimal_right_shift(d, shift);
    if (d.decimal_point < -decimal_point_range) {
      // should be zero
      answer.power2 = 0;
      answer.mantissa = 0;
      return answer;
    }
    exp2 += int32_t(shift);
  }
  // We shift left toward [1/2 ... 1].
  while (d.decimal_point <= 0) {
    uint32_t shift;
    if (d.decimal_point == 0) {
      if (d.digits[0] >= 5) {
        break; // already at least 1/2
      }
      shift = (d.digits[0] < 2) ? 2 : 1;
    } else {
      uint32_t n = uint32_t(-d.decimal_point);
      shift = (n < num_powers) ? powers[n] : max_shift;
    }
    decimal_left_shift(d, shift);
    if (d.decimal_point > decimal_point_range) {
      // we want to get infinity: use the format's own infinite exponent, as
      // every other infinity path in this function does (a hard-coded 0xFF
      // is only correct for a format whose infinite_power() is 0xFF).
      answer.power2 = binary::infinite_power();
      answer.mantissa = 0;
      return answer;
    }
    exp2 -= int32_t(shift);
  }
  // We are now in the range [1/2 ... 1] but the binary format uses [1 ... 2].
  exp2--;
  constexpr int32_t minimum_exponent = binary::minimum_exponent();
  // Exponent below the minimum of the format: shift right (in steps of at
  // most max_shift) until exp2 reaches minimum_exponent + 1.
  while ((minimum_exponent + 1) > exp2) {
    uint32_t n = uint32_t((minimum_exponent + 1) - exp2);
    if (n > max_shift) {
      n = max_shift;
    }
    decimal_right_shift(d, n);
    exp2 += int32_t(n);
  }
  if ((exp2 - minimum_exponent) >= binary::infinite_power()) {
    answer.power2 = binary::infinite_power();
    answer.mantissa = 0;
    return answer;
  }

  // Scale so that the integer part of d holds the full mantissa, then round.
  const int mantissa_size_in_bits = binary::mantissa_explicit_bits() + 1;
  decimal_left_shift(d, mantissa_size_in_bits);

  uint64_t mantissa = round(d);
  // It is possible that we have an overflow, in which case we need
  // to shift back.
  if (mantissa >= (uint64_t(1) << mantissa_size_in_bits)) {
    decimal_right_shift(d, 1);
    exp2 += 1;
    mantissa = round(d);
    if ((exp2 - minimum_exponent) >= binary::infinite_power()) {
      answer.power2 = binary::infinite_power();
      answer.mantissa = 0;
      return answer;
    }
  }
  answer.power2 = exp2 - binary::minimum_exponent();
  // The leading mantissa bit is not set: lower the biased exponent by one.
  if (mantissa < (uint64_t(1) << binary::mantissa_explicit_bits())) {
    answer.power2--;
  }
  // Keep only the explicitly stored mantissa bits.
  answer.mantissa =
      mantissa & ((uint64_t(1) << binary::mantissa_explicit_bits()) - 1);
  return answer;
}
|
|
|
|
template <typename binary>
|
|
adjusted_mantissa parse_long_mantissa(const char *first) {
|
|
decimal d = parse_decimal(first);
|
|
return compute_float<binary>(d);
|
|
}
|
|
|
|
template <typename binary>
|
|
adjusted_mantissa parse_long_mantissa(const char *first, const char *end) {
|
|
decimal d = parse_decimal(first, end);
|
|
return compute_float<binary>(d);
|
|
}
|
|
|
|
double from_chars(const char *first) noexcept {
|
|
bool negative = first[0] == '-';
|
|
if (negative) {
|
|
first++;
|
|
}
|
|
adjusted_mantissa am = parse_long_mantissa<binary_format<double>>(first);
|
|
uint64_t word = am.mantissa;
|
|
word |= uint64_t(am.power2)
|
|
<< binary_format<double>::mantissa_explicit_bits();
|
|
word = negative ? word | (uint64_t(1) << binary_format<double>::sign_index())
|
|
: word;
|
|
double value;
|
|
std::memcpy(&value, &word, sizeof(double));
|
|
return value;
|
|
}
|
|
|
|
|
|
// Parses the number in [first, end) and returns it as a double, without
// reading at or beyond `end`.
//
// A leading '-' is consumed here and applied as the sign bit; the remaining
// text is converted by the bounded parse_long_mantissa and the IEEE bit
// pattern is assembled from the resulting exponent and mantissa. An empty
// range (first == end) is not dereferenced.
double from_chars(const char *first, const char *end) noexcept {
  // Guard the sign check so that an empty range never reads first[0].
  bool negative = (first != end) && (first[0] == '-');
  if (negative) {
    first++;
  }
  adjusted_mantissa am = parse_long_mantissa<binary_format<double>>(first, end);
  uint64_t word = am.mantissa;
  // exponent field above the explicit mantissa bits
  word |= uint64_t(am.power2)
          << binary_format<double>::mantissa_explicit_bits();
  word = negative ? word | (uint64_t(1) << binary_format<double>::sign_index())
                  : word;
  double value;
  std::memcpy(&value, &word, sizeof(double));
  return value;
}
|
|
|
|
} // internal
|
|
} // simdjson
|
|
|
|
#endif // SIMDJSON_SRC_FROM_CHARS_CPP
|
|
/* end file from_chars.cpp */
|
|
/* including internal/error_tables.cpp: #include <internal/error_tables.cpp> */
|
|
/* begin file internal/error_tables.cpp */
|
|
#ifndef SIMDJSON_SRC_ERROR_TABLES_CPP
|
|
#define SIMDJSON_SRC_ERROR_TABLES_CPP
|
|
|
|
/* including simdjson/internal/jsoncharutils_tables.h: #include <simdjson/internal/jsoncharutils_tables.h> */
|
|
/* begin file simdjson/internal/jsoncharutils_tables.h */
|
|
#ifndef SIMDJSON_INTERNAL_JSONCHARUTILS_TABLES_H
|
|
#define SIMDJSON_INTERNAL_JSONCHARUTILS_TABLES_H
|
|
|
|
/* skipped duplicate #include "simdjson/base.h" */
|
|
|
|
#ifdef JSON_TEST_STRINGS
|
|
void found_string(const uint8_t *buf, const uint8_t *parsed_begin,
|
|
const uint8_t *parsed_end);
|
|
void found_bad_string(const uint8_t *buf);
|
|
#endif
|
|
|
|
namespace simdjson {
|
|
namespace internal {
|
|
// The structural characters are
// { 0x7b } 0x7d : 0x3a [ 0x5b ] 0x5d , 0x2c (and NULL).
// We are also interested in the four whitespace characters:
// space 0x20, linefeed 0x0a, horizontal tab 0x09 and carriage return 0x0d.
|
|
|
|
extern SIMDJSON_DLLIMPORTEXPORT const bool structural_or_whitespace_negated[256];
|
|
extern SIMDJSON_DLLIMPORTEXPORT const bool structural_or_whitespace[256];
|
|
extern SIMDJSON_DLLIMPORTEXPORT const uint32_t digit_to_val32[886];
|
|
|
|
} // namespace internal
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_INTERNAL_JSONCHARUTILS_TABLES_H
|
|
/* end file simdjson/internal/jsoncharutils_tables.h */
|
|
/* including simdjson/error-inl.h: #include <simdjson/error-inl.h> */
|
|
/* begin file simdjson/error-inl.h */
|
|
#ifndef SIMDJSON_ERROR_INL_H
|
|
#define SIMDJSON_ERROR_INL_H
|
|
|
|
/* skipped duplicate #include "simdjson/error.h" */
|
|
|
|
#include <iostream>
|
|
|
|
namespace simdjson {
|
|
namespace internal {
|
|
// We store the error code so we can validate the error message is associated with the right code
|
|
struct error_code_info {
|
|
error_code code;
|
|
const char* message; // do not use a fancy std::string where a simple C string will do (no alloc, no destructor)
|
|
};
|
|
// These MUST match the codes in error_code. We check this constraint in basictests.
|
|
extern SIMDJSON_DLLIMPORTEXPORT const error_code_info error_codes[];
|
|
} // namespace internal
|
|
|
|
|
|
// Returns the message stored in internal::error_codes for `error`.
// No range check is performed: the value is trusted to come from the
// error_code enum.
inline const char *error_message(error_code error) noexcept {
  const internal::error_code_info &entry = internal::error_codes[static_cast<int>(error)];
  return entry.message;
}
|
|
|
|
// deprecated function
|
|
#ifndef SIMDJSON_DISABLE_DEPRECATED_API
|
|
inline const std::string error_message(int error) noexcept {
|
|
if (error < 0 || error >= error_code::NUM_ERROR_CODES) {
|
|
return internal::error_codes[UNEXPECTED_ERROR].message;
|
|
}
|
|
return internal::error_codes[error].message;
|
|
}
|
|
#endif // SIMDJSON_DISABLE_DEPRECATED_API
|
|
|
|
inline std::ostream& operator<<(std::ostream& out, error_code error) noexcept {
|
|
return out << error_message(error);
|
|
}
|
|
|
|
namespace internal {
|
|
|
|
//
|
|
// internal::simdjson_result_base<T> inline implementation
|
|
//
|
|
|
|
template<typename T>
|
|
simdjson_inline void simdjson_result_base<T>::tie(T &value, error_code &error) && noexcept {
|
|
error = this->second;
|
|
if (!error) {
|
|
value = std::forward<simdjson_result_base<T>>(*this).first;
|
|
}
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_warn_unused simdjson_inline error_code simdjson_result_base<T>::get(T &value) && noexcept {
|
|
error_code error;
|
|
std::forward<simdjson_result_base<T>>(*this).tie(value, error);
|
|
return error;
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_inline error_code simdjson_result_base<T>::error() const noexcept {
|
|
return this->second;
|
|
}
|
|
|
|
#if SIMDJSON_EXCEPTIONS
|
|
|
|
template<typename T>
|
|
simdjson_inline T& simdjson_result_base<T>::value() & noexcept(false) {
|
|
if (error()) { throw simdjson_error(error()); }
|
|
return this->first;
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_inline T&& simdjson_result_base<T>::value() && noexcept(false) {
|
|
return std::forward<simdjson_result_base<T>>(*this).take_value();
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_inline T&& simdjson_result_base<T>::take_value() && noexcept(false) {
|
|
if (error()) { throw simdjson_error(error()); }
|
|
return std::forward<T>(this->first);
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_inline simdjson_result_base<T>::operator T&&() && noexcept(false) {
|
|
return std::forward<simdjson_result_base<T>>(*this).take_value();
|
|
}
|
|
|
|
#endif // SIMDJSON_EXCEPTIONS
|
|
|
|
template<typename T>
|
|
simdjson_inline const T& simdjson_result_base<T>::value_unsafe() const& noexcept {
|
|
return this->first;
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_inline T&& simdjson_result_base<T>::value_unsafe() && noexcept {
|
|
return std::forward<T>(this->first);
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_inline simdjson_result_base<T>::simdjson_result_base(T &&value, error_code error) noexcept
|
|
: std::pair<T, error_code>(std::forward<T>(value), error) {}
|
|
template<typename T>
|
|
simdjson_inline simdjson_result_base<T>::simdjson_result_base(error_code error) noexcept
|
|
: simdjson_result_base(T{}, error) {}
|
|
template<typename T>
|
|
simdjson_inline simdjson_result_base<T>::simdjson_result_base(T &&value) noexcept
|
|
: simdjson_result_base(std::forward<T>(value), SUCCESS) {}
|
|
template<typename T>
|
|
simdjson_inline simdjson_result_base<T>::simdjson_result_base() noexcept
|
|
: simdjson_result_base(T{}, UNINITIALIZED) {}
|
|
|
|
} // namespace internal
|
|
|
|
///
|
|
/// simdjson_result<T> inline implementation
|
|
///
|
|
|
|
template<typename T>
|
|
simdjson_inline void simdjson_result<T>::tie(T &value, error_code &error) && noexcept {
|
|
std::forward<internal::simdjson_result_base<T>>(*this).tie(value, error);
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_warn_unused simdjson_inline error_code simdjson_result<T>::get(T &value) && noexcept {
|
|
return std::forward<internal::simdjson_result_base<T>>(*this).get(value);
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_inline error_code simdjson_result<T>::error() const noexcept {
|
|
return internal::simdjson_result_base<T>::error();
|
|
}
|
|
|
|
#if SIMDJSON_EXCEPTIONS
|
|
|
|
template<typename T>
|
|
simdjson_inline T& simdjson_result<T>::value() & noexcept(false) {
|
|
return internal::simdjson_result_base<T>::value();
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_inline T&& simdjson_result<T>::value() && noexcept(false) {
|
|
return std::forward<internal::simdjson_result_base<T>>(*this).value();
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_inline T&& simdjson_result<T>::take_value() && noexcept(false) {
|
|
return std::forward<internal::simdjson_result_base<T>>(*this).take_value();
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_inline simdjson_result<T>::operator T&&() && noexcept(false) {
|
|
return std::forward<internal::simdjson_result_base<T>>(*this).take_value();
|
|
}
|
|
|
|
#endif // SIMDJSON_EXCEPTIONS
|
|
|
|
template<typename T>
|
|
simdjson_inline const T& simdjson_result<T>::value_unsafe() const& noexcept {
|
|
return internal::simdjson_result_base<T>::value_unsafe();
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_inline T&& simdjson_result<T>::value_unsafe() && noexcept {
|
|
return std::forward<internal::simdjson_result_base<T>>(*this).value_unsafe();
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_inline simdjson_result<T>::simdjson_result(T &&value, error_code error) noexcept
|
|
: internal::simdjson_result_base<T>(std::forward<T>(value), error) {}
|
|
template<typename T>
|
|
simdjson_inline simdjson_result<T>::simdjson_result(error_code error) noexcept
|
|
: internal::simdjson_result_base<T>(error) {}
|
|
template<typename T>
|
|
simdjson_inline simdjson_result<T>::simdjson_result(T &&value) noexcept
|
|
: internal::simdjson_result_base<T>(std::forward<T>(value)) {}
|
|
template<typename T>
|
|
simdjson_inline simdjson_result<T>::simdjson_result() noexcept
|
|
: internal::simdjson_result_base<T>() {}
|
|
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_ERROR_INL_H
|
|
/* end file simdjson/error-inl.h */
|
|
|
|
namespace simdjson {
|
|
namespace internal {
|
|
|
|
// Table of error codes and their messages. error_message() indexes this
// array with the integer value of the error code, so entry i must describe
// the code whose value is i.
SIMDJSON_DLLIMPORTEXPORT const error_code_info error_codes[] = {
    { SUCCESS, "SUCCESS: No error" },
    { CAPACITY, "CAPACITY: This parser can't support a document that big" },
    { MEMALLOC, "MEMALLOC: Error allocating memory, we're most likely out of memory" },
    { TAPE_ERROR, "TAPE_ERROR: The JSON document has an improper structure: missing or superfluous commas, braces, missing keys, etc." },
    { DEPTH_ERROR, "DEPTH_ERROR: The JSON document was too deep (too many nested objects and arrays)" },
    { STRING_ERROR, "STRING_ERROR: Problem while parsing a string" },
    { T_ATOM_ERROR, "T_ATOM_ERROR: Problem while parsing an atom starting with the letter 't'" },
    { F_ATOM_ERROR, "F_ATOM_ERROR: Problem while parsing an atom starting with the letter 'f'" },
    { N_ATOM_ERROR, "N_ATOM_ERROR: Problem while parsing an atom starting with the letter 'n'" },
    { NUMBER_ERROR, "NUMBER_ERROR: Problem while parsing a number" },
    { BIGINT_ERROR, "BIGINT_ERROR: Big integer value that cannot be represented using 64 bits" },
    { UTF8_ERROR, "UTF8_ERROR: The input is not valid UTF-8" },
    { UNINITIALIZED, "UNINITIALIZED: Uninitialized" },
    { EMPTY, "EMPTY: no JSON found" },
    { UNESCAPED_CHARS, "UNESCAPED_CHARS: Within strings, some characters must be escaped, we found unescaped characters" },
    { UNCLOSED_STRING, "UNCLOSED_STRING: A string is opened, but never closed." },
    { UNSUPPORTED_ARCHITECTURE, "UNSUPPORTED_ARCHITECTURE: simdjson does not have an implementation supported by this CPU architecture. Please report this error to the core team as it should never happen." },
    { INCORRECT_TYPE, "INCORRECT_TYPE: The JSON element does not have the requested type." },
    { NUMBER_OUT_OF_RANGE, "NUMBER_OUT_OF_RANGE: The JSON number is too large or too small to fit within the requested type." },
    { INDEX_OUT_OF_BOUNDS, "INDEX_OUT_OF_BOUNDS: Attempted to access an element of a JSON array that is beyond its length." },
    { NO_SUCH_FIELD, "NO_SUCH_FIELD: The JSON field referenced does not exist in this object." },
    { IO_ERROR, "IO_ERROR: Error reading the file." },
    { INVALID_JSON_POINTER, "INVALID_JSON_POINTER: Invalid JSON pointer syntax." },
    { INVALID_URI_FRAGMENT, "INVALID_URI_FRAGMENT: Invalid URI fragment syntax." },
    { UNEXPECTED_ERROR, "UNEXPECTED_ERROR: Unexpected error, consider reporting this problem as you may have found a bug in simdjson" },
    { PARSER_IN_USE, "PARSER_IN_USE: Cannot parse a new document while a document is still in use." },
    { OUT_OF_ORDER_ITERATION, "OUT_OF_ORDER_ITERATION: Objects and arrays can only be iterated when they are first encountered." },
    { INSUFFICIENT_PADDING, "INSUFFICIENT_PADDING: simdjson requires the input JSON string to have at least SIMDJSON_PADDING extra bytes allocated, beyond the string's length. Consider using the simdjson::padded_string class if needed." },
    { INCOMPLETE_ARRAY_OR_OBJECT, "INCOMPLETE_ARRAY_OR_OBJECT: JSON document ended early in the middle of an object or array." },
    { SCALAR_DOCUMENT_AS_VALUE, "SCALAR_DOCUMENT_AS_VALUE: A JSON document made of a scalar (number, Boolean, null or string) is treated as a value. Use get_bool(), get_double(), etc. on the document instead. " },
    { OUT_OF_BOUNDS, "OUT_OF_BOUNDS: Attempt to access location outside of document." },
    { TRAILING_CONTENT, "TRAILING_CONTENT: Unexpected trailing content in the JSON input." }
}; // error_codes[]
|
|
|
|
} // namespace internal
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_ERROR_TABLES_CPP
|
|
/* end file internal/error_tables.cpp */
|
|
/* including internal/jsoncharutils_tables.cpp: #include <internal/jsoncharutils_tables.cpp> */
|
|
/* begin file internal/jsoncharutils_tables.cpp */
|
|
#ifndef SIMDJSON_SRC_JSONCHARUTILS_TABLES_CPP
|
|
#define SIMDJSON_SRC_JSONCHARUTILS_TABLES_CPP
|
|
|
|
/* skipped duplicate #include <simdjson/base.h> */
|
|
|
|
namespace simdjson {
|
|
namespace internal {
|
|
|
|
// The structural characters here are
// { 0x7b, } 0x7d, : 0x3a, [ 0x5b, ] 0x5d and , 0x2c
// (and NULL; note that the tables below do not flag byte 0x00).
// We are also interested in the four whitespace characters:
// space 0x20, linefeed 0x0a, horizontal tab 0x09 and carriage return 0x0d.
|
|
|
|
// Lookup table indexed by byte value (0..255).
// An entry is 0 when the byte is one of the six structural characters
// ({ } : [ ] ,) or one of the four whitespace characters
// (space 0x20, linefeed 0x0a, horizontal tab 0x09, carriage return 0x0d),
// and 1 for every other byte (including 0x00 and all bytes >= 0x80).
// Each row below covers 16 consecutive byte values.
SIMDJSON_DLLIMPORTEXPORT const bool structural_or_whitespace_negated[256] = {
    /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, // 0x09, 0x0a, 0x0d
    /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 */ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, // 0x20 space, 0x2c ','
    /* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, // 0x3a ':'
    /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, // 0x5b '[', 0x5d ']'
    /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, // 0x7b '{', 0x7d '}'
    /* 0x80 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xa0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xb0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xc0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xd0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xe0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xf0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
|
|
|
|
// Lookup table indexed by byte value (0..255).
// An entry is 1 when the byte is one of the six structural characters
// ({ } : [ ] ,) or one of the four whitespace characters
// (space 0x20, linefeed 0x0a, horizontal tab 0x09, carriage return 0x0d),
// and 0 for every other byte (including 0x00 and all bytes >= 0x80).
// Each row below covers 16 consecutive byte values.
SIMDJSON_DLLIMPORTEXPORT const bool structural_or_whitespace[256] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, // 0x09, 0x0a, 0x0d
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20 */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, // 0x20 space, 0x2c ','
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, // 0x3a ':'
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, // 0x5b '[', 0x5d ']'
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, // 0x7b '{', 0x7d '}'
    /* 0x80 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x90 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xa0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xb0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xc0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xd0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xe0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xf0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
|
|
|
|
// Hexadecimal digit lookup table.
//
// For k = 0, 1, 2, 3 and an ASCII character code c, the entry at index
// 210 * k + c holds the value of the hex digit c ('0'-'9', 'A'-'F', 'a'-'f')
// shifted left by 4 * k bits. Every entry that does not correspond to a hex
// digit is 0xFFFFFFFF. The four sub-tables overlap: the array has
// 886 = 3 * 210 + 256 entries, so each of the offsets 0, 210, 420 and 630 can
// be combined with any byte value (0..255) without leaving the array.
// NOTE(review): presumably callers OR the four looked-up values together and
// detect an invalid digit via the high bits - confirm against the call site.
SIMDJSON_DLLIMPORTEXPORT const uint32_t digit_to_val32[886] = {
    // ---- offset 0: digit value << 0 ----
    // [0, 48): no hex digits
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    // [48, 72): '0'-'9' at 48..57, 'A'-'F' at 65..70
    0x0, 0x1, 0x2, 0x3, 0x4, 0x5,
    0x6, 0x7, 0x8, 0x9, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xa,
    0xb, 0xc, 0xd, 0xe, 0xf, 0xFFFFFFFF,
    // [72, 96): no hex digits
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    // [96, 108): 'a'-'f' at 97..102
    0xFFFFFFFF, 0xa, 0xb, 0xc, 0xd, 0xe,
    0xf, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    // [108, 258): no hex digits (25 rows of 6)
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,

    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,

    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,

    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,

    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    // ---- offset 210: digit value << 4 ----
    // [258, 282): '0'-'9' at 210+48.., 'A'-'F' at 210+65..
    0x0, 0x10, 0x20, 0x30, 0x40, 0x50,
    0x60, 0x70, 0x80, 0x90, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xa0,
    0xb0, 0xc0, 0xd0, 0xe0, 0xf0, 0xFFFFFFFF,
    // [282, 306): no hex digits
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    // [306, 318): 'a'-'f' at 210+97..
    0xFFFFFFFF, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0,
    0xf0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    // [318, 468): no hex digits (25 rows of 6)
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,

    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,

    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,

    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,

    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    // ---- offset 420: digit value << 8 ----
    // [468, 492): '0'-'9' at 420+48.., 'A'-'F' at 420+65..
    0x0, 0x100, 0x200, 0x300, 0x400, 0x500,
    0x600, 0x700, 0x800, 0x900, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xa00,
    0xb00, 0xc00, 0xd00, 0xe00, 0xf00, 0xFFFFFFFF,
    // [492, 516): no hex digits
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    // [516, 528): 'a'-'f' at 420+97..
    0xFFFFFFFF, 0xa00, 0xb00, 0xc00, 0xd00, 0xe00,
    0xf00, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    // [528, 678): no hex digits (25 rows of 6)
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,

    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,

    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,

    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,

    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    // ---- offset 630: digit value << 12 ----
    // [678, 702): '0'-'9' at 630+48.., 'A'-'F' at 630+65..
    0x0, 0x1000, 0x2000, 0x3000, 0x4000, 0x5000,
    0x6000, 0x7000, 0x8000, 0x9000, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xa000,
    0xb000, 0xc000, 0xd000, 0xe000, 0xf000, 0xFFFFFFFF,
    // [702, 726): no hex digits
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    // [726, 738): 'a'-'f' at 630+97..
    0xFFFFFFFF, 0xa000, 0xb000, 0xc000, 0xd000, 0xe000,
    0xf000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    // [738, 886): no hex digits (24 rows of 6, then a final row of 4)
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,

    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,

    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,

    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,

    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF};
|
|
|
|
} // namespace internal
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_JSONCHARUTILS_TABLES_CPP
|
|
/* end file internal/jsoncharutils_tables.cpp */
|
|
/* including internal/numberparsing_tables.cpp: #include <internal/numberparsing_tables.cpp> */
|
|
/* begin file internal/numberparsing_tables.cpp */
|
|
#ifndef SIMDJSON_SRC_NUMBERPARSING_TABLES_CPP
|
|
#define SIMDJSON_SRC_NUMBERPARSING_TABLES_CPP
|
|
|
|
/* skipped duplicate #include <simdjson/base.h> */
|
|
/* including simdjson/internal/numberparsing_tables.h: #include <simdjson/internal/numberparsing_tables.h> */
|
|
/* begin file simdjson/internal/numberparsing_tables.h */
|
|
#ifndef SIMDJSON_INTERNAL_NUMBERPARSING_TABLES_H
|
|
#define SIMDJSON_INTERNAL_NUMBERPARSING_TABLES_H
|
|
|
|
/* skipped duplicate #include "simdjson/base.h" */
|
|
|
|
namespace simdjson {
namespace internal {

/**
 * Bounds on the decimal exponent q handled for inputs of the form w x 10^q.
 *
 * The smallest non-zero float (binary64) is 2^-1074.
 * We take as input numbers of the form w x 10^q where w < 2^64.
 * We have that w * 10^-343 < 2^(64-344) 5^-343 < 2^-1076.
 * However, we have that
 * (2^64-1) * 10^-342 = (2^64-1) * 2^-342 * 5^-342 > 2^-1074.
 * Thus it is possible for a number of the form w * 10^-342 where
 * w is a 64-bit value to be a non-zero floating-point number.
 *********
 * Any number of form w * 10^309 where w >= 1 is going to be
 * infinite in binary64 so we never need to worry about powers
 * of 5 greater than 308.
 */
constexpr int smallest_power = -342;
constexpr int largest_power = 308;

/**
 * Represents a 128-bit value.
 * low: least significant 64 bits.
 * high: most significant 64 bits.
 */
struct value128 {
  uint64_t low;
  uint64_t high;
};

// Precomputed powers of ten from 10^0 to 10^22. These
// can be represented exactly using the double type.
extern SIMDJSON_DLLIMPORTEXPORT const double power_of_ten[];

/**
 * When mapping numbers from decimal to binary,
 * we go from w * 10^q to m * 2^p but we have
 * 10^q = 5^q * 2^q, so effectively
 * we are trying to match
 * w * 2^q * 5^q to m * 2^p. Thus the powers of two
 * are not a concern since they can be represented
 * exactly using the binary notation, only the powers of five
 * affect the binary significand.
 */

// The truncated powers of five from 5^-342 all the way to 5^308.
// The mantissa is truncated to 128 bits, and
// never rounded up. Uses about 10KB.
extern SIMDJSON_DLLIMPORTEXPORT const uint64_t power_of_five_128[];

} // namespace internal
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_INTERNAL_NUMBERPARSING_TABLES_H
|
|
/* end file simdjson/internal/numberparsing_tables.h */
|
|
|
|
// Precomputed powers of ten from 10^0 to 10^22. These
|
|
// can be represented exactly using the double type.
|
|
SIMDJSON_DLLIMPORTEXPORT const double simdjson::internal::power_of_ten[] = {
|
|
1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9, 1e10, 1e11,
|
|
1e12, 1e13, 1e14, 1e15, 1e16, 1e17, 1e18, 1e19, 1e20, 1e21, 1e22};
|
|
|
|
/**
 * When mapping numbers from decimal to binary,
 * we go from w * 10^q to m * 2^p but we have
 * 10^q = 5^q * 2^q, so effectively
 * we are trying to match
 * w * 2^q * 5^q to m * 2^p. Thus the powers of two
 * are not a concern since they can be represented
 * exactly using the binary notation, only the powers of five
 * affect the binary significand.
 */

// The truncated powers of five from 5^-342 all the way to 5^308.
// The mantissa is truncated to 128 bits, and
// never rounded up. Uses about 10KB.
|
|
SIMDJSON_DLLIMPORTEXPORT const uint64_t simdjson::internal::power_of_five_128[]= {
|
|
0xeef453d6923bd65a,0x113faa2906a13b3f,
|
|
0x9558b4661b6565f8,0x4ac7ca59a424c507,
|
|
0xbaaee17fa23ebf76,0x5d79bcf00d2df649,
|
|
0xe95a99df8ace6f53,0xf4d82c2c107973dc,
|
|
0x91d8a02bb6c10594,0x79071b9b8a4be869,
|
|
0xb64ec836a47146f9,0x9748e2826cdee284,
|
|
0xe3e27a444d8d98b7,0xfd1b1b2308169b25,
|
|
0x8e6d8c6ab0787f72,0xfe30f0f5e50e20f7,
|
|
0xb208ef855c969f4f,0xbdbd2d335e51a935,
|
|
0xde8b2b66b3bc4723,0xad2c788035e61382,
|
|
0x8b16fb203055ac76,0x4c3bcb5021afcc31,
|
|
0xaddcb9e83c6b1793,0xdf4abe242a1bbf3d,
|
|
0xd953e8624b85dd78,0xd71d6dad34a2af0d,
|
|
0x87d4713d6f33aa6b,0x8672648c40e5ad68,
|
|
0xa9c98d8ccb009506,0x680efdaf511f18c2,
|
|
0xd43bf0effdc0ba48,0x212bd1b2566def2,
|
|
0x84a57695fe98746d,0x14bb630f7604b57,
|
|
0xa5ced43b7e3e9188,0x419ea3bd35385e2d,
|
|
0xcf42894a5dce35ea,0x52064cac828675b9,
|
|
0x818995ce7aa0e1b2,0x7343efebd1940993,
|
|
0xa1ebfb4219491a1f,0x1014ebe6c5f90bf8,
|
|
0xca66fa129f9b60a6,0xd41a26e077774ef6,
|
|
0xfd00b897478238d0,0x8920b098955522b4,
|
|
0x9e20735e8cb16382,0x55b46e5f5d5535b0,
|
|
0xc5a890362fddbc62,0xeb2189f734aa831d,
|
|
0xf712b443bbd52b7b,0xa5e9ec7501d523e4,
|
|
0x9a6bb0aa55653b2d,0x47b233c92125366e,
|
|
0xc1069cd4eabe89f8,0x999ec0bb696e840a,
|
|
0xf148440a256e2c76,0xc00670ea43ca250d,
|
|
0x96cd2a865764dbca,0x380406926a5e5728,
|
|
0xbc807527ed3e12bc,0xc605083704f5ecf2,
|
|
0xeba09271e88d976b,0xf7864a44c633682e,
|
|
0x93445b8731587ea3,0x7ab3ee6afbe0211d,
|
|
0xb8157268fdae9e4c,0x5960ea05bad82964,
|
|
0xe61acf033d1a45df,0x6fb92487298e33bd,
|
|
0x8fd0c16206306bab,0xa5d3b6d479f8e056,
|
|
0xb3c4f1ba87bc8696,0x8f48a4899877186c,
|
|
0xe0b62e2929aba83c,0x331acdabfe94de87,
|
|
0x8c71dcd9ba0b4925,0x9ff0c08b7f1d0b14,
|
|
0xaf8e5410288e1b6f,0x7ecf0ae5ee44dd9,
|
|
0xdb71e91432b1a24a,0xc9e82cd9f69d6150,
|
|
0x892731ac9faf056e,0xbe311c083a225cd2,
|
|
0xab70fe17c79ac6ca,0x6dbd630a48aaf406,
|
|
0xd64d3d9db981787d,0x92cbbccdad5b108,
|
|
0x85f0468293f0eb4e,0x25bbf56008c58ea5,
|
|
0xa76c582338ed2621,0xaf2af2b80af6f24e,
|
|
0xd1476e2c07286faa,0x1af5af660db4aee1,
|
|
0x82cca4db847945ca,0x50d98d9fc890ed4d,
|
|
0xa37fce126597973c,0xe50ff107bab528a0,
|
|
0xcc5fc196fefd7d0c,0x1e53ed49a96272c8,
|
|
0xff77b1fcbebcdc4f,0x25e8e89c13bb0f7a,
|
|
0x9faacf3df73609b1,0x77b191618c54e9ac,
|
|
0xc795830d75038c1d,0xd59df5b9ef6a2417,
|
|
0xf97ae3d0d2446f25,0x4b0573286b44ad1d,
|
|
0x9becce62836ac577,0x4ee367f9430aec32,
|
|
0xc2e801fb244576d5,0x229c41f793cda73f,
|
|
0xf3a20279ed56d48a,0x6b43527578c1110f,
|
|
0x9845418c345644d6,0x830a13896b78aaa9,
|
|
0xbe5691ef416bd60c,0x23cc986bc656d553,
|
|
0xedec366b11c6cb8f,0x2cbfbe86b7ec8aa8,
|
|
0x94b3a202eb1c3f39,0x7bf7d71432f3d6a9,
|
|
0xb9e08a83a5e34f07,0xdaf5ccd93fb0cc53,
|
|
0xe858ad248f5c22c9,0xd1b3400f8f9cff68,
|
|
0x91376c36d99995be,0x23100809b9c21fa1,
|
|
0xb58547448ffffb2d,0xabd40a0c2832a78a,
|
|
0xe2e69915b3fff9f9,0x16c90c8f323f516c,
|
|
0x8dd01fad907ffc3b,0xae3da7d97f6792e3,
|
|
0xb1442798f49ffb4a,0x99cd11cfdf41779c,
|
|
0xdd95317f31c7fa1d,0x40405643d711d583,
|
|
0x8a7d3eef7f1cfc52,0x482835ea666b2572,
|
|
0xad1c8eab5ee43b66,0xda3243650005eecf,
|
|
0xd863b256369d4a40,0x90bed43e40076a82,
|
|
0x873e4f75e2224e68,0x5a7744a6e804a291,
|
|
0xa90de3535aaae202,0x711515d0a205cb36,
|
|
0xd3515c2831559a83,0xd5a5b44ca873e03,
|
|
0x8412d9991ed58091,0xe858790afe9486c2,
|
|
0xa5178fff668ae0b6,0x626e974dbe39a872,
|
|
0xce5d73ff402d98e3,0xfb0a3d212dc8128f,
|
|
0x80fa687f881c7f8e,0x7ce66634bc9d0b99,
|
|
0xa139029f6a239f72,0x1c1fffc1ebc44e80,
|
|
0xc987434744ac874e,0xa327ffb266b56220,
|
|
0xfbe9141915d7a922,0x4bf1ff9f0062baa8,
|
|
0x9d71ac8fada6c9b5,0x6f773fc3603db4a9,
|
|
0xc4ce17b399107c22,0xcb550fb4384d21d3,
|
|
0xf6019da07f549b2b,0x7e2a53a146606a48,
|
|
0x99c102844f94e0fb,0x2eda7444cbfc426d,
|
|
0xc0314325637a1939,0xfa911155fefb5308,
|
|
0xf03d93eebc589f88,0x793555ab7eba27ca,
|
|
0x96267c7535b763b5,0x4bc1558b2f3458de,
|
|
0xbbb01b9283253ca2,0x9eb1aaedfb016f16,
|
|
0xea9c227723ee8bcb,0x465e15a979c1cadc,
|
|
0x92a1958a7675175f,0xbfacd89ec191ec9,
|
|
0xb749faed14125d36,0xcef980ec671f667b,
|
|
0xe51c79a85916f484,0x82b7e12780e7401a,
|
|
0x8f31cc0937ae58d2,0xd1b2ecb8b0908810,
|
|
0xb2fe3f0b8599ef07,0x861fa7e6dcb4aa15,
|
|
0xdfbdcece67006ac9,0x67a791e093e1d49a,
|
|
0x8bd6a141006042bd,0xe0c8bb2c5c6d24e0,
|
|
0xaecc49914078536d,0x58fae9f773886e18,
|
|
0xda7f5bf590966848,0xaf39a475506a899e,
|
|
0x888f99797a5e012d,0x6d8406c952429603,
|
|
0xaab37fd7d8f58178,0xc8e5087ba6d33b83,
|
|
0xd5605fcdcf32e1d6,0xfb1e4a9a90880a64,
|
|
0x855c3be0a17fcd26,0x5cf2eea09a55067f,
|
|
0xa6b34ad8c9dfc06f,0xf42faa48c0ea481e,
|
|
0xd0601d8efc57b08b,0xf13b94daf124da26,
|
|
0x823c12795db6ce57,0x76c53d08d6b70858,
|
|
0xa2cb1717b52481ed,0x54768c4b0c64ca6e,
|
|
0xcb7ddcdda26da268,0xa9942f5dcf7dfd09,
|
|
0xfe5d54150b090b02,0xd3f93b35435d7c4c,
|
|
0x9efa548d26e5a6e1,0xc47bc5014a1a6daf,
|
|
0xc6b8e9b0709f109a,0x359ab6419ca1091b,
|
|
0xf867241c8cc6d4c0,0xc30163d203c94b62,
|
|
0x9b407691d7fc44f8,0x79e0de63425dcf1d,
|
|
0xc21094364dfb5636,0x985915fc12f542e4,
|
|
0xf294b943e17a2bc4,0x3e6f5b7b17b2939d,
|
|
0x979cf3ca6cec5b5a,0xa705992ceecf9c42,
|
|
0xbd8430bd08277231,0x50c6ff782a838353,
|
|
0xece53cec4a314ebd,0xa4f8bf5635246428,
|
|
0x940f4613ae5ed136,0x871b7795e136be99,
|
|
0xb913179899f68584,0x28e2557b59846e3f,
|
|
0xe757dd7ec07426e5,0x331aeada2fe589cf,
|
|
0x9096ea6f3848984f,0x3ff0d2c85def7621,
|
|
0xb4bca50b065abe63,0xfed077a756b53a9,
|
|
0xe1ebce4dc7f16dfb,0xd3e8495912c62894,
|
|
0x8d3360f09cf6e4bd,0x64712dd7abbbd95c,
|
|
0xb080392cc4349dec,0xbd8d794d96aacfb3,
|
|
0xdca04777f541c567,0xecf0d7a0fc5583a0,
|
|
0x89e42caaf9491b60,0xf41686c49db57244,
|
|
0xac5d37d5b79b6239,0x311c2875c522ced5,
|
|
0xd77485cb25823ac7,0x7d633293366b828b,
|
|
0x86a8d39ef77164bc,0xae5dff9c02033197,
|
|
0xa8530886b54dbdeb,0xd9f57f830283fdfc,
|
|
0xd267caa862a12d66,0xd072df63c324fd7b,
|
|
0x8380dea93da4bc60,0x4247cb9e59f71e6d,
|
|
0xa46116538d0deb78,0x52d9be85f074e608,
|
|
0xcd795be870516656,0x67902e276c921f8b,
|
|
0x806bd9714632dff6,0xba1cd8a3db53b6,
|
|
0xa086cfcd97bf97f3,0x80e8a40eccd228a4,
|
|
0xc8a883c0fdaf7df0,0x6122cd128006b2cd,
|
|
0xfad2a4b13d1b5d6c,0x796b805720085f81,
|
|
0x9cc3a6eec6311a63,0xcbe3303674053bb0,
|
|
0xc3f490aa77bd60fc,0xbedbfc4411068a9c,
|
|
0xf4f1b4d515acb93b,0xee92fb5515482d44,
|
|
0x991711052d8bf3c5,0x751bdd152d4d1c4a,
|
|
0xbf5cd54678eef0b6,0xd262d45a78a0635d,
|
|
0xef340a98172aace4,0x86fb897116c87c34,
|
|
0x9580869f0e7aac0e,0xd45d35e6ae3d4da0,
|
|
0xbae0a846d2195712,0x8974836059cca109,
|
|
0xe998d258869facd7,0x2bd1a438703fc94b,
|
|
0x91ff83775423cc06,0x7b6306a34627ddcf,
|
|
0xb67f6455292cbf08,0x1a3bc84c17b1d542,
|
|
0xe41f3d6a7377eeca,0x20caba5f1d9e4a93,
|
|
0x8e938662882af53e,0x547eb47b7282ee9c,
|
|
0xb23867fb2a35b28d,0xe99e619a4f23aa43,
|
|
0xdec681f9f4c31f31,0x6405fa00e2ec94d4,
|
|
0x8b3c113c38f9f37e,0xde83bc408dd3dd04,
|
|
0xae0b158b4738705e,0x9624ab50b148d445,
|
|
0xd98ddaee19068c76,0x3badd624dd9b0957,
|
|
0x87f8a8d4cfa417c9,0xe54ca5d70a80e5d6,
|
|
0xa9f6d30a038d1dbc,0x5e9fcf4ccd211f4c,
|
|
0xd47487cc8470652b,0x7647c3200069671f,
|
|
0x84c8d4dfd2c63f3b,0x29ecd9f40041e073,
|
|
0xa5fb0a17c777cf09,0xf468107100525890,
|
|
0xcf79cc9db955c2cc,0x7182148d4066eeb4,
|
|
0x81ac1fe293d599bf,0xc6f14cd848405530,
|
|
0xa21727db38cb002f,0xb8ada00e5a506a7c,
|
|
0xca9cf1d206fdc03b,0xa6d90811f0e4851c,
|
|
0xfd442e4688bd304a,0x908f4a166d1da663,
|
|
0x9e4a9cec15763e2e,0x9a598e4e043287fe,
|
|
0xc5dd44271ad3cdba,0x40eff1e1853f29fd,
|
|
0xf7549530e188c128,0xd12bee59e68ef47c,
|
|
0x9a94dd3e8cf578b9,0x82bb74f8301958ce,
|
|
0xc13a148e3032d6e7,0xe36a52363c1faf01,
|
|
0xf18899b1bc3f8ca1,0xdc44e6c3cb279ac1,
|
|
0x96f5600f15a7b7e5,0x29ab103a5ef8c0b9,
|
|
0xbcb2b812db11a5de,0x7415d448f6b6f0e7,
|
|
0xebdf661791d60f56,0x111b495b3464ad21,
|
|
0x936b9fcebb25c995,0xcab10dd900beec34,
|
|
0xb84687c269ef3bfb,0x3d5d514f40eea742,
|
|
0xe65829b3046b0afa,0xcb4a5a3112a5112,
|
|
0x8ff71a0fe2c2e6dc,0x47f0e785eaba72ab,
|
|
0xb3f4e093db73a093,0x59ed216765690f56,
|
|
0xe0f218b8d25088b8,0x306869c13ec3532c,
|
|
0x8c974f7383725573,0x1e414218c73a13fb,
|
|
0xafbd2350644eeacf,0xe5d1929ef90898fa,
|
|
0xdbac6c247d62a583,0xdf45f746b74abf39,
|
|
0x894bc396ce5da772,0x6b8bba8c328eb783,
|
|
0xab9eb47c81f5114f,0x66ea92f3f326564,
|
|
0xd686619ba27255a2,0xc80a537b0efefebd,
|
|
0x8613fd0145877585,0xbd06742ce95f5f36,
|
|
0xa798fc4196e952e7,0x2c48113823b73704,
|
|
0xd17f3b51fca3a7a0,0xf75a15862ca504c5,
|
|
0x82ef85133de648c4,0x9a984d73dbe722fb,
|
|
0xa3ab66580d5fdaf5,0xc13e60d0d2e0ebba,
|
|
0xcc963fee10b7d1b3,0x318df905079926a8,
|
|
0xffbbcfe994e5c61f,0xfdf17746497f7052,
|
|
0x9fd561f1fd0f9bd3,0xfeb6ea8bedefa633,
|
|
0xc7caba6e7c5382c8,0xfe64a52ee96b8fc0,
|
|
0xf9bd690a1b68637b,0x3dfdce7aa3c673b0,
|
|
0x9c1661a651213e2d,0x6bea10ca65c084e,
|
|
0xc31bfa0fe5698db8,0x486e494fcff30a62,
|
|
0xf3e2f893dec3f126,0x5a89dba3c3efccfa,
|
|
0x986ddb5c6b3a76b7,0xf89629465a75e01c,
|
|
0xbe89523386091465,0xf6bbb397f1135823,
|
|
0xee2ba6c0678b597f,0x746aa07ded582e2c,
|
|
0x94db483840b717ef,0xa8c2a44eb4571cdc,
|
|
0xba121a4650e4ddeb,0x92f34d62616ce413,
|
|
0xe896a0d7e51e1566,0x77b020baf9c81d17,
|
|
0x915e2486ef32cd60,0xace1474dc1d122e,
|
|
0xb5b5ada8aaff80b8,0xd819992132456ba,
|
|
0xe3231912d5bf60e6,0x10e1fff697ed6c69,
|
|
0x8df5efabc5979c8f,0xca8d3ffa1ef463c1,
|
|
0xb1736b96b6fd83b3,0xbd308ff8a6b17cb2,
|
|
0xddd0467c64bce4a0,0xac7cb3f6d05ddbde,
|
|
0x8aa22c0dbef60ee4,0x6bcdf07a423aa96b,
|
|
0xad4ab7112eb3929d,0x86c16c98d2c953c6,
|
|
0xd89d64d57a607744,0xe871c7bf077ba8b7,
|
|
0x87625f056c7c4a8b,0x11471cd764ad4972,
|
|
0xa93af6c6c79b5d2d,0xd598e40d3dd89bcf,
|
|
0xd389b47879823479,0x4aff1d108d4ec2c3,
|
|
0x843610cb4bf160cb,0xcedf722a585139ba,
|
|
0xa54394fe1eedb8fe,0xc2974eb4ee658828,
|
|
0xce947a3da6a9273e,0x733d226229feea32,
|
|
0x811ccc668829b887,0x806357d5a3f525f,
|
|
0xa163ff802a3426a8,0xca07c2dcb0cf26f7,
|
|
0xc9bcff6034c13052,0xfc89b393dd02f0b5,
|
|
0xfc2c3f3841f17c67,0xbbac2078d443ace2,
|
|
0x9d9ba7832936edc0,0xd54b944b84aa4c0d,
|
|
0xc5029163f384a931,0xa9e795e65d4df11,
|
|
0xf64335bcf065d37d,0x4d4617b5ff4a16d5,
|
|
0x99ea0196163fa42e,0x504bced1bf8e4e45,
|
|
0xc06481fb9bcf8d39,0xe45ec2862f71e1d6,
|
|
0xf07da27a82c37088,0x5d767327bb4e5a4c,
|
|
0x964e858c91ba2655,0x3a6a07f8d510f86f,
|
|
0xbbe226efb628afea,0x890489f70a55368b,
|
|
0xeadab0aba3b2dbe5,0x2b45ac74ccea842e,
|
|
0x92c8ae6b464fc96f,0x3b0b8bc90012929d,
|
|
0xb77ada0617e3bbcb,0x9ce6ebb40173744,
|
|
0xe55990879ddcaabd,0xcc420a6a101d0515,
|
|
0x8f57fa54c2a9eab6,0x9fa946824a12232d,
|
|
0xb32df8e9f3546564,0x47939822dc96abf9,
|
|
0xdff9772470297ebd,0x59787e2b93bc56f7,
|
|
0x8bfbea76c619ef36,0x57eb4edb3c55b65a,
|
|
0xaefae51477a06b03,0xede622920b6b23f1,
|
|
0xdab99e59958885c4,0xe95fab368e45eced,
|
|
0x88b402f7fd75539b,0x11dbcb0218ebb414,
|
|
0xaae103b5fcd2a881,0xd652bdc29f26a119,
|
|
0xd59944a37c0752a2,0x4be76d3346f0495f,
|
|
0x857fcae62d8493a5,0x6f70a4400c562ddb,
|
|
0xa6dfbd9fb8e5b88e,0xcb4ccd500f6bb952,
|
|
0xd097ad07a71f26b2,0x7e2000a41346a7a7,
|
|
0x825ecc24c873782f,0x8ed400668c0c28c8,
|
|
0xa2f67f2dfa90563b,0x728900802f0f32fa,
|
|
0xcbb41ef979346bca,0x4f2b40a03ad2ffb9,
|
|
0xfea126b7d78186bc,0xe2f610c84987bfa8,
|
|
0x9f24b832e6b0f436,0xdd9ca7d2df4d7c9,
|
|
0xc6ede63fa05d3143,0x91503d1c79720dbb,
|
|
0xf8a95fcf88747d94,0x75a44c6397ce912a,
|
|
0x9b69dbe1b548ce7c,0xc986afbe3ee11aba,
|
|
0xc24452da229b021b,0xfbe85badce996168,
|
|
0xf2d56790ab41c2a2,0xfae27299423fb9c3,
|
|
0x97c560ba6b0919a5,0xdccd879fc967d41a,
|
|
0xbdb6b8e905cb600f,0x5400e987bbc1c920,
|
|
0xed246723473e3813,0x290123e9aab23b68,
|
|
0x9436c0760c86e30b,0xf9a0b6720aaf6521,
|
|
0xb94470938fa89bce,0xf808e40e8d5b3e69,
|
|
0xe7958cb87392c2c2,0xb60b1d1230b20e04,
|
|
0x90bd77f3483bb9b9,0xb1c6f22b5e6f48c2,
|
|
0xb4ecd5f01a4aa828,0x1e38aeb6360b1af3,
|
|
0xe2280b6c20dd5232,0x25c6da63c38de1b0,
|
|
0x8d590723948a535f,0x579c487e5a38ad0e,
|
|
0xb0af48ec79ace837,0x2d835a9df0c6d851,
|
|
0xdcdb1b2798182244,0xf8e431456cf88e65,
|
|
0x8a08f0f8bf0f156b,0x1b8e9ecb641b58ff,
|
|
0xac8b2d36eed2dac5,0xe272467e3d222f3f,
|
|
0xd7adf884aa879177,0x5b0ed81dcc6abb0f,
|
|
0x86ccbb52ea94baea,0x98e947129fc2b4e9,
|
|
0xa87fea27a539e9a5,0x3f2398d747b36224,
|
|
0xd29fe4b18e88640e,0x8eec7f0d19a03aad,
|
|
0x83a3eeeef9153e89,0x1953cf68300424ac,
|
|
0xa48ceaaab75a8e2b,0x5fa8c3423c052dd7,
|
|
0xcdb02555653131b6,0x3792f412cb06794d,
|
|
0x808e17555f3ebf11,0xe2bbd88bbee40bd0,
|
|
0xa0b19d2ab70e6ed6,0x5b6aceaeae9d0ec4,
|
|
0xc8de047564d20a8b,0xf245825a5a445275,
|
|
0xfb158592be068d2e,0xeed6e2f0f0d56712,
|
|
0x9ced737bb6c4183d,0x55464dd69685606b,
|
|
0xc428d05aa4751e4c,0xaa97e14c3c26b886,
|
|
0xf53304714d9265df,0xd53dd99f4b3066a8,
|
|
0x993fe2c6d07b7fab,0xe546a8038efe4029,
|
|
0xbf8fdb78849a5f96,0xde98520472bdd033,
|
|
0xef73d256a5c0f77c,0x963e66858f6d4440,
|
|
0x95a8637627989aad,0xdde7001379a44aa8,
|
|
0xbb127c53b17ec159,0x5560c018580d5d52,
|
|
0xe9d71b689dde71af,0xaab8f01e6e10b4a6,
|
|
0x9226712162ab070d,0xcab3961304ca70e8,
|
|
0xb6b00d69bb55c8d1,0x3d607b97c5fd0d22,
|
|
0xe45c10c42a2b3b05,0x8cb89a7db77c506a,
|
|
0x8eb98a7a9a5b04e3,0x77f3608e92adb242,
|
|
0xb267ed1940f1c61c,0x55f038b237591ed3,
|
|
0xdf01e85f912e37a3,0x6b6c46dec52f6688,
|
|
0x8b61313bbabce2c6,0x2323ac4b3b3da015,
|
|
0xae397d8aa96c1b77,0xabec975e0a0d081a,
|
|
0xd9c7dced53c72255,0x96e7bd358c904a21,
|
|
0x881cea14545c7575,0x7e50d64177da2e54,
|
|
0xaa242499697392d2,0xdde50bd1d5d0b9e9,
|
|
0xd4ad2dbfc3d07787,0x955e4ec64b44e864,
|
|
0x84ec3c97da624ab4,0xbd5af13bef0b113e,
|
|
0xa6274bbdd0fadd61,0xecb1ad8aeacdd58e,
|
|
0xcfb11ead453994ba,0x67de18eda5814af2,
|
|
0x81ceb32c4b43fcf4,0x80eacf948770ced7,
|
|
0xa2425ff75e14fc31,0xa1258379a94d028d,
|
|
0xcad2f7f5359a3b3e,0x96ee45813a04330,
|
|
0xfd87b5f28300ca0d,0x8bca9d6e188853fc,
|
|
0x9e74d1b791e07e48,0x775ea264cf55347e,
|
|
0xc612062576589dda,0x95364afe032a81a0,
|
|
0xf79687aed3eec551,0x3a83ddbd83f52210,
|
|
0x9abe14cd44753b52,0xc4926a9672793580,
|
|
0xc16d9a0095928a27,0x75b7053c0f178400,
|
|
0xf1c90080baf72cb1,0x5324c68b12dd6800,
|
|
0x971da05074da7bee,0xd3f6fc16ebca8000,
|
|
0xbce5086492111aea,0x88f4bb1ca6bd0000,
|
|
0xec1e4a7db69561a5,0x2b31e9e3d0700000,
|
|
0x9392ee8e921d5d07,0x3aff322e62600000,
|
|
0xb877aa3236a4b449,0x9befeb9fad487c3,
|
|
0xe69594bec44de15b,0x4c2ebe687989a9b4,
|
|
0x901d7cf73ab0acd9,0xf9d37014bf60a11,
|
|
0xb424dc35095cd80f,0x538484c19ef38c95,
|
|
0xe12e13424bb40e13,0x2865a5f206b06fba,
|
|
0x8cbccc096f5088cb,0xf93f87b7442e45d4,
|
|
0xafebff0bcb24aafe,0xf78f69a51539d749,
|
|
0xdbe6fecebdedd5be,0xb573440e5a884d1c,
|
|
0x89705f4136b4a597,0x31680a88f8953031,
|
|
0xabcc77118461cefc,0xfdc20d2b36ba7c3e,
|
|
0xd6bf94d5e57a42bc,0x3d32907604691b4d,
|
|
0x8637bd05af6c69b5,0xa63f9a49c2c1b110,
|
|
0xa7c5ac471b478423,0xfcf80dc33721d54,
|
|
0xd1b71758e219652b,0xd3c36113404ea4a9,
|
|
0x83126e978d4fdf3b,0x645a1cac083126ea,
|
|
0xa3d70a3d70a3d70a,0x3d70a3d70a3d70a4,
|
|
0xcccccccccccccccc,0xcccccccccccccccd,
|
|
0x8000000000000000,0x0,
|
|
0xa000000000000000,0x0,
|
|
0xc800000000000000,0x0,
|
|
0xfa00000000000000,0x0,
|
|
0x9c40000000000000,0x0,
|
|
0xc350000000000000,0x0,
|
|
0xf424000000000000,0x0,
|
|
0x9896800000000000,0x0,
|
|
0xbebc200000000000,0x0,
|
|
0xee6b280000000000,0x0,
|
|
0x9502f90000000000,0x0,
|
|
0xba43b74000000000,0x0,
|
|
0xe8d4a51000000000,0x0,
|
|
0x9184e72a00000000,0x0,
|
|
0xb5e620f480000000,0x0,
|
|
0xe35fa931a0000000,0x0,
|
|
0x8e1bc9bf04000000,0x0,
|
|
0xb1a2bc2ec5000000,0x0,
|
|
0xde0b6b3a76400000,0x0,
|
|
0x8ac7230489e80000,0x0,
|
|
0xad78ebc5ac620000,0x0,
|
|
0xd8d726b7177a8000,0x0,
|
|
0x878678326eac9000,0x0,
|
|
0xa968163f0a57b400,0x0,
|
|
0xd3c21bcecceda100,0x0,
|
|
0x84595161401484a0,0x0,
|
|
0xa56fa5b99019a5c8,0x0,
|
|
0xcecb8f27f4200f3a,0x0,
|
|
0x813f3978f8940984,0x4000000000000000,
|
|
0xa18f07d736b90be5,0x5000000000000000,
|
|
0xc9f2c9cd04674ede,0xa400000000000000,
|
|
0xfc6f7c4045812296,0x4d00000000000000,
|
|
0x9dc5ada82b70b59d,0xf020000000000000,
|
|
0xc5371912364ce305,0x6c28000000000000,
|
|
0xf684df56c3e01bc6,0xc732000000000000,
|
|
0x9a130b963a6c115c,0x3c7f400000000000,
|
|
0xc097ce7bc90715b3,0x4b9f100000000000,
|
|
0xf0bdc21abb48db20,0x1e86d40000000000,
|
|
0x96769950b50d88f4,0x1314448000000000,
|
|
0xbc143fa4e250eb31,0x17d955a000000000,
|
|
0xeb194f8e1ae525fd,0x5dcfab0800000000,
|
|
0x92efd1b8d0cf37be,0x5aa1cae500000000,
|
|
0xb7abc627050305ad,0xf14a3d9e40000000,
|
|
0xe596b7b0c643c719,0x6d9ccd05d0000000,
|
|
0x8f7e32ce7bea5c6f,0xe4820023a2000000,
|
|
0xb35dbf821ae4f38b,0xdda2802c8a800000,
|
|
0xe0352f62a19e306e,0xd50b2037ad200000,
|
|
0x8c213d9da502de45,0x4526f422cc340000,
|
|
0xaf298d050e4395d6,0x9670b12b7f410000,
|
|
0xdaf3f04651d47b4c,0x3c0cdd765f114000,
|
|
0x88d8762bf324cd0f,0xa5880a69fb6ac800,
|
|
0xab0e93b6efee0053,0x8eea0d047a457a00,
|
|
0xd5d238a4abe98068,0x72a4904598d6d880,
|
|
0x85a36366eb71f041,0x47a6da2b7f864750,
|
|
0xa70c3c40a64e6c51,0x999090b65f67d924,
|
|
0xd0cf4b50cfe20765,0xfff4b4e3f741cf6d,
|
|
0x82818f1281ed449f,0xbff8f10e7a8921a4,
|
|
0xa321f2d7226895c7,0xaff72d52192b6a0d,
|
|
0xcbea6f8ceb02bb39,0x9bf4f8a69f764490,
|
|
0xfee50b7025c36a08,0x2f236d04753d5b4,
|
|
0x9f4f2726179a2245,0x1d762422c946590,
|
|
0xc722f0ef9d80aad6,0x424d3ad2b7b97ef5,
|
|
0xf8ebad2b84e0d58b,0xd2e0898765a7deb2,
|
|
0x9b934c3b330c8577,0x63cc55f49f88eb2f,
|
|
0xc2781f49ffcfa6d5,0x3cbf6b71c76b25fb,
|
|
0xf316271c7fc3908a,0x8bef464e3945ef7a,
|
|
0x97edd871cfda3a56,0x97758bf0e3cbb5ac,
|
|
0xbde94e8e43d0c8ec,0x3d52eeed1cbea317,
|
|
0xed63a231d4c4fb27,0x4ca7aaa863ee4bdd,
|
|
0x945e455f24fb1cf8,0x8fe8caa93e74ef6a,
|
|
0xb975d6b6ee39e436,0xb3e2fd538e122b44,
|
|
0xe7d34c64a9c85d44,0x60dbbca87196b616,
|
|
0x90e40fbeea1d3a4a,0xbc8955e946fe31cd,
|
|
0xb51d13aea4a488dd,0x6babab6398bdbe41,
|
|
0xe264589a4dcdab14,0xc696963c7eed2dd1,
|
|
0x8d7eb76070a08aec,0xfc1e1de5cf543ca2,
|
|
0xb0de65388cc8ada8,0x3b25a55f43294bcb,
|
|
0xdd15fe86affad912,0x49ef0eb713f39ebe,
|
|
0x8a2dbf142dfcc7ab,0x6e3569326c784337,
|
|
0xacb92ed9397bf996,0x49c2c37f07965404,
|
|
0xd7e77a8f87daf7fb,0xdc33745ec97be906,
|
|
0x86f0ac99b4e8dafd,0x69a028bb3ded71a3,
|
|
0xa8acd7c0222311bc,0xc40832ea0d68ce0c,
|
|
0xd2d80db02aabd62b,0xf50a3fa490c30190,
|
|
0x83c7088e1aab65db,0x792667c6da79e0fa,
|
|
0xa4b8cab1a1563f52,0x577001b891185938,
|
|
0xcde6fd5e09abcf26,0xed4c0226b55e6f86,
|
|
0x80b05e5ac60b6178,0x544f8158315b05b4,
|
|
0xa0dc75f1778e39d6,0x696361ae3db1c721,
|
|
0xc913936dd571c84c,0x3bc3a19cd1e38e9,
|
|
0xfb5878494ace3a5f,0x4ab48a04065c723,
|
|
0x9d174b2dcec0e47b,0x62eb0d64283f9c76,
|
|
0xc45d1df942711d9a,0x3ba5d0bd324f8394,
|
|
0xf5746577930d6500,0xca8f44ec7ee36479,
|
|
0x9968bf6abbe85f20,0x7e998b13cf4e1ecb,
|
|
0xbfc2ef456ae276e8,0x9e3fedd8c321a67e,
|
|
0xefb3ab16c59b14a2,0xc5cfe94ef3ea101e,
|
|
0x95d04aee3b80ece5,0xbba1f1d158724a12,
|
|
0xbb445da9ca61281f,0x2a8a6e45ae8edc97,
|
|
0xea1575143cf97226,0xf52d09d71a3293bd,
|
|
0x924d692ca61be758,0x593c2626705f9c56,
|
|
0xb6e0c377cfa2e12e,0x6f8b2fb00c77836c,
|
|
0xe498f455c38b997a,0xb6dfb9c0f956447,
|
|
0x8edf98b59a373fec,0x4724bd4189bd5eac,
|
|
0xb2977ee300c50fe7,0x58edec91ec2cb657,
|
|
0xdf3d5e9bc0f653e1,0x2f2967b66737e3ed,
|
|
0x8b865b215899f46c,0xbd79e0d20082ee74,
|
|
0xae67f1e9aec07187,0xecd8590680a3aa11,
|
|
0xda01ee641a708de9,0xe80e6f4820cc9495,
|
|
0x884134fe908658b2,0x3109058d147fdcdd,
|
|
0xaa51823e34a7eede,0xbd4b46f0599fd415,
|
|
0xd4e5e2cdc1d1ea96,0x6c9e18ac7007c91a,
|
|
0x850fadc09923329e,0x3e2cf6bc604ddb0,
|
|
0xa6539930bf6bff45,0x84db8346b786151c,
|
|
0xcfe87f7cef46ff16,0xe612641865679a63,
|
|
0x81f14fae158c5f6e,0x4fcb7e8f3f60c07e,
|
|
0xa26da3999aef7749,0xe3be5e330f38f09d,
|
|
0xcb090c8001ab551c,0x5cadf5bfd3072cc5,
|
|
0xfdcb4fa002162a63,0x73d9732fc7c8f7f6,
|
|
0x9e9f11c4014dda7e,0x2867e7fddcdd9afa,
|
|
0xc646d63501a1511d,0xb281e1fd541501b8,
|
|
0xf7d88bc24209a565,0x1f225a7ca91a4226,
|
|
0x9ae757596946075f,0x3375788de9b06958,
|
|
0xc1a12d2fc3978937,0x52d6b1641c83ae,
|
|
0xf209787bb47d6b84,0xc0678c5dbd23a49a,
|
|
0x9745eb4d50ce6332,0xf840b7ba963646e0,
|
|
0xbd176620a501fbff,0xb650e5a93bc3d898,
|
|
0xec5d3fa8ce427aff,0xa3e51f138ab4cebe,
|
|
0x93ba47c980e98cdf,0xc66f336c36b10137,
|
|
0xb8a8d9bbe123f017,0xb80b0047445d4184,
|
|
0xe6d3102ad96cec1d,0xa60dc059157491e5,
|
|
0x9043ea1ac7e41392,0x87c89837ad68db2f,
|
|
0xb454e4a179dd1877,0x29babe4598c311fb,
|
|
0xe16a1dc9d8545e94,0xf4296dd6fef3d67a,
|
|
0x8ce2529e2734bb1d,0x1899e4a65f58660c,
|
|
0xb01ae745b101e9e4,0x5ec05dcff72e7f8f,
|
|
0xdc21a1171d42645d,0x76707543f4fa1f73,
|
|
0x899504ae72497eba,0x6a06494a791c53a8,
|
|
0xabfa45da0edbde69,0x487db9d17636892,
|
|
0xd6f8d7509292d603,0x45a9d2845d3c42b6,
|
|
0x865b86925b9bc5c2,0xb8a2392ba45a9b2,
|
|
0xa7f26836f282b732,0x8e6cac7768d7141e,
|
|
0xd1ef0244af2364ff,0x3207d795430cd926,
|
|
0x8335616aed761f1f,0x7f44e6bd49e807b8,
|
|
0xa402b9c5a8d3a6e7,0x5f16206c9c6209a6,
|
|
0xcd036837130890a1,0x36dba887c37a8c0f,
|
|
0x802221226be55a64,0xc2494954da2c9789,
|
|
0xa02aa96b06deb0fd,0xf2db9baa10b7bd6c,
|
|
0xc83553c5c8965d3d,0x6f92829494e5acc7,
|
|
0xfa42a8b73abbf48c,0xcb772339ba1f17f9,
|
|
0x9c69a97284b578d7,0xff2a760414536efb,
|
|
0xc38413cf25e2d70d,0xfef5138519684aba,
|
|
0xf46518c2ef5b8cd1,0x7eb258665fc25d69,
|
|
0x98bf2f79d5993802,0xef2f773ffbd97a61,
|
|
0xbeeefb584aff8603,0xaafb550ffacfd8fa,
|
|
0xeeaaba2e5dbf6784,0x95ba2a53f983cf38,
|
|
0x952ab45cfa97a0b2,0xdd945a747bf26183,
|
|
0xba756174393d88df,0x94f971119aeef9e4,
|
|
0xe912b9d1478ceb17,0x7a37cd5601aab85d,
|
|
0x91abb422ccb812ee,0xac62e055c10ab33a,
|
|
0xb616a12b7fe617aa,0x577b986b314d6009,
|
|
0xe39c49765fdf9d94,0xed5a7e85fda0b80b,
|
|
0x8e41ade9fbebc27d,0x14588f13be847307,
|
|
0xb1d219647ae6b31c,0x596eb2d8ae258fc8,
|
|
0xde469fbd99a05fe3,0x6fca5f8ed9aef3bb,
|
|
0x8aec23d680043bee,0x25de7bb9480d5854,
|
|
0xada72ccc20054ae9,0xaf561aa79a10ae6a,
|
|
0xd910f7ff28069da4,0x1b2ba1518094da04,
|
|
0x87aa9aff79042286,0x90fb44d2f05d0842,
|
|
0xa99541bf57452b28,0x353a1607ac744a53,
|
|
0xd3fa922f2d1675f2,0x42889b8997915ce8,
|
|
0x847c9b5d7c2e09b7,0x69956135febada11,
|
|
0xa59bc234db398c25,0x43fab9837e699095,
|
|
0xcf02b2c21207ef2e,0x94f967e45e03f4bb,
|
|
0x8161afb94b44f57d,0x1d1be0eebac278f5,
|
|
0xa1ba1ba79e1632dc,0x6462d92a69731732,
|
|
0xca28a291859bbf93,0x7d7b8f7503cfdcfe,
|
|
0xfcb2cb35e702af78,0x5cda735244c3d43e,
|
|
0x9defbf01b061adab,0x3a0888136afa64a7,
|
|
0xc56baec21c7a1916,0x88aaa1845b8fdd0,
|
|
0xf6c69a72a3989f5b,0x8aad549e57273d45,
|
|
0x9a3c2087a63f6399,0x36ac54e2f678864b,
|
|
0xc0cb28a98fcf3c7f,0x84576a1bb416a7dd,
|
|
0xf0fdf2d3f3c30b9f,0x656d44a2a11c51d5,
|
|
0x969eb7c47859e743,0x9f644ae5a4b1b325,
|
|
0xbc4665b596706114,0x873d5d9f0dde1fee,
|
|
0xeb57ff22fc0c7959,0xa90cb506d155a7ea,
|
|
0x9316ff75dd87cbd8,0x9a7f12442d588f2,
|
|
0xb7dcbf5354e9bece,0xc11ed6d538aeb2f,
|
|
0xe5d3ef282a242e81,0x8f1668c8a86da5fa,
|
|
0x8fa475791a569d10,0xf96e017d694487bc,
|
|
0xb38d92d760ec4455,0x37c981dcc395a9ac,
|
|
0xe070f78d3927556a,0x85bbe253f47b1417,
|
|
0x8c469ab843b89562,0x93956d7478ccec8e,
|
|
0xaf58416654a6babb,0x387ac8d1970027b2,
|
|
0xdb2e51bfe9d0696a,0x6997b05fcc0319e,
|
|
0x88fcf317f22241e2,0x441fece3bdf81f03,
|
|
0xab3c2fddeeaad25a,0xd527e81cad7626c3,
|
|
0xd60b3bd56a5586f1,0x8a71e223d8d3b074,
|
|
0x85c7056562757456,0xf6872d5667844e49,
|
|
0xa738c6bebb12d16c,0xb428f8ac016561db,
|
|
0xd106f86e69d785c7,0xe13336d701beba52,
|
|
0x82a45b450226b39c,0xecc0024661173473,
|
|
0xa34d721642b06084,0x27f002d7f95d0190,
|
|
0xcc20ce9bd35c78a5,0x31ec038df7b441f4,
|
|
0xff290242c83396ce,0x7e67047175a15271,
|
|
0x9f79a169bd203e41,0xf0062c6e984d386,
|
|
0xc75809c42c684dd1,0x52c07b78a3e60868,
|
|
0xf92e0c3537826145,0xa7709a56ccdf8a82,
|
|
0x9bbcc7a142b17ccb,0x88a66076400bb691,
|
|
0xc2abf989935ddbfe,0x6acff893d00ea435,
|
|
0xf356f7ebf83552fe,0x583f6b8c4124d43,
|
|
0x98165af37b2153de,0xc3727a337a8b704a,
|
|
0xbe1bf1b059e9a8d6,0x744f18c0592e4c5c,
|
|
0xeda2ee1c7064130c,0x1162def06f79df73,
|
|
0x9485d4d1c63e8be7,0x8addcb5645ac2ba8,
|
|
0xb9a74a0637ce2ee1,0x6d953e2bd7173692,
|
|
0xe8111c87c5c1ba99,0xc8fa8db6ccdd0437,
|
|
0x910ab1d4db9914a0,0x1d9c9892400a22a2,
|
|
0xb54d5e4a127f59c8,0x2503beb6d00cab4b,
|
|
0xe2a0b5dc971f303a,0x2e44ae64840fd61d,
|
|
0x8da471a9de737e24,0x5ceaecfed289e5d2,
|
|
0xb10d8e1456105dad,0x7425a83e872c5f47,
|
|
0xdd50f1996b947518,0xd12f124e28f77719,
|
|
0x8a5296ffe33cc92f,0x82bd6b70d99aaa6f,
|
|
0xace73cbfdc0bfb7b,0x636cc64d1001550b,
|
|
0xd8210befd30efa5a,0x3c47f7e05401aa4e,
|
|
0x8714a775e3e95c78,0x65acfaec34810a71,
|
|
0xa8d9d1535ce3b396,0x7f1839a741a14d0d,
|
|
0xd31045a8341ca07c,0x1ede48111209a050,
|
|
0x83ea2b892091e44d,0x934aed0aab460432,
|
|
0xa4e4b66b68b65d60,0xf81da84d5617853f,
|
|
0xce1de40642e3f4b9,0x36251260ab9d668e,
|
|
0x80d2ae83e9ce78f3,0xc1d72b7c6b426019,
|
|
0xa1075a24e4421730,0xb24cf65b8612f81f,
|
|
0xc94930ae1d529cfc,0xdee033f26797b627,
|
|
0xfb9b7cd9a4a7443c,0x169840ef017da3b1,
|
|
0x9d412e0806e88aa5,0x8e1f289560ee864e,
|
|
0xc491798a08a2ad4e,0xf1a6f2bab92a27e2,
|
|
0xf5b5d7ec8acb58a2,0xae10af696774b1db,
|
|
0x9991a6f3d6bf1765,0xacca6da1e0a8ef29,
|
|
0xbff610b0cc6edd3f,0x17fd090a58d32af3,
|
|
0xeff394dcff8a948e,0xddfc4b4cef07f5b0,
|
|
0x95f83d0a1fb69cd9,0x4abdaf101564f98e,
|
|
0xbb764c4ca7a4440f,0x9d6d1ad41abe37f1,
|
|
0xea53df5fd18d5513,0x84c86189216dc5ed,
|
|
0x92746b9be2f8552c,0x32fd3cf5b4e49bb4,
|
|
0xb7118682dbb66a77,0x3fbc8c33221dc2a1,
|
|
0xe4d5e82392a40515,0xfabaf3feaa5334a,
|
|
0x8f05b1163ba6832d,0x29cb4d87f2a7400e,
|
|
0xb2c71d5bca9023f8,0x743e20e9ef511012,
|
|
0xdf78e4b2bd342cf6,0x914da9246b255416,
|
|
0x8bab8eefb6409c1a,0x1ad089b6c2f7548e,
|
|
0xae9672aba3d0c320,0xa184ac2473b529b1,
|
|
0xda3c0f568cc4f3e8,0xc9e5d72d90a2741e,
|
|
0x8865899617fb1871,0x7e2fa67c7a658892,
|
|
0xaa7eebfb9df9de8d,0xddbb901b98feeab7,
|
|
0xd51ea6fa85785631,0x552a74227f3ea565,
|
|
0x8533285c936b35de,0xd53a88958f87275f,
|
|
0xa67ff273b8460356,0x8a892abaf368f137,
|
|
0xd01fef10a657842c,0x2d2b7569b0432d85,
|
|
0x8213f56a67f6b29b,0x9c3b29620e29fc73,
|
|
0xa298f2c501f45f42,0x8349f3ba91b47b8f,
|
|
0xcb3f2f7642717713,0x241c70a936219a73,
|
|
0xfe0efb53d30dd4d7,0xed238cd383aa0110,
|
|
0x9ec95d1463e8a506,0xf4363804324a40aa,
|
|
0xc67bb4597ce2ce48,0xb143c6053edcd0d5,
|
|
0xf81aa16fdc1b81da,0xdd94b7868e94050a,
|
|
0x9b10a4e5e9913128,0xca7cf2b4191c8326,
|
|
0xc1d4ce1f63f57d72,0xfd1c2f611f63a3f0,
|
|
0xf24a01a73cf2dccf,0xbc633b39673c8cec,
|
|
0x976e41088617ca01,0xd5be0503e085d813,
|
|
0xbd49d14aa79dbc82,0x4b2d8644d8a74e18,
|
|
0xec9c459d51852ba2,0xddf8e7d60ed1219e,
|
|
0x93e1ab8252f33b45,0xcabb90e5c942b503,
|
|
0xb8da1662e7b00a17,0x3d6a751f3b936243,
|
|
0xe7109bfba19c0c9d,0xcc512670a783ad4,
|
|
0x906a617d450187e2,0x27fb2b80668b24c5,
|
|
0xb484f9dc9641e9da,0xb1f9f660802dedf6,
|
|
0xe1a63853bbd26451,0x5e7873f8a0396973,
|
|
0x8d07e33455637eb2,0xdb0b487b6423e1e8,
|
|
0xb049dc016abc5e5f,0x91ce1a9a3d2cda62,
|
|
0xdc5c5301c56b75f7,0x7641a140cc7810fb,
|
|
0x89b9b3e11b6329ba,0xa9e904c87fcb0a9d,
|
|
0xac2820d9623bf429,0x546345fa9fbdcd44,
|
|
0xd732290fbacaf133,0xa97c177947ad4095,
|
|
0x867f59a9d4bed6c0,0x49ed8eabcccc485d,
|
|
0xa81f301449ee8c70,0x5c68f256bfff5a74,
|
|
0xd226fc195c6a2f8c,0x73832eec6fff3111,
|
|
0x83585d8fd9c25db7,0xc831fd53c5ff7eab,
|
|
0xa42e74f3d032f525,0xba3e7ca8b77f5e55,
|
|
0xcd3a1230c43fb26f,0x28ce1bd2e55f35eb,
|
|
0x80444b5e7aa7cf85,0x7980d163cf5b81b3,
|
|
0xa0555e361951c366,0xd7e105bcc332621f,
|
|
0xc86ab5c39fa63440,0x8dd9472bf3fefaa7,
|
|
0xfa856334878fc150,0xb14f98f6f0feb951,
|
|
0x9c935e00d4b9d8d2,0x6ed1bf9a569f33d3,
|
|
0xc3b8358109e84f07,0xa862f80ec4700c8,
|
|
0xf4a642e14c6262c8,0xcd27bb612758c0fa,
|
|
0x98e7e9cccfbd7dbd,0x8038d51cb897789c,
|
|
0xbf21e44003acdd2c,0xe0470a63e6bd56c3,
|
|
0xeeea5d5004981478,0x1858ccfce06cac74,
|
|
0x95527a5202df0ccb,0xf37801e0c43ebc8,
|
|
0xbaa718e68396cffd,0xd30560258f54e6ba,
|
|
0xe950df20247c83fd,0x47c6b82ef32a2069,
|
|
0x91d28b7416cdd27e,0x4cdc331d57fa5441,
|
|
0xb6472e511c81471d,0xe0133fe4adf8e952,
|
|
0xe3d8f9e563a198e5,0x58180fddd97723a6,
|
|
0x8e679c2f5e44ff8f,0x570f09eaa7ea7648,};
|
|
|
|
#endif // SIMDJSON_SRC_NUMBERPARSING_TABLES_CPP
|
|
/* end file internal/numberparsing_tables.cpp */
|
|
/* including internal/simdprune_tables.cpp: #include <internal/simdprune_tables.cpp> */
|
|
/* begin file internal/simdprune_tables.cpp */
|
|
#ifndef SIMDJSON_SRC_SIMDPRUNE_TABLES_CPP
|
|
#define SIMDJSON_SRC_SIMDPRUNE_TABLES_CPP
|
|
|
|
/* including simdjson/implementation_detection.h: #include <simdjson/implementation_detection.h> */
|
|
/* begin file simdjson/implementation_detection.h */
|
|
#ifndef SIMDJSON_IMPLEMENTATION_DETECTION_H
|
|
#define SIMDJSON_IMPLEMENTATION_DETECTION_H
|
|
|
|
/* skipped duplicate #include "simdjson/base.h" */
|
|
|
|
// Numeric identifiers for each implementation, so that implementation names can be
// compared inside preprocessor conditionals.
// 0 is reserved, because undefined SIMDJSON_IMPLEMENTATION equals 0 in preprocessor macros.
#define SIMDJSON_IMPLEMENTATION_ID_arm64 1
#define SIMDJSON_IMPLEMENTATION_ID_fallback 2
#define SIMDJSON_IMPLEMENTATION_ID_haswell 3
#define SIMDJSON_IMPLEMENTATION_ID_icelake 4
#define SIMDJSON_IMPLEMENTATION_ID_ppc64 5
#define SIMDJSON_IMPLEMENTATION_ID_westmere 6
#define SIMDJSON_IMPLEMENTATION_ID_lsx 7
#define SIMDJSON_IMPLEMENTATION_ID_lasx 8

// Maps an implementation name (e.g. haswell) to its numeric identifier above.
#define SIMDJSON_IMPLEMENTATION_ID_FOR(IMPL) SIMDJSON_CAT(SIMDJSON_IMPLEMENTATION_ID_, IMPL)
// Numeric identifier of the implementation named by SIMDJSON_IMPLEMENTATION.
#define SIMDJSON_IMPLEMENTATION_ID SIMDJSON_IMPLEMENTATION_ID_FOR(SIMDJSON_IMPLEMENTATION)

// True when SIMDJSON_IMPLEMENTATION names IMPL. The expansion is parenthesized so that the
// macro composes safely with operators that bind tighter than ==, such as a leading '!'.
#define SIMDJSON_IMPLEMENTATION_IS(IMPL) (SIMDJSON_IMPLEMENTATION_ID == SIMDJSON_IMPLEMENTATION_ID_FOR(IMPL))
|
|
|
|
//
|
|
// First, figure out which implementations can be run. Doing it here makes it so we don't have to worry about the order
|
|
// in which we include them.
|
|
//
|
|
|
|
#ifndef SIMDJSON_IMPLEMENTATION_ARM64
|
|
#define SIMDJSON_IMPLEMENTATION_ARM64 (SIMDJSON_IS_ARM64)
|
|
#endif
|
|
#if SIMDJSON_IMPLEMENTATION_ARM64 && SIMDJSON_IS_ARM64
|
|
#define SIMDJSON_CAN_ALWAYS_RUN_ARM64 1
|
|
#else
|
|
#define SIMDJSON_CAN_ALWAYS_RUN_ARM64 0
|
|
#endif
|
|
|
|
// Default Icelake to on if this is x86-64. Even if we're not compiled for it, it could be selected
|
|
// at runtime.
|
|
#ifndef SIMDJSON_IMPLEMENTATION_ICELAKE
|
|
#define SIMDJSON_IMPLEMENTATION_ICELAKE ((SIMDJSON_IS_X86_64) && (SIMDJSON_AVX512_ALLOWED) && (SIMDJSON_COMPILER_SUPPORTS_VBMI2))
|
|
#endif
|
|
|
|
#ifdef _MSC_VER
|
|
// To see why (__BMI__) && (__PCLMUL__) && (__LZCNT__) are not part of this next line, see
|
|
// https://github.com/simdjson/simdjson/issues/1247
|
|
#if ((SIMDJSON_IMPLEMENTATION_ICELAKE) && (__AVX2__) && (__AVX512F__) && (__AVX512DQ__) && (__AVX512CD__) && (__AVX512BW__) && (__AVX512VL__) && (__AVX512VBMI2__))
|
|
#define SIMDJSON_CAN_ALWAYS_RUN_ICELAKE 1
|
|
#else
|
|
#define SIMDJSON_CAN_ALWAYS_RUN_ICELAKE 0
|
|
#endif
|
|
|
|
#else
|
|
|
|
#if ((SIMDJSON_IMPLEMENTATION_ICELAKE) && (__AVX2__) && (__BMI__) && (__PCLMUL__) && (__LZCNT__) && (__AVX512F__) && (__AVX512DQ__) && (__AVX512CD__) && (__AVX512BW__) && (__AVX512VL__) && (__AVX512VBMI2__))
|
|
#define SIMDJSON_CAN_ALWAYS_RUN_ICELAKE 1
|
|
#else
|
|
#define SIMDJSON_CAN_ALWAYS_RUN_ICELAKE 0
|
|
#endif
|
|
|
|
#endif
|
|
|
|
// Default Haswell to on if this is x86-64. Even if we're not compiled for it, it could be selected
|
|
// at runtime.
|
|
#ifndef SIMDJSON_IMPLEMENTATION_HASWELL
|
|
#if SIMDJSON_CAN_ALWAYS_RUN_ICELAKE
|
|
// if icelake is always available, never enable haswell.
|
|
#define SIMDJSON_IMPLEMENTATION_HASWELL 0
|
|
#else
|
|
#define SIMDJSON_IMPLEMENTATION_HASWELL SIMDJSON_IS_X86_64
|
|
#endif
|
|
#endif
|
|
#ifdef _MSC_VER
|
|
// To see why (__BMI__) && (__PCLMUL__) && (__LZCNT__) are not part of this next line, see
|
|
// https://github.com/simdjson/simdjson/issues/1247
|
|
#if ((SIMDJSON_IMPLEMENTATION_HASWELL) && (SIMDJSON_IS_X86_64) && (__AVX2__))
|
|
#define SIMDJSON_CAN_ALWAYS_RUN_HASWELL 1
|
|
#else
|
|
#define SIMDJSON_CAN_ALWAYS_RUN_HASWELL 0
|
|
#endif
|
|
|
|
#else
|
|
|
|
#if ((SIMDJSON_IMPLEMENTATION_HASWELL) && (SIMDJSON_IS_X86_64) && (__AVX2__) && (__BMI__) && (__PCLMUL__) && (__LZCNT__))
|
|
#define SIMDJSON_CAN_ALWAYS_RUN_HASWELL 1
|
|
#else
|
|
#define SIMDJSON_CAN_ALWAYS_RUN_HASWELL 0
|
|
#endif
|
|
|
|
#endif
|
|
|
|
// Default Westmere to on if this is x86-64.
|
|
#ifndef SIMDJSON_IMPLEMENTATION_WESTMERE
|
|
#if SIMDJSON_CAN_ALWAYS_RUN_ICELAKE || SIMDJSON_CAN_ALWAYS_RUN_HASWELL
|
|
// if icelake or haswell are always available, never enable westmere.
|
|
#define SIMDJSON_IMPLEMENTATION_WESTMERE 0
|
|
#else
|
|
#define SIMDJSON_IMPLEMENTATION_WESTMERE SIMDJSON_IS_X86_64
|
|
#endif
|
|
#endif
|
|
|
|
#if (SIMDJSON_IMPLEMENTATION_WESTMERE && SIMDJSON_IS_X86_64 && __SSE4_2__ && __PCLMUL__)
|
|
#define SIMDJSON_CAN_ALWAYS_RUN_WESTMERE 1
|
|
#else
|
|
#define SIMDJSON_CAN_ALWAYS_RUN_WESTMERE 0
|
|
#endif
|
|
|
|
|
|
#ifndef SIMDJSON_IMPLEMENTATION_PPC64
|
|
#define SIMDJSON_IMPLEMENTATION_PPC64 (SIMDJSON_IS_PPC64 && SIMDJSON_IS_PPC64_VMX)
|
|
#endif
|
|
#if SIMDJSON_IMPLEMENTATION_PPC64 && SIMDJSON_IS_PPC64 && SIMDJSON_IS_PPC64_VMX
|
|
#define SIMDJSON_CAN_ALWAYS_RUN_PPC64 1
|
|
#else
|
|
#define SIMDJSON_CAN_ALWAYS_RUN_PPC64 0
|
|
#endif
|
|
|
|
#ifndef SIMDJSON_IMPLEMENTATION_LASX
|
|
#define SIMDJSON_IMPLEMENTATION_LASX (SIMDJSON_IS_LOONGARCH64 && __loongarch_asx)
|
|
#endif
|
|
#define SIMDJSON_CAN_ALWAYS_RUN_LASX (SIMDJSON_IMPLEMENTATION_LASX)
|
|
|
|
#ifndef SIMDJSON_IMPLEMENTATION_LSX
|
|
#if SIMDJSON_CAN_ALWAYS_RUN_LASX
|
|
#define SIMDJSON_IMPLEMENTATION_LSX 0
|
|
#else
|
|
#define SIMDJSON_IMPLEMENTATION_LSX (SIMDJSON_IS_LOONGARCH64 && __loongarch_sx)
|
|
#endif
|
|
#endif
|
|
#define SIMDJSON_CAN_ALWAYS_RUN_LSX (SIMDJSON_IMPLEMENTATION_LSX)
|
|
|
|
// Default Fallback to on unless a builtin implementation has already been selected.
|
|
#ifndef SIMDJSON_IMPLEMENTATION_FALLBACK
|
|
#if SIMDJSON_CAN_ALWAYS_RUN_ARM64 || SIMDJSON_CAN_ALWAYS_RUN_ICELAKE || SIMDJSON_CAN_ALWAYS_RUN_HASWELL || SIMDJSON_CAN_ALWAYS_RUN_WESTMERE || SIMDJSON_CAN_ALWAYS_RUN_PPC64 || SIMDJSON_CAN_ALWAYS_RUN_LSX || SIMDJSON_CAN_ALWAYS_RUN_LASX
|
|
// if anything at all except fallback can always run, then disable fallback.
|
|
#define SIMDJSON_IMPLEMENTATION_FALLBACK 0
|
|
#else
|
|
#define SIMDJSON_IMPLEMENTATION_FALLBACK 1
|
|
#endif
|
|
#endif
|
|
#define SIMDJSON_CAN_ALWAYS_RUN_FALLBACK SIMDJSON_IMPLEMENTATION_FALLBACK
|
|
|
|
// Determine the best builtin implementation
|
|
#ifndef SIMDJSON_BUILTIN_IMPLEMENTATION
|
|
|
|
#if SIMDJSON_CAN_ALWAYS_RUN_ICELAKE
|
|
#define SIMDJSON_BUILTIN_IMPLEMENTATION icelake
|
|
#elif SIMDJSON_CAN_ALWAYS_RUN_HASWELL
|
|
#define SIMDJSON_BUILTIN_IMPLEMENTATION haswell
|
|
#elif SIMDJSON_CAN_ALWAYS_RUN_WESTMERE
|
|
#define SIMDJSON_BUILTIN_IMPLEMENTATION westmere
|
|
#elif SIMDJSON_CAN_ALWAYS_RUN_ARM64
|
|
#define SIMDJSON_BUILTIN_IMPLEMENTATION arm64
|
|
#elif SIMDJSON_CAN_ALWAYS_RUN_PPC64
|
|
#define SIMDJSON_BUILTIN_IMPLEMENTATION ppc64
|
|
#elif SIMDJSON_CAN_ALWAYS_RUN_LSX
|
|
#define SIMDJSON_BUILTIN_IMPLEMENTATION lsx
|
|
#elif SIMDJSON_CAN_ALWAYS_RUN_LASX
|
|
#define SIMDJSON_BUILTIN_IMPLEMENTATION lasx
|
|
#elif SIMDJSON_CAN_ALWAYS_RUN_FALLBACK
|
|
#define SIMDJSON_BUILTIN_IMPLEMENTATION fallback
|
|
#else
|
|
#error "All possible implementations (including fallback) have been disabled! simdjson will not run."
|
|
#endif
|
|
|
|
#endif // SIMDJSON_BUILTIN_IMPLEMENTATION
|
|
|
|
// Numeric identifier of the implementation named by SIMDJSON_BUILTIN_IMPLEMENTATION.
#define SIMDJSON_BUILTIN_IMPLEMENTATION_ID SIMDJSON_IMPLEMENTATION_ID_FOR(SIMDJSON_BUILTIN_IMPLEMENTATION)
// True when the builtin implementation is IMPL. The expansion is parenthesized so that the
// macro composes safely with operators that bind tighter than ==, such as a leading '!'.
#define SIMDJSON_BUILTIN_IMPLEMENTATION_IS(IMPL) (SIMDJSON_BUILTIN_IMPLEMENTATION_ID == SIMDJSON_IMPLEMENTATION_ID_FOR(IMPL))
|
|
|
|
#endif // SIMDJSON_IMPLEMENTATION_DETECTION_H
|
|
/* end file simdjson/implementation_detection.h */
|
|
|
|
#if SIMDJSON_IMPLEMENTATION_ARM64 || SIMDJSON_IMPLEMENTATION_ICELAKE || SIMDJSON_IMPLEMENTATION_HASWELL || SIMDJSON_IMPLEMENTATION_WESTMERE || SIMDJSON_IMPLEMENTATION_PPC64 || SIMDJSON_IMPLEMENTATION_LSX || SIMDJSON_IMPLEMENTATION_LASX
|
|
|
|
#include <cstdint>
|
|
|
|
namespace simdjson { // table modified and copied from
|
|
namespace internal { // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetTable
|
|
SIMDJSON_DLLIMPORTEXPORT const unsigned char BitsSetTable256mul2[256] = {
|
|
0, 2, 2, 4, 2, 4, 4, 6, 2, 4, 4, 6, 4, 6, 6, 8, 2, 4, 4,
|
|
6, 4, 6, 6, 8, 4, 6, 6, 8, 6, 8, 8, 10, 2, 4, 4, 6, 4, 6,
|
|
6, 8, 4, 6, 6, 8, 6, 8, 8, 10, 4, 6, 6, 8, 6, 8, 8, 10, 6,
|
|
8, 8, 10, 8, 10, 10, 12, 2, 4, 4, 6, 4, 6, 6, 8, 4, 6, 6, 8,
|
|
6, 8, 8, 10, 4, 6, 6, 8, 6, 8, 8, 10, 6, 8, 8, 10, 8, 10, 10,
|
|
12, 4, 6, 6, 8, 6, 8, 8, 10, 6, 8, 8, 10, 8, 10, 10, 12, 6, 8,
|
|
8, 10, 8, 10, 10, 12, 8, 10, 10, 12, 10, 12, 12, 14, 2, 4, 4, 6, 4,
|
|
6, 6, 8, 4, 6, 6, 8, 6, 8, 8, 10, 4, 6, 6, 8, 6, 8, 8, 10,
|
|
6, 8, 8, 10, 8, 10, 10, 12, 4, 6, 6, 8, 6, 8, 8, 10, 6, 8, 8,
|
|
10, 8, 10, 10, 12, 6, 8, 8, 10, 8, 10, 10, 12, 8, 10, 10, 12, 10, 12,
|
|
12, 14, 4, 6, 6, 8, 6, 8, 8, 10, 6, 8, 8, 10, 8, 10, 10, 12, 6,
|
|
8, 8, 10, 8, 10, 10, 12, 8, 10, 10, 12, 10, 12, 12, 14, 6, 8, 8, 10,
|
|
8, 10, 10, 12, 8, 10, 10, 12, 10, 12, 12, 14, 8, 10, 10, 12, 10, 12, 12,
|
|
14, 10, 12, 12, 14, 12, 14, 14, 16};
|
|
|
|
SIMDJSON_DLLIMPORTEXPORT const uint8_t pshufb_combine_table[272] = {
|
|
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b,
|
|
0x0c, 0x0d, 0x0e, 0x0f, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x08,
|
|
0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0x00, 0x01, 0x02, 0x03,
|
|
0x04, 0x05, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff,
|
|
0x00, 0x01, 0x02, 0x03, 0x04, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
|
|
0x0f, 0xff, 0xff, 0xff, 0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
|
|
0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0x00, 0x01, 0x02, 0x08,
|
|
0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff,
|
|
0x00, 0x01, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff,
|
|
0xff, 0xff, 0xff, 0xff, 0x00, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
|
|
0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x08, 0x09, 0x0a, 0x0b,
|
|
0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
|
};
|
|
|
|
// 256 * 8 bytes = 2kB, easily fits in cache.
|
|
SIMDJSON_DLLIMPORTEXPORT const uint64_t thintable_epi8[256] = {
|
|
0x0706050403020100, 0x0007060504030201, 0x0007060504030200,
|
|
0x0000070605040302, 0x0007060504030100, 0x0000070605040301,
|
|
0x0000070605040300, 0x0000000706050403, 0x0007060504020100,
|
|
0x0000070605040201, 0x0000070605040200, 0x0000000706050402,
|
|
0x0000070605040100, 0x0000000706050401, 0x0000000706050400,
|
|
0x0000000007060504, 0x0007060503020100, 0x0000070605030201,
|
|
0x0000070605030200, 0x0000000706050302, 0x0000070605030100,
|
|
0x0000000706050301, 0x0000000706050300, 0x0000000007060503,
|
|
0x0000070605020100, 0x0000000706050201, 0x0000000706050200,
|
|
0x0000000007060502, 0x0000000706050100, 0x0000000007060501,
|
|
0x0000000007060500, 0x0000000000070605, 0x0007060403020100,
|
|
0x0000070604030201, 0x0000070604030200, 0x0000000706040302,
|
|
0x0000070604030100, 0x0000000706040301, 0x0000000706040300,
|
|
0x0000000007060403, 0x0000070604020100, 0x0000000706040201,
|
|
0x0000000706040200, 0x0000000007060402, 0x0000000706040100,
|
|
0x0000000007060401, 0x0000000007060400, 0x0000000000070604,
|
|
0x0000070603020100, 0x0000000706030201, 0x0000000706030200,
|
|
0x0000000007060302, 0x0000000706030100, 0x0000000007060301,
|
|
0x0000000007060300, 0x0000000000070603, 0x0000000706020100,
|
|
0x0000000007060201, 0x0000000007060200, 0x0000000000070602,
|
|
0x0000000007060100, 0x0000000000070601, 0x0000000000070600,
|
|
0x0000000000000706, 0x0007050403020100, 0x0000070504030201,
|
|
0x0000070504030200, 0x0000000705040302, 0x0000070504030100,
|
|
0x0000000705040301, 0x0000000705040300, 0x0000000007050403,
|
|
0x0000070504020100, 0x0000000705040201, 0x0000000705040200,
|
|
0x0000000007050402, 0x0000000705040100, 0x0000000007050401,
|
|
0x0000000007050400, 0x0000000000070504, 0x0000070503020100,
|
|
0x0000000705030201, 0x0000000705030200, 0x0000000007050302,
|
|
0x0000000705030100, 0x0000000007050301, 0x0000000007050300,
|
|
0x0000000000070503, 0x0000000705020100, 0x0000000007050201,
|
|
0x0000000007050200, 0x0000000000070502, 0x0000000007050100,
|
|
0x0000000000070501, 0x0000000000070500, 0x0000000000000705,
|
|
0x0000070403020100, 0x0000000704030201, 0x0000000704030200,
|
|
0x0000000007040302, 0x0000000704030100, 0x0000000007040301,
|
|
0x0000000007040300, 0x0000000000070403, 0x0000000704020100,
|
|
0x0000000007040201, 0x0000000007040200, 0x0000000000070402,
|
|
0x0000000007040100, 0x0000000000070401, 0x0000000000070400,
|
|
0x0000000000000704, 0x0000000703020100, 0x0000000007030201,
|
|
0x0000000007030200, 0x0000000000070302, 0x0000000007030100,
|
|
0x0000000000070301, 0x0000000000070300, 0x0000000000000703,
|
|
0x0000000007020100, 0x0000000000070201, 0x0000000000070200,
|
|
0x0000000000000702, 0x0000000000070100, 0x0000000000000701,
|
|
0x0000000000000700, 0x0000000000000007, 0x0006050403020100,
|
|
0x0000060504030201, 0x0000060504030200, 0x0000000605040302,
|
|
0x0000060504030100, 0x0000000605040301, 0x0000000605040300,
|
|
0x0000000006050403, 0x0000060504020100, 0x0000000605040201,
|
|
0x0000000605040200, 0x0000000006050402, 0x0000000605040100,
|
|
0x0000000006050401, 0x0000000006050400, 0x0000000000060504,
|
|
0x0000060503020100, 0x0000000605030201, 0x0000000605030200,
|
|
0x0000000006050302, 0x0000000605030100, 0x0000000006050301,
|
|
0x0000000006050300, 0x0000000000060503, 0x0000000605020100,
|
|
0x0000000006050201, 0x0000000006050200, 0x0000000000060502,
|
|
0x0000000006050100, 0x0000000000060501, 0x0000000000060500,
|
|
0x0000000000000605, 0x0000060403020100, 0x0000000604030201,
|
|
0x0000000604030200, 0x0000000006040302, 0x0000000604030100,
|
|
0x0000000006040301, 0x0000000006040300, 0x0000000000060403,
|
|
0x0000000604020100, 0x0000000006040201, 0x0000000006040200,
|
|
0x0000000000060402, 0x0000000006040100, 0x0000000000060401,
|
|
0x0000000000060400, 0x0000000000000604, 0x0000000603020100,
|
|
0x0000000006030201, 0x0000000006030200, 0x0000000000060302,
|
|
0x0000000006030100, 0x0000000000060301, 0x0000000000060300,
|
|
0x0000000000000603, 0x0000000006020100, 0x0000000000060201,
|
|
0x0000000000060200, 0x0000000000000602, 0x0000000000060100,
|
|
0x0000000000000601, 0x0000000000000600, 0x0000000000000006,
|
|
0x0000050403020100, 0x0000000504030201, 0x0000000504030200,
|
|
0x0000000005040302, 0x0000000504030100, 0x0000000005040301,
|
|
0x0000000005040300, 0x0000000000050403, 0x0000000504020100,
|
|
0x0000000005040201, 0x0000000005040200, 0x0000000000050402,
|
|
0x0000000005040100, 0x0000000000050401, 0x0000000000050400,
|
|
0x0000000000000504, 0x0000000503020100, 0x0000000005030201,
|
|
0x0000000005030200, 0x0000000000050302, 0x0000000005030100,
|
|
0x0000000000050301, 0x0000000000050300, 0x0000000000000503,
|
|
0x0000000005020100, 0x0000000000050201, 0x0000000000050200,
|
|
0x0000000000000502, 0x0000000000050100, 0x0000000000000501,
|
|
0x0000000000000500, 0x0000000000000005, 0x0000000403020100,
|
|
0x0000000004030201, 0x0000000004030200, 0x0000000000040302,
|
|
0x0000000004030100, 0x0000000000040301, 0x0000000000040300,
|
|
0x0000000000000403, 0x0000000004020100, 0x0000000000040201,
|
|
0x0000000000040200, 0x0000000000000402, 0x0000000000040100,
|
|
0x0000000000000401, 0x0000000000000400, 0x0000000000000004,
|
|
0x0000000003020100, 0x0000000000030201, 0x0000000000030200,
|
|
0x0000000000000302, 0x0000000000030100, 0x0000000000000301,
|
|
0x0000000000000300, 0x0000000000000003, 0x0000000000020100,
|
|
0x0000000000000201, 0x0000000000000200, 0x0000000000000002,
|
|
0x0000000000000100, 0x0000000000000001, 0x0000000000000000,
|
|
0x0000000000000000,
|
|
}; //static uint64_t thintable_epi8[256]
|
|
|
|
} // namespace internal
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_IMPLEMENTATION_ARM64 || SIMDJSON_IMPLEMENTATION_ICELAKE || SIMDJSON_IMPLEMENTATION_HASWELL || SIMDJSON_IMPLEMENTATION_WESTMERE || SIMDJSON_IMPLEMENTATION_PPC64 || SIMDJSON_IMPLEMENTATION_LSX || SIMDJSON_IMPLEMENTATION_LASX
|
|
|
|
#endif // SIMDJSON_SRC_SIMDPRUNE_TABLES_CPP
|
|
/* end file internal/simdprune_tables.cpp */
|
|
|
|
/* including simdjson/generic/dependencies.h: #include <simdjson/generic/dependencies.h> */
|
|
/* begin file simdjson/generic/dependencies.h */
|
|
#ifdef SIMDJSON_CONDITIONAL_INCLUDE
|
|
#error simdjson/generic/dependencies.h must be included before defining SIMDJSON_CONDITIONAL_INCLUDE!
|
|
#endif
|
|
|
|
#ifndef SIMDJSON_GENERIC_DEPENDENCIES_H
|
|
#define SIMDJSON_GENERIC_DEPENDENCIES_H
|
|
|
|
// Internal headers needed for generics.
|
|
// All includes referencing simdjson headers *not* under simdjson/generic must be here!
|
|
// Otherwise, amalgamation will fail.
|
|
/* skipped duplicate #include "simdjson/base.h" */
|
|
/* including simdjson/implementation.h: #include "simdjson/implementation.h" */
|
|
/* begin file simdjson/implementation.h */
|
|
#ifndef SIMDJSON_IMPLEMENTATION_H
|
|
#define SIMDJSON_IMPLEMENTATION_H
|
|
|
|
/* including simdjson/internal/atomic_ptr.h: #include "simdjson/internal/atomic_ptr.h" */
|
|
/* begin file simdjson/internal/atomic_ptr.h */
|
|
#ifndef SIMDJSON_INTERNAL_ATOMIC_PTR_H
|
|
#define SIMDJSON_INTERNAL_ATOMIC_PTR_H
|
|
|
|
/* skipped duplicate #include "simdjson/base.h" */
|
|
#include <atomic>
|
|
|
|
namespace simdjson {
|
|
namespace internal {
|
|
|
|
/**
 * Pointer-like wrapper around a std::atomic<T*>.
 *
 * Every accessor reads the stored pointer with an atomic load (default memory ordering),
 * and assignment performs an atomic store. Only the pointer value itself is accessed
 * atomically; the object it points to is not protected by this class.
 */
template<typename T>
class atomic_ptr {
public:
  /** Wrap the given raw pointer. Intentionally implicit so a T* can initialize an atomic_ptr. */
  atomic_ptr(T *_ptr) : ptr{_ptr} {}

  /** Read-only access: atomically load the pointer and expose it (or its pointee) as const. */
  operator const T*() const { return ptr.load(); }
  const T& operator*() const { return *ptr.load(); }
  const T* operator->() const { return ptr.load(); }

  /** Mutable access: atomically load the pointer and expose it (or its pointee). */
  operator T*() { return ptr.load(); }
  T& operator*() { return *ptr.load(); }
  T* operator->() { return ptr.load(); }
  /** Atomically replace the stored pointer. */
  atomic_ptr& operator=(T *_ptr) { ptr = _ptr; return *this; }

private:
  std::atomic<T*> ptr;
};
|
|
|
|
} // namespace internal
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_INTERNAL_ATOMIC_PTR_H
|
|
/* end file simdjson/internal/atomic_ptr.h */
|
|
/* including simdjson/internal/dom_parser_implementation.h: #include "simdjson/internal/dom_parser_implementation.h" */
|
|
/* begin file simdjson/internal/dom_parser_implementation.h */
|
|
#ifndef SIMDJSON_INTERNAL_DOM_PARSER_IMPLEMENTATION_H
|
|
#define SIMDJSON_INTERNAL_DOM_PARSER_IMPLEMENTATION_H
|
|
|
|
/* skipped duplicate #include "simdjson/base.h" */
|
|
/* skipped duplicate #include "simdjson/error.h" */
|
|
#include <memory>
|
|
|
|
namespace simdjson {
|
|
|
|
namespace dom {
|
|
class document;
|
|
} // namespace dom
|
|
|
|
/**
 * This enum is used with the dom_parser_implementation::stage1 function.
 * 1) The regular mode expects a fully formed JSON document.
 * 2) The streaming_partial mode expects a possibly truncated
 * input within a stream of JSON documents.
 * 3) The streaming_final mode allows us to truncate final
 * unterminated strings. It is useful in conjunction with streaming_partial.
 */
enum class stage1_mode { regular, streaming_partial, streaming_final};
|
|
|
|
/**
|
|
* Returns true if mode == streaming_partial or mode == streaming_final
|
|
*/
|
|
inline bool is_streaming(stage1_mode mode) {
|
|
// performance note: it is probably faster to check that mode is different
|
|
// from regular than checking that it is either streaming_partial or streaming_final.
|
|
return (mode != stage1_mode::regular);
|
|
// return (mode == stage1_mode::streaming_partial || mode == stage1_mode::streaming_final);
|
|
}
|
|
|
|
|
|
namespace internal {
|
|
|
|
|
|
/**
|
|
* An implementation of simdjson's DOM parser for a particular CPU architecture.
|
|
*
|
|
* This class is expected to be accessed only by pointer, and never move in memory (though the
|
|
* pointer can move).
|
|
*/
|
|
class dom_parser_implementation {
|
|
public:
|
|
|
|
/**
|
|
* @private For internal implementation use
|
|
*
|
|
* Run a full JSON parse on a single document (stage1 + stage2).
|
|
*
|
|
* Guaranteed only to be called when capacity > document length.
|
|
*
|
|
* Overridden by each implementation.
|
|
*
|
|
* @param buf The json document to parse. *MUST* be allocated up to len + SIMDJSON_PADDING bytes.
|
|
* @param len The length of the json document.
|
|
* @return The error code, or SUCCESS if there was no error.
|
|
*/
|
|
simdjson_warn_unused virtual error_code parse(const uint8_t *buf, size_t len, dom::document &doc) noexcept = 0;
|
|
|
|
/**
|
|
* @private For internal implementation use
|
|
*
|
|
* Stage 1 of the document parser.
|
|
*
|
|
* Guaranteed only to be called when capacity > document length.
|
|
*
|
|
* Overridden by each implementation.
|
|
*
|
|
* @param buf The json document to parse.
|
|
* @param len The length of the json document.
|
|
* @param streaming Whether this is being called by parser::parse_many.
|
|
* @return The error code, or SUCCESS if there was no error.
|
|
*/
|
|
simdjson_warn_unused virtual error_code stage1(const uint8_t *buf, size_t len, stage1_mode streaming) noexcept = 0;
|
|
|
|
/**
|
|
* @private For internal implementation use
|
|
*
|
|
* Stage 2 of the document parser.
|
|
*
|
|
* Called after stage1().
|
|
*
|
|
* Overridden by each implementation.
|
|
*
|
|
* @param doc The document to output to.
|
|
* @return The error code, or SUCCESS if there was no error.
|
|
*/
|
|
simdjson_warn_unused virtual error_code stage2(dom::document &doc) noexcept = 0;
|
|
|
|
/**
|
|
* @private For internal implementation use
|
|
*
|
|
* Stage 2 of the document parser for parser::parse_many.
|
|
*
|
|
* Guaranteed only to be called after stage1().
|
|
* Overridden by each implementation.
|
|
*
|
|
* @param doc The document to output to.
|
|
* @return The error code, SUCCESS if there was no error, or EMPTY if all documents have been parsed.
|
|
*/
|
|
simdjson_warn_unused virtual error_code stage2_next(dom::document &doc) noexcept = 0;
|
|
|
|
/**
|
|
* Unescape a valid UTF-8 string from src to dst, stopping at a final unescaped quote. There
|
|
* must be an unescaped quote terminating the string. It returns the final output
|
|
* position as pointer. In case of error (e.g., the string has bad escaped codes),
|
|
* then null_nullptrptr is returned. It is assumed that the output buffer is large
|
|
* enough. E.g., if src points at 'joe"', then dst needs to have four free bytes +
|
|
* SIMDJSON_PADDING bytes.
|
|
*
|
|
* Overridden by each implementation.
|
|
*
|
|
* @param str pointer to the beginning of a valid UTF-8 JSON string, must end with an unescaped quote.
|
|
* @param dst pointer to a destination buffer, it must point a region in memory of sufficient size.
|
|
* @param allow_replacement whether we allow a replacement character when the UTF-8 contains unmatched surrogate pairs.
|
|
* @return end of the of the written region (exclusive) or nullptr in case of error.
|
|
*/
|
|
simdjson_warn_unused virtual uint8_t *parse_string(const uint8_t *src, uint8_t *dst, bool allow_replacement) const noexcept = 0;
|
|
|
|
/**
|
|
* Unescape a NON-valid UTF-8 string from src to dst, stopping at a final unescaped quote. There
|
|
* must be an unescaped quote terminating the string. It returns the final output
|
|
* position as pointer. In case of error (e.g., the string has bad escaped codes),
|
|
* then null_nullptrptr is returned. It is assumed that the output buffer is large
|
|
* enough. E.g., if src points at 'joe"', then dst needs to have four free bytes +
|
|
* SIMDJSON_PADDING bytes.
|
|
*
|
|
* Overridden by each implementation.
|
|
*
|
|
* @param str pointer to the beginning of a possibly invalid UTF-8 JSON string, must end with an unescaped quote.
|
|
* @param dst pointer to a destination buffer, it must point a region in memory of sufficient size.
|
|
* @return end of the of the written region (exclusive) or nullptr in case of error.
|
|
*/
|
|
simdjson_warn_unused virtual uint8_t *parse_wobbly_string(const uint8_t *src, uint8_t *dst) const noexcept = 0;
|
|
|
|
/**
|
|
* Change the capacity of this parser.
|
|
*
|
|
* The capacity can never exceed SIMDJSON_MAXSIZE_BYTES (e.g., 4 GB)
|
|
* and an CAPACITY error is returned if it is attempted.
|
|
*
|
|
* Generally used for reallocation.
|
|
*
|
|
* @param capacity The new capacity.
|
|
* @param max_depth The new max_depth.
|
|
* @return The error code, or SUCCESS if there was no error.
|
|
*/
|
|
virtual error_code set_capacity(size_t capacity) noexcept = 0;
|
|
|
|
/**
|
|
* Change the max depth of this parser.
|
|
*
|
|
* Generally used for reallocation.
|
|
*
|
|
* @param capacity The new capacity.
|
|
* @param max_depth The new max_depth.
|
|
* @return The error code, or SUCCESS if there was no error.
|
|
*/
|
|
virtual error_code set_max_depth(size_t max_depth) noexcept = 0;
|
|
|
|
/**
|
|
* Deallocate this parser.
|
|
*/
|
|
virtual ~dom_parser_implementation() = default;
|
|
|
|
/** Number of structural indices passed from stage 1 to stage 2 */
|
|
uint32_t n_structural_indexes{0};
|
|
/** Structural indices passed from stage 1 to stage 2 */
|
|
std::unique_ptr<uint32_t[]> structural_indexes{};
|
|
/** Next structural index to parse */
|
|
uint32_t next_structural_index{0};
|
|
|
|
/**
|
|
* The largest document this parser can support without reallocating.
|
|
*
|
|
* @return Current capacity, in bytes.
|
|
*/
|
|
simdjson_inline size_t capacity() const noexcept;
|
|
|
|
/**
|
|
* The maximum level of nested object and arrays supported by this parser.
|
|
*
|
|
* @return Maximum depth, in bytes.
|
|
*/
|
|
simdjson_inline size_t max_depth() const noexcept;
|
|
|
|
/**
|
|
* Ensure this parser has enough memory to process JSON documents up to `capacity` bytes in length
|
|
* and `max_depth` depth.
|
|
*
|
|
* @param capacity The new capacity.
|
|
* @param max_depth The new max_depth. Defaults to DEFAULT_MAX_DEPTH.
|
|
* @return The error, if there is one.
|
|
*/
|
|
simdjson_warn_unused inline error_code allocate(size_t capacity, size_t max_depth) noexcept;
|
|
|
|
|
|
protected:
|
|
/**
|
|
* The maximum document length this parser supports.
|
|
*
|
|
* Buffers are large enough to handle any document up to this length.
|
|
*/
|
|
size_t _capacity{0};
|
|
|
|
/**
|
|
* The maximum depth (number of nested objects and arrays) supported by this parser.
|
|
*
|
|
* Defaults to DEFAULT_MAX_DEPTH.
|
|
*/
|
|
size_t _max_depth{0};
|
|
|
|
// Declaring these so that subclasses can use them to implement their constructors.
|
|
simdjson_inline dom_parser_implementation() noexcept;
|
|
simdjson_inline dom_parser_implementation(dom_parser_implementation &&other) noexcept;
|
|
simdjson_inline dom_parser_implementation &operator=(dom_parser_implementation &&other) noexcept;
|
|
|
|
simdjson_inline dom_parser_implementation(const dom_parser_implementation &) noexcept = delete;
|
|
simdjson_inline dom_parser_implementation &operator=(const dom_parser_implementation &other) noexcept = delete;
|
|
}; // class dom_parser_implementation
|
|
|
|
// Out-of-class definitions of the default constructor, move constructor and
// move assignment operator; all three are compiler-generated.
simdjson_inline dom_parser_implementation::dom_parser_implementation() noexcept = default;
simdjson_inline dom_parser_implementation::dom_parser_implementation(dom_parser_implementation &&other) noexcept = default;
simdjson_inline dom_parser_implementation &dom_parser_implementation::operator=(dom_parser_implementation &&other) noexcept = default;
|
|
|
|
/** @return the value of the _capacity member. */
simdjson_inline size_t dom_parser_implementation::capacity() const noexcept {
  return _capacity;
}
|
|
|
|
/** @return the value of the _max_depth member. */
simdjson_inline size_t dom_parser_implementation::max_depth() const noexcept {
  return _max_depth;
}
|
|
|
|
/**
 * Bring the parser to the requested capacity and max depth.
 *
 * set_max_depth() is called only when max_depth differs from the current
 * value, then set_capacity() is called only when capacity differs from the
 * current value. The first error encountered is returned immediately.
 *
 * @param capacity The new capacity.
 * @param max_depth The new max_depth.
 * @return The first error reported by set_max_depth()/set_capacity(), or SUCCESS.
 */
simdjson_warn_unused
inline error_code dom_parser_implementation::allocate(size_t capacity, size_t max_depth) noexcept {
  // The parameters shadow the capacity()/max_depth() accessors in this scope,
  // so both comparisons go through the data members.
  if (_max_depth != max_depth) {
    const error_code err = set_max_depth(max_depth);
    if (err) { return err; }
  }
  if (_capacity != capacity) {
    const error_code err = set_capacity(capacity);
    if (err) { return err; }
  }
  return SUCCESS;
}
|
|
|
|
} // namespace internal
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_INTERNAL_DOM_PARSER_IMPLEMENTATION_H
|
|
/* end file simdjson/internal/dom_parser_implementation.h */
|
|
|
|
#include <memory>
|
|
|
|
namespace simdjson {
|
|
|
|
/**
|
|
* Validate the UTF-8 string.
|
|
*
|
|
* @param buf the string to validate.
|
|
* @param len the length of the string in bytes.
|
|
* @return true if the string is valid UTF-8.
|
|
*/
|
|
simdjson_warn_unused bool validate_utf8(const char * buf, size_t len) noexcept;
|
|
/**
|
|
* Validate the UTF-8 string.
|
|
*
|
|
* @param sv the string_view to validate.
|
|
* @return true if the string is valid UTF-8.
|
|
*/
|
|
simdjson_inline simdjson_warn_unused bool validate_utf8(const std::string_view sv) noexcept {
|
|
return validate_utf8(sv.data(), sv.size());
|
|
}
|
|
|
|
/**
|
|
* Validate the UTF-8 string.
|
|
*
|
|
* @param p the string to validate.
|
|
* @return true if the string is valid UTF-8.
|
|
*/
|
|
simdjson_inline simdjson_warn_unused bool validate_utf8(const std::string& s) noexcept {
|
|
return validate_utf8(s.data(), s.size());
|
|
}
|
|
|
|
/**
|
|
* An implementation of simdjson for a particular CPU architecture.
|
|
*
|
|
* Also used to maintain the currently active implementation. The active implementation is
|
|
* automatically initialized on first use to the most advanced implementation supported by the host.
|
|
*/
|
|
class implementation {
|
|
public:
|
|
|
|
/**
|
|
* The name of this implementation.
|
|
*
|
|
* const implementation *impl = simdjson::get_active_implementation();
|
|
* cout << "simdjson is optimized for " << impl->name() << "(" << impl->description() << ")" << endl;
|
|
*
|
|
* @return the name of the implementation, e.g. "haswell", "westmere", "arm64".
|
|
*/
|
|
virtual std::string name() const { return std::string(_name); }
|
|
|
|
/**
|
|
* The description of this implementation.
|
|
*
|
|
* const implementation *impl = simdjson::get_active_implementation();
|
|
* cout << "simdjson is optimized for " << impl->name() << "(" << impl->description() << ")" << endl;
|
|
*
|
|
* @return the description of the implementation, e.g. "Intel/AMD AVX2", "Intel/AMD SSE4.2", "ARM NEON".
|
|
*/
|
|
virtual std::string description() const { return std::string(_description); }
|
|
|
|
/**
|
|
* The instruction sets this implementation is compiled against
|
|
* and the current CPU match. This function may poll the current CPU/system
|
|
* and should therefore not be called too often if performance is a concern.
|
|
*
|
|
* @return true if the implementation can be safely used on the current system (determined at runtime).
|
|
*/
|
|
bool supported_by_runtime_system() const;
|
|
|
|
/**
|
|
* @private For internal implementation use
|
|
*
|
|
* The instruction sets this implementation is compiled against.
|
|
*
|
|
* @return a mask of all required `internal::instruction_set::` values.
|
|
*/
|
|
virtual uint32_t required_instruction_sets() const { return _required_instruction_sets; }
|
|
|
|
/**
|
|
* @private For internal implementation use
|
|
*
|
|
* const implementation *impl = simdjson::get_active_implementation();
|
|
* cout << "simdjson is optimized for " << impl->name() << "(" << impl->description() << ")" << endl;
|
|
*
|
|
* @param capacity The largest document that will be passed to the parser.
|
|
* @param max_depth The maximum JSON object/array nesting this parser is expected to handle.
|
|
* @param dst The place to put the resulting parser implementation.
|
|
* @return the error code, or SUCCESS if there was no error.
|
|
*/
|
|
virtual error_code create_dom_parser_implementation(
|
|
size_t capacity,
|
|
size_t max_depth,
|
|
std::unique_ptr<internal::dom_parser_implementation> &dst
|
|
) const noexcept = 0;
|
|
|
|
/**
|
|
* @private For internal implementation use
|
|
*
|
|
* Minify the input string assuming that it represents a JSON string, does not parse or validate.
|
|
*
|
|
* Overridden by each implementation.
|
|
*
|
|
* @param buf the json document to minify.
|
|
* @param len the length of the json document.
|
|
* @param dst the buffer to write the minified document to. *MUST* be allocated up to len + SIMDJSON_PADDING bytes.
|
|
* @param dst_len the number of bytes written. Output only.
|
|
* @return the error code, or SUCCESS if there was no error.
|
|
*/
|
|
simdjson_warn_unused virtual error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept = 0;
|
|
|
|
|
|
/**
|
|
* Validate the UTF-8 string.
|
|
*
|
|
* Overridden by each implementation.
|
|
*
|
|
* @param buf the string to validate.
|
|
* @param len the length of the string in bytes.
|
|
* @return true if and only if the string is valid UTF-8.
|
|
*/
|
|
simdjson_warn_unused virtual bool validate_utf8(const char *buf, size_t len) const noexcept = 0;
|
|
|
|
protected:
|
|
/** @private Construct an implementation with the given name and description. For subclasses. */
|
|
simdjson_inline implementation(
|
|
std::string_view name,
|
|
std::string_view description,
|
|
uint32_t required_instruction_sets
|
|
) :
|
|
_name(name),
|
|
_description(description),
|
|
_required_instruction_sets(required_instruction_sets)
|
|
{
|
|
}
|
|
protected:
|
|
~implementation() = default;
|
|
|
|
private:
|
|
/**
|
|
* The name of this implementation.
|
|
*/
|
|
std::string_view _name;
|
|
|
|
/**
|
|
* The description of this implementation.
|
|
*/
|
|
std::string_view _description;
|
|
|
|
/**
|
|
* Instruction sets required for this implementation.
|
|
*/
|
|
const uint32_t _required_instruction_sets;
|
|
};
|
|
|
|
/** @private */
|
|
namespace internal {
|
|
|
|
/**
|
|
* The list of available implementations compiled into simdjson.
|
|
*/
|
|
class available_implementation_list {
|
|
public:
|
|
/** Get the list of available implementations compiled into simdjson */
|
|
simdjson_inline available_implementation_list() {}
|
|
/** Number of implementations */
|
|
size_t size() const noexcept;
|
|
/** STL const begin() iterator */
|
|
const implementation * const *begin() const noexcept;
|
|
/** STL const end() iterator */
|
|
const implementation * const *end() const noexcept;
|
|
|
|
/**
|
|
* Get the implementation with the given name.
|
|
*
|
|
* Case sensitive.
|
|
*
|
|
* const implementation *impl = simdjson::get_available_implementations()["westmere"];
|
|
* if (!impl) { exit(1); }
|
|
* if (!imp->supported_by_runtime_system()) { exit(1); }
|
|
* simdjson::get_active_implementation() = impl;
|
|
*
|
|
* @param name the implementation to find, e.g. "westmere", "haswell", "arm64"
|
|
* @return the implementation, or nullptr if the parse failed.
|
|
*/
|
|
const implementation * operator[](const std::string_view &name) const noexcept {
|
|
for (const implementation * impl : *this) {
|
|
if (impl->name() == name) { return impl; }
|
|
}
|
|
return nullptr;
|
|
}
|
|
|
|
/**
|
|
* Detect the most advanced implementation supported by the current host.
|
|
*
|
|
* This is used to initialize the implementation on startup.
|
|
*
|
|
* const implementation *impl = simdjson::available_implementation::detect_best_supported();
|
|
* simdjson::get_active_implementation() = impl;
|
|
*
|
|
* @return the most advanced supported implementation for the current host, or an
|
|
* implementation that returns UNSUPPORTED_ARCHITECTURE if there is no supported
|
|
* implementation. Will never return nullptr.
|
|
*/
|
|
const implementation *detect_best_supported() const noexcept;
|
|
};
|
|
|
|
} // namespace internal
|
|
|
|
/**
|
|
* The list of available implementations compiled into simdjson.
|
|
*/
|
|
extern SIMDJSON_DLLIMPORTEXPORT const internal::available_implementation_list& get_available_implementations();
|
|
|
|
/**
|
|
* The active implementation.
|
|
*
|
|
* Automatically initialized on first use to the most advanced implementation supported by this hardware.
|
|
*/
|
|
extern SIMDJSON_DLLIMPORTEXPORT internal::atomic_ptr<const implementation>& get_active_implementation();
|
|
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_IMPLEMENTATION_H
|
|
/* end file simdjson/implementation.h */
|
|
/* skipped duplicate #include "simdjson/implementation_detection.h" */
|
|
/* including simdjson/internal/instruction_set.h: #include "simdjson/internal/instruction_set.h" */
|
|
/* begin file simdjson/internal/instruction_set.h */
|
|
/* From
|
|
https://github.com/endorno/pytorch/blob/master/torch/lib/TH/generic/simd/simd.h
|
|
Highly modified.
|
|
|
|
Copyright (c) 2016- Facebook, Inc (Adam Paszke)
|
|
Copyright (c) 2014- Facebook, Inc (Soumith Chintala)
|
|
Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
|
|
Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)
|
|
Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
|
|
Copyright (c) 2011-2013 NYU (Clement Farabet)
|
|
Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou,
|
|
Iain Melvin, Jason Weston) Copyright (c) 2006 Idiap Research Institute
|
|
(Samy Bengio) Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert,
|
|
Samy Bengio, Johnny Mariethoz)
|
|
|
|
All rights reserved.
|
|
|
|
Redistribution and use in source and binary forms, with or without
|
|
modification, are permitted provided that the following conditions are met:
|
|
|
|
1. Redistributions of source code must retain the above copyright
|
|
notice, this list of conditions and the following disclaimer.
|
|
|
|
2. Redistributions in binary form must reproduce the above copyright
|
|
notice, this list of conditions and the following disclaimer in the
|
|
documentation and/or other materials provided with the distribution.
|
|
|
|
3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories
|
|
America and IDIAP Research Institute nor the names of its contributors may be
|
|
used to endorse or promote products derived from this software without
|
|
specific prior written permission.
|
|
|
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
POSSIBILITY OF SUCH DAMAGE.
|
|
*/
|
|
|
|
#ifndef SIMDJSON_INTERNAL_INSTRUCTION_SET_H
|
|
#define SIMDJSON_INTERNAL_INSTRUCTION_SET_H
|
|
|
|
namespace simdjson {
|
|
namespace internal {
|
|
|
|
/**
 * Bit flags naming the CPU instruction sets simdjson knows about.
 *
 * Every non-DEFAULT enumerator is a distinct single bit, so values can be
 * OR-ed together into a uint32_t mask.
 */
enum instruction_set {
  DEFAULT = 0x0,
  NEON = 0x1,
  AVX2 = 0x4,
  SSE42 = 0x8,
  PCLMULQDQ = 0x10,
  BMI1 = 0x20,
  BMI2 = 0x40,
  ALTIVEC = 0x80,
  AVX512F = 0x100,
  AVX512DQ = 0x200,
  AVX512IFMA = 0x400,
  AVX512PF = 0x800,
  AVX512ER = 0x1000,
  AVX512CD = 0x2000,
  AVX512BW = 0x4000,
  AVX512VL = 0x8000,
  AVX512VBMI2 = 0x10000,
  LSX = 0x20000,
  LASX = 0x40000,
};
|
|
|
|
} // namespace internal
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_INTERNAL_INSTRUCTION_SET_H
|
|
/* end file simdjson/internal/instruction_set.h */
|
|
/* skipped duplicate #include "simdjson/internal/dom_parser_implementation.h" */
|
|
/* skipped duplicate #include "simdjson/internal/jsoncharutils_tables.h" */
|
|
/* skipped duplicate #include "simdjson/internal/numberparsing_tables.h" */
|
|
/* including simdjson/internal/simdprune_tables.h: #include "simdjson/internal/simdprune_tables.h" */
|
|
/* begin file simdjson/internal/simdprune_tables.h */
|
|
#ifndef SIMDJSON_INTERNAL_SIMDPRUNE_TABLES_H
|
|
#define SIMDJSON_INTERNAL_SIMDPRUNE_TABLES_H
|
|
|
|
/* skipped duplicate #include "simdjson/base.h" */
|
|
|
|
#include <cstdint>
|
|
|
|
namespace simdjson { // table modified and copied from
|
|
namespace internal { // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetTable
|
|
|
|
extern SIMDJSON_DLLIMPORTEXPORT const unsigned char BitsSetTable256mul2[256];
|
|
|
|
extern SIMDJSON_DLLIMPORTEXPORT const uint8_t pshufb_combine_table[272];
|
|
|
|
// 256 * 8 bytes = 2kB, easily fits in cache.
|
|
extern SIMDJSON_DLLIMPORTEXPORT const uint64_t thintable_epi8[256];
|
|
|
|
} // namespace internal
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_INTERNAL_SIMDPRUNE_TABLES_H
|
|
/* end file simdjson/internal/simdprune_tables.h */
|
|
|
|
#endif // SIMDJSON_GENERIC_DEPENDENCIES_H
|
|
/* end file simdjson/generic/dependencies.h */
|
|
/* including generic/dependencies.h: #include <generic/dependencies.h> */
|
|
/* begin file generic/dependencies.h */
|
|
#ifdef SIMDJSON_CONDITIONAL_INCLUDE
|
|
#error generic/dependencies.h must be included before defining SIMDJSON_CONDITIONAL_INCLUDE!
|
|
#endif
|
|
|
|
#ifndef SIMDJSON_SRC_GENERIC_DEPENDENCIES_H
|
|
#define SIMDJSON_SRC_GENERIC_DEPENDENCIES_H
|
|
|
|
/* skipped duplicate #include <base.h> */
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_DEPENDENCIES_H
|
|
/* end file generic/dependencies.h */
|
|
/* including generic/stage1/dependencies.h: #include <generic/stage1/dependencies.h> */
|
|
/* begin file generic/stage1/dependencies.h */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_DEPENDENCIES_H
|
|
#define SIMDJSON_SRC_GENERIC_STAGE1_DEPENDENCIES_H
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE1_DEPENDENCIES_H
|
|
/* end file generic/stage1/dependencies.h */
|
|
/* including generic/stage2/dependencies.h: #include <generic/stage2/dependencies.h> */
|
|
/* begin file generic/stage2/dependencies.h */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE2_DEPENDENCIES_H
|
|
#define SIMDJSON_SRC_GENERIC_STAGE2_DEPENDENCIES_H
|
|
|
|
/* including simdjson/dom/document.h: #include <simdjson/dom/document.h> */
|
|
/* begin file simdjson/dom/document.h */
|
|
#ifndef SIMDJSON_DOM_DOCUMENT_H
|
|
#define SIMDJSON_DOM_DOCUMENT_H
|
|
|
|
/* including simdjson/dom/base.h: #include "simdjson/dom/base.h" */
|
|
/* begin file simdjson/dom/base.h */
|
|
#ifndef SIMDJSON_DOM_BASE_H
|
|
#define SIMDJSON_DOM_BASE_H
|
|
|
|
/* skipped duplicate #include "simdjson/base.h" */
|
|
|
|
namespace simdjson {
|
|
|
|
/**
|
|
* @brief A DOM API on top of the simdjson parser.
|
|
*/
|
|
namespace dom {
|
|
|
|
/** The default batch size for parser.parse_many() and parser.load_many() */
static constexpr size_t DEFAULT_BATCH_SIZE = 1000000;
/**
 * Some adversary might try to set the batch size to 0 or 1, which might cause problems.
 * We set a minimum of 32B since anything else is highly likely to be an error. In practice,
 * most users will want a much larger batch size.
 *
 * All non-negative MINIMAL_BATCH_SIZE values should be 'safe' except that, obviously, no JSON
 * document can ever span 0 or 1 byte and that very large values would create memory allocation issues.
 */
static constexpr size_t MINIMAL_BATCH_SIZE = 32;

/**
 * It is wasteful to allocate memory for tiny documents (e.g., 4 bytes).
 */
static constexpr size_t MINIMAL_DOCUMENT_CAPACITY = 32;
|
|
|
|
class array;
|
|
class document;
|
|
class document_stream;
|
|
class element;
|
|
class key_value_pair;
|
|
class object;
|
|
class parser;
|
|
|
|
#ifdef SIMDJSON_THREADS_ENABLED
|
|
struct stage1_worker;
|
|
#endif // SIMDJSON_THREADS_ENABLED
|
|
|
|
} // namespace dom
|
|
|
|
namespace internal {
|
|
|
|
template<typename T>
|
|
class string_builder;
|
|
class tape_ref;
|
|
|
|
} // namespace internal
|
|
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_DOM_BASE_H
|
|
/* end file simdjson/dom/base.h */
|
|
|
|
#include <memory>
|
|
|
|
namespace simdjson {
|
|
namespace dom {
|
|
|
|
/**
|
|
* A parsed JSON document.
|
|
*
|
|
* This class cannot be copied, only moved, to avoid unintended allocations.
|
|
*/
|
|
class document {
|
|
public:
|
|
/**
|
|
* Create a document container with zero capacity.
|
|
*
|
|
* The parser will allocate capacity as needed.
|
|
*/
|
|
document() noexcept = default;
|
|
~document() noexcept = default;
|
|
|
|
/**
|
|
* Take another document's buffers.
|
|
*
|
|
* @param other The document to take. Its capacity is zeroed and it is invalidated.
|
|
*/
|
|
document(document &&other) noexcept = default;
|
|
/** @private */
|
|
document(const document &) = delete; // Disallow copying
|
|
/**
|
|
* Take another document's buffers.
|
|
*
|
|
* @param other The document to take. Its capacity is zeroed.
|
|
*/
|
|
document &operator=(document &&other) noexcept = default;
|
|
/** @private */
|
|
document &operator=(const document &) = delete; // Disallow copying
|
|
|
|
/**
|
|
* Get the root element of this document as a JSON array.
|
|
*/
|
|
element root() const noexcept;
|
|
|
|
/**
|
|
* @private Dump the raw tape for debugging.
|
|
*
|
|
* @param os the stream to output to.
|
|
* @return false if the tape is likely wrong (e.g., you did not parse a valid JSON).
|
|
*/
|
|
bool dump_raw_tape(std::ostream &os) const noexcept;
|
|
|
|
/** @private Structural values. */
|
|
std::unique_ptr<uint64_t[]> tape{};
|
|
|
|
/** @private String values.
|
|
*
|
|
* Should be at least byte_capacity.
|
|
*/
|
|
std::unique_ptr<uint8_t[]> string_buf{};
|
|
/** @private Allocate memory to support
|
|
* input JSON documents of up to len bytes.
|
|
*
|
|
* When calling this function, you lose
|
|
* all the data.
|
|
*
|
|
* The memory allocation is strict: you
|
|
* can you use this function to increase
|
|
* or lower the amount of allocated memory.
|
|
* Passsing zero clears the memory.
|
|
*/
|
|
error_code allocate(size_t len) noexcept;
|
|
/** @private Capacity in bytes, in terms
|
|
* of how many bytes of input JSON we can
|
|
* support.
|
|
*/
|
|
size_t capacity() const noexcept;
|
|
|
|
|
|
private:
|
|
size_t allocated_capacity{0};
|
|
friend class parser;
|
|
}; // class document
|
|
|
|
} // namespace dom
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_DOM_DOCUMENT_H
|
|
/* end file simdjson/dom/document.h */
|
|
/* including simdjson/internal/tape_type.h: #include <simdjson/internal/tape_type.h> */
|
|
/* begin file simdjson/internal/tape_type.h */
|
|
#ifndef SIMDJSON_INTERNAL_TAPE_TYPE_H
|
|
#define SIMDJSON_INTERNAL_TAPE_TYPE_H
|
|
|
|
namespace simdjson {
|
|
namespace internal {
|
|
|
|
/**
 * The possible types in the tape.
 *
 * Each enumerator's value is a printable character code.
 */
enum class tape_type {
  ROOT = 'r',
  START_ARRAY = '[',
  START_OBJECT = '{',
  END_ARRAY = ']',
  END_OBJECT = '}',
  STRING = '"',
  INT64 = 'l',
  UINT64 = 'u',
  DOUBLE = 'd',
  TRUE_VALUE = 't',
  FALSE_VALUE = 'f',
  NULL_VALUE = 'n'
}; // enum class tape_type
|
|
|
|
} // namespace internal
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_INTERNAL_TAPE_TYPE_H
|
|
/* end file simdjson/internal/tape_type.h */
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE2_DEPENDENCIES_H
|
|
/* end file generic/stage2/dependencies.h */
|
|
|
|
/* including implementation.cpp: #include <implementation.cpp> */
|
|
/* begin file implementation.cpp */
|
|
#ifndef SIMDJSON_SRC_IMPLEMENTATION_CPP
|
|
#define SIMDJSON_SRC_IMPLEMENTATION_CPP
|
|
|
|
/* skipped duplicate #include <base.h> */
|
|
/* skipped duplicate #include <simdjson/generic/dependencies.h> */
|
|
/* skipped duplicate #include <simdjson/implementation.h> */
|
|
/* including internal/isadetection.h: #include <internal/isadetection.h> */
|
|
/* begin file internal/isadetection.h */
|
|
/* From
|
|
https://github.com/endorno/pytorch/blob/master/torch/lib/TH/generic/simd/simd.h
|
|
Highly modified.
|
|
|
|
Copyright (c) 2016- Facebook, Inc (Adam Paszke)
|
|
Copyright (c) 2014- Facebook, Inc (Soumith Chintala)
|
|
Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
|
|
Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)
|
|
Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
|
|
Copyright (c) 2011-2013 NYU (Clement Farabet)
|
|
Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou,
|
|
Iain Melvin, Jason Weston) Copyright (c) 2006 Idiap Research Institute
|
|
(Samy Bengio) Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert,
|
|
Samy Bengio, Johnny Mariethoz)
|
|
|
|
All rights reserved.
|
|
|
|
Redistribution and use in source and binary forms, with or without
|
|
modification, are permitted provided that the following conditions are met:
|
|
|
|
1. Redistributions of source code must retain the above copyright
|
|
notice, this list of conditions and the following disclaimer.
|
|
|
|
2. Redistributions in binary form must reproduce the above copyright
|
|
notice, this list of conditions and the following disclaimer in the
|
|
documentation and/or other materials provided with the distribution.
|
|
|
|
3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories
|
|
America and IDIAP Research Institute nor the names of its contributors may be
|
|
used to endorse or promote products derived from this software without
|
|
specific prior written permission.
|
|
|
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
POSSIBILITY OF SUCH DAMAGE.
|
|
*/
|
|
|
|
#ifndef SIMDJSON_INTERNAL_ISADETECTION_H
|
|
#define SIMDJSON_INTERNAL_ISADETECTION_H
|
|
|
|
/* skipped duplicate #include "simdjson/internal/instruction_set.h" */
|
|
|
|
#include <cstdint>
|
|
#include <cstdlib>
|
|
#if defined(_MSC_VER)
|
|
#include <intrin.h>
|
|
#elif defined(HAVE_GCC_GET_CPUID) && defined(USE_GCC_GET_CPUID)
|
|
#include <cpuid.h>
|
|
#endif
|
|
|
|
namespace simdjson {
|
|
namespace internal {
|
|
|
|
#if defined(__PPC64__)
|
|
|
|
static inline uint32_t detect_supported_architectures() {
|
|
return instruction_set::ALTIVEC;
|
|
}
|
|
|
|
#elif defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
|
|
|
|
static inline uint32_t detect_supported_architectures() {
|
|
return instruction_set::NEON;
|
|
}
|
|
|
|
#elif defined(__x86_64__) || defined(_M_AMD64) // x64
|
|
|
|
|
|
namespace {
// Can be found on Intel ISA Reference for CPUID.
// All masks are built from an unsigned 1 so that every shift (including
// bit 31) is performed on an unsigned operand.
constexpr uint32_t cpuid_avx2_bit = uint32_t(1) << 5;         ///< @private Bit 5 of EBX for EAX=0x7
constexpr uint32_t cpuid_bmi1_bit = uint32_t(1) << 3;         ///< @private bit 3 of EBX for EAX=0x7
constexpr uint32_t cpuid_bmi2_bit = uint32_t(1) << 8;         ///< @private bit 8 of EBX for EAX=0x7
constexpr uint32_t cpuid_avx512f_bit = uint32_t(1) << 16;     ///< @private bit 16 of EBX for EAX=0x7
constexpr uint32_t cpuid_avx512dq_bit = uint32_t(1) << 17;    ///< @private bit 17 of EBX for EAX=0x7
constexpr uint32_t cpuid_avx512ifma_bit = uint32_t(1) << 21;  ///< @private bit 21 of EBX for EAX=0x7
constexpr uint32_t cpuid_avx512pf_bit = uint32_t(1) << 26;    ///< @private bit 26 of EBX for EAX=0x7
constexpr uint32_t cpuid_avx512er_bit = uint32_t(1) << 27;    ///< @private bit 27 of EBX for EAX=0x7
constexpr uint32_t cpuid_avx512cd_bit = uint32_t(1) << 28;    ///< @private bit 28 of EBX for EAX=0x7
constexpr uint32_t cpuid_avx512bw_bit = uint32_t(1) << 30;    ///< @private bit 30 of EBX for EAX=0x7
constexpr uint32_t cpuid_avx512vl_bit = uint32_t(1) << 31;    ///< @private bit 31 of EBX for EAX=0x7
constexpr uint32_t cpuid_avx512vbmi2_bit = uint32_t(1) << 6;  ///< @private bit 6 of ECX for EAX=0x7
constexpr uint64_t cpuid_avx256_saved = uint64_t(1) << 2;     ///< @private bit 2 = AVX
constexpr uint64_t cpuid_avx512_saved = uint64_t(7) << 5;     ///< @private bits 5,6,7 = opmask, ZMM_hi256, hi16_ZMM
constexpr uint32_t cpuid_sse42_bit = uint32_t(1) << 20;       ///< @private bit 20 of ECX for EAX=0x1
constexpr uint32_t cpuid_osxsave = (uint32_t(1) << 26) | (uint32_t(1) << 27); ///< @private bits 26+27 of ECX for EAX=0x1
constexpr uint32_t cpuid_pclmulqdq_bit = uint32_t(1) << 1;    ///< @private bit 1 of ECX for EAX=0x1
}
|
|
|
|
|
|
|
|
/**
 * @private Executes the CPUID instruction.
 *
 * @param eax in: leaf to query; out: EAX result.
 * @param ebx out: EBX result.
 * @param ecx in: sub-leaf to query; out: ECX result.
 * @param edx out: EDX result.
 *
 * On the <cpuid.h> path, a leaf that __get_cpuid() refuses to query yields
 * all-zero outputs, so callers see "no feature bits" rather than whatever
 * the output variables held before the call.
 */
static inline void cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx,
                         uint32_t *edx) {
#if defined(_MSC_VER)
  int cpu_info[4];
  __cpuidex(cpu_info, *eax, *ecx);
  *eax = static_cast<uint32_t>(cpu_info[0]);
  *ebx = static_cast<uint32_t>(cpu_info[1]);
  *ecx = static_cast<uint32_t>(cpu_info[2]);
  *edx = static_cast<uint32_t>(cpu_info[3]);
#elif defined(HAVE_GCC_GET_CPUID) && defined(USE_GCC_GET_CPUID)
  uint32_t level = *eax;
  // NOTE(review): __get_cpuid() takes no sub-leaf argument, so *ecx is not
  // forwarded on this path; __get_cpuid_count() would forward it -- confirm
  // against the toolchain's <cpuid.h> before relying on non-zero sub-leaves.
  if (__get_cpuid(level, eax, ebx, ecx, edx) == 0) {
    // Unsupported leaf: the outputs were left untouched, so clear them
    // instead of letting stale values be decoded as feature bits.
    *eax = 0;
    *ebx = 0;
    *ecx = 0;
    *edx = 0;
  }
#else
  uint32_t a = *eax, b, c = *ecx, d;
  asm volatile("cpuid\n\t" : "+a"(a), "=b"(b), "+c"(c), "=d"(d));
  *eax = a;
  *ebx = b;
  *ecx = c;
  *edx = d;
#endif
}
|
|
|
|
|
|
/**
 * @private Reads extended control register 0 with XGETBV and returns it as a
 * single 64-bit value.
 */
static inline uint64_t xgetbv() {
#if defined(_MSC_VER)
  return _xgetbv(0);
#else
  uint32_t low_half, high_half;
  // ECX=0 selects the register; the result comes back split across EDX:EAX.
  asm volatile("xgetbv\n\t" : "=a" (low_half), "=d" (high_half) : "c" (0));
  return (uint64_t(high_half) << 32) | low_half;
#endif
}
|
|
|
|
static inline uint32_t detect_supported_architectures() {
|
|
uint32_t eax, ebx, ecx, edx;
|
|
uint32_t host_isa = 0x0;
|
|
|
|
// EBX for EAX=0x1
|
|
eax = 0x1;
|
|
ecx = 0x0;
|
|
cpuid(&eax, &ebx, &ecx, &edx);
|
|
|
|
if (ecx & cpuid_sse42_bit) {
|
|
host_isa |= instruction_set::SSE42;
|
|
} else {
|
|
return host_isa; // everything after is redundant
|
|
}
|
|
|
|
if (ecx & cpuid_pclmulqdq_bit) {
|
|
host_isa |= instruction_set::PCLMULQDQ;
|
|
}
|
|
|
|
|
|
if ((ecx & cpuid_osxsave) != cpuid_osxsave) {
|
|
return host_isa;
|
|
}
|
|
|
|
// xgetbv for checking if the OS saves registers
|
|
uint64_t xcr0 = xgetbv();
|
|
|
|
if ((xcr0 & cpuid_avx256_saved) == 0) {
|
|
return host_isa;
|
|
}
|
|
|
|
// ECX for EAX=0x7
|
|
eax = 0x7;
|
|
ecx = 0x0;
|
|
cpuid(&eax, &ebx, &ecx, &edx);
|
|
if (ebx & cpuid_avx2_bit) {
|
|
host_isa |= instruction_set::AVX2;
|
|
}
|
|
if (ebx & cpuid_bmi1_bit) {
|
|
host_isa |= instruction_set::BMI1;
|
|
}
|
|
|
|
if (ebx & cpuid_bmi2_bit) {
|
|
host_isa |= instruction_set::BMI2;
|
|
}
|
|
|
|
if (!((xcr0 & cpuid_avx512_saved) == cpuid_avx512_saved)) {
|
|
return host_isa;
|
|
}
|
|
|
|
if (ebx & cpuid_avx512f_bit) {
|
|
host_isa |= instruction_set::AVX512F;
|
|
}
|
|
|
|
if (ebx & cpuid_avx512dq_bit) {
|
|
host_isa |= instruction_set::AVX512DQ;
|
|
}
|
|
|
|
if (ebx & cpuid_avx512ifma_bit) {
|
|
host_isa |= instruction_set::AVX512IFMA;
|
|
}
|
|
|
|
if (ebx & cpuid_avx512pf_bit) {
|
|
host_isa |= instruction_set::AVX512PF;
|
|
}
|
|
|
|
if (ebx & cpuid_avx512er_bit) {
|
|
host_isa |= instruction_set::AVX512ER;
|
|
}
|
|
|
|
if (ebx & cpuid_avx512cd_bit) {
|
|
host_isa |= instruction_set::AVX512CD;
|
|
}
|
|
|
|
if (ebx & cpuid_avx512bw_bit) {
|
|
host_isa |= instruction_set::AVX512BW;
|
|
}
|
|
|
|
if (ebx & cpuid_avx512vl_bit) {
|
|
host_isa |= instruction_set::AVX512VL;
|
|
}
|
|
|
|
if (ecx & cpuid_avx512vbmi2_bit) {
|
|
host_isa |= instruction_set::AVX512VBMI2;
|
|
}
|
|
|
|
return host_isa;
|
|
}
|
|
|
|
#elif defined(__loongarch_sx) && !defined(__loongarch_asx)
|
|
|
|
static inline uint32_t detect_supported_architectures() {
|
|
return instruction_set::LSX;
|
|
}
|
|
|
|
#elif defined(__loongarch_asx)
|
|
|
|
static inline uint32_t detect_supported_architectures() {
|
|
return instruction_set::LASX;
|
|
}
|
|
|
|
#else // fallback
|
|
|
|
|
|
static inline uint32_t detect_supported_architectures() {
|
|
return instruction_set::DEFAULT;
|
|
}
|
|
|
|
|
|
#endif // end SIMD extension detection code
|
|
|
|
} // namespace internal
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_INTERNAL_ISADETECTION_H
|
|
/* end file internal/isadetection.h */
|
|
|
|
#include <initializer_list>
|
|
|
|
namespace simdjson {
|
|
|
|
bool implementation::supported_by_runtime_system() const {
|
|
uint32_t required_instruction_sets = this->required_instruction_sets();
|
|
uint32_t supported_instruction_sets = internal::detect_supported_architectures();
|
|
return ((supported_instruction_sets & required_instruction_sets) == required_instruction_sets);
|
|
}
|
|
|
|
} // namespace simdjson
|
|
|
|
/* defining SIMDJSON_CONDITIONAL_INCLUDE */
|
|
#define SIMDJSON_CONDITIONAL_INCLUDE
|
|
|
|
#if SIMDJSON_IMPLEMENTATION_ARM64
|
|
/* including simdjson/arm64/implementation.h: #include <simdjson/arm64/implementation.h> */
|
|
/* begin file simdjson/arm64/implementation.h */
|
|
#ifndef SIMDJSON_ARM64_IMPLEMENTATION_H
|
|
#define SIMDJSON_ARM64_IMPLEMENTATION_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/implementation.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/instruction_set.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace arm64 {
|
|
|
|
/**
|
|
* @private
|
|
*/
|
|
class implementation final : public simdjson::implementation {
|
|
public:
|
|
simdjson_inline implementation() : simdjson::implementation("arm64", "ARM NEON", internal::instruction_set::NEON) {}
|
|
simdjson_warn_unused error_code create_dom_parser_implementation(
|
|
size_t capacity,
|
|
size_t max_length,
|
|
std::unique_ptr<internal::dom_parser_implementation>& dst
|
|
) const noexcept final;
|
|
simdjson_warn_unused error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final;
|
|
simdjson_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final;
|
|
};
|
|
|
|
} // namespace arm64
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_ARM64_IMPLEMENTATION_H
|
|
/* end file simdjson/arm64/implementation.h */
|
|
namespace simdjson {
|
|
namespace internal {
|
|
// Returns the address of a function-local static arm64::implementation,
// which is constructed the first time this function is called.
static const arm64::implementation* get_arm64_singleton() {
  static const arm64::implementation instance{};
  return &instance;
}
|
|
} // namespace internal
|
|
} // namespace simdjson
|
|
#endif // SIMDJSON_IMPLEMENTATION_ARM64
|
|
|
|
#if SIMDJSON_IMPLEMENTATION_FALLBACK
|
|
/* including simdjson/fallback/implementation.h: #include <simdjson/fallback/implementation.h> */
|
|
/* begin file simdjson/fallback/implementation.h */
|
|
#ifndef SIMDJSON_FALLBACK_IMPLEMENTATION_H
|
|
#define SIMDJSON_FALLBACK_IMPLEMENTATION_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/fallback/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/implementation.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace fallback {
|
|
|
|
/**
|
|
* @private
|
|
*/
|
|
class implementation final : public simdjson::implementation {
|
|
public:
|
|
simdjson_inline implementation() : simdjson::implementation(
|
|
"fallback",
|
|
"Generic fallback implementation",
|
|
0
|
|
) {}
|
|
simdjson_warn_unused error_code create_dom_parser_implementation(
|
|
size_t capacity,
|
|
size_t max_length,
|
|
std::unique_ptr<simdjson::internal::dom_parser_implementation>& dst
|
|
) const noexcept final;
|
|
simdjson_warn_unused error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final;
|
|
simdjson_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final;
|
|
};
|
|
|
|
} // namespace fallback
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_FALLBACK_IMPLEMENTATION_H
|
|
/* end file simdjson/fallback/implementation.h */
|
|
namespace simdjson {
|
|
namespace internal {
|
|
// Returns the address of a function-local static fallback::implementation,
// which is constructed the first time this function is called.
static const fallback::implementation* get_fallback_singleton() {
  static const fallback::implementation instance{};
  return &instance;
}
|
|
} // namespace internal
|
|
} // namespace simdjson
|
|
#endif // SIMDJSON_IMPLEMENTATION_FALLBACK
|
|
|
|
|
|
#if SIMDJSON_IMPLEMENTATION_HASWELL
|
|
/* including simdjson/haswell/implementation.h: #include <simdjson/haswell/implementation.h> */
|
|
/* begin file simdjson/haswell/implementation.h */
|
|
#ifndef SIMDJSON_HASWELL_IMPLEMENTATION_H
|
|
#define SIMDJSON_HASWELL_IMPLEMENTATION_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/haswell/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/implementation.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/instruction_set.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
// The constructor may be executed on any host, so we take care not to use SIMDJSON_TARGET_HASWELL
|
|
namespace simdjson {
|
|
namespace haswell {
|
|
|
|
/**
|
|
* @private
|
|
*/
|
|
class implementation final : public simdjson::implementation {
|
|
public:
|
|
simdjson_inline implementation() : simdjson::implementation(
|
|
"haswell",
|
|
"Intel/AMD AVX2",
|
|
internal::instruction_set::AVX2 | internal::instruction_set::PCLMULQDQ | internal::instruction_set::BMI1 | internal::instruction_set::BMI2
|
|
) {}
|
|
simdjson_warn_unused error_code create_dom_parser_implementation(
|
|
size_t capacity,
|
|
size_t max_length,
|
|
std::unique_ptr<internal::dom_parser_implementation>& dst
|
|
) const noexcept final;
|
|
simdjson_warn_unused error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final;
|
|
simdjson_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final;
|
|
};
|
|
|
|
} // namespace haswell
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_HASWELL_IMPLEMENTATION_H
|
|
/* end file simdjson/haswell/implementation.h */
|
|
namespace simdjson {
|
|
namespace internal {
|
|
// Returns the address of a function-local static haswell::implementation,
// which is constructed the first time this function is called.
static const haswell::implementation* get_haswell_singleton() {
  static const haswell::implementation instance{};
  return &instance;
}
|
|
} // namespace internal
|
|
} // namespace simdjson
|
|
#endif
|
|
|
|
#if SIMDJSON_IMPLEMENTATION_ICELAKE
|
|
/* including simdjson/icelake/implementation.h: #include <simdjson/icelake/implementation.h> */
|
|
/* begin file simdjson/icelake/implementation.h */
|
|
#ifndef SIMDJSON_ICELAKE_IMPLEMENTATION_H
|
|
#define SIMDJSON_ICELAKE_IMPLEMENTATION_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/icelake/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/implementation.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/instruction_set.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
// The constructor may be executed on any host, so we take care not to use SIMDJSON_TARGET_ICELAKE
|
|
namespace simdjson {
|
|
namespace icelake {
|
|
|
|
/**
|
|
* @private
|
|
*/
|
|
class implementation final : public simdjson::implementation {
|
|
public:
|
|
simdjson_inline implementation() : simdjson::implementation(
|
|
"icelake",
|
|
"Intel/AMD AVX512",
|
|
internal::instruction_set::AVX2 | internal::instruction_set::PCLMULQDQ | internal::instruction_set::BMI1 | internal::instruction_set::BMI2 | internal::instruction_set::AVX512F | internal::instruction_set::AVX512DQ | internal::instruction_set::AVX512CD | internal::instruction_set::AVX512BW | internal::instruction_set::AVX512VL | internal::instruction_set::AVX512VBMI2
|
|
) {}
|
|
simdjson_warn_unused error_code create_dom_parser_implementation(
|
|
size_t capacity,
|
|
size_t max_length,
|
|
std::unique_ptr<internal::dom_parser_implementation>& dst
|
|
) const noexcept final;
|
|
simdjson_warn_unused error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final;
|
|
simdjson_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final;
|
|
};
|
|
|
|
} // namespace icelake
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_ICELAKE_IMPLEMENTATION_H
|
|
/* end file simdjson/icelake/implementation.h */
|
|
namespace simdjson {
|
|
namespace internal {
|
|
// Returns the address of a function-local static icelake::implementation,
// which is constructed the first time this function is called.
static const icelake::implementation* get_icelake_singleton() {
  static const icelake::implementation instance{};
  return &instance;
}
|
|
} // namespace internal
|
|
} // namespace simdjson
|
|
#endif
|
|
|
|
#if SIMDJSON_IMPLEMENTATION_PPC64
|
|
/* including simdjson/ppc64/implementation.h: #include <simdjson/ppc64/implementation.h> */
|
|
/* begin file simdjson/ppc64/implementation.h */
|
|
#ifndef SIMDJSON_PPC64_IMPLEMENTATION_H
|
|
#define SIMDJSON_PPC64_IMPLEMENTATION_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/ppc64/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/implementation.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/instruction_set.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
|
|
/**
|
|
* Implementation for ALTIVEC (PPC64).
|
|
*/
|
|
namespace ppc64 {
|
|
|
|
/**
|
|
* @private
|
|
*/
|
|
class implementation final : public simdjson::implementation {
|
|
public:
|
|
simdjson_inline implementation()
|
|
: simdjson::implementation("ppc64", "PPC64 ALTIVEC",
|
|
internal::instruction_set::ALTIVEC) {}
|
|
|
|
simdjson_warn_unused error_code create_dom_parser_implementation(
|
|
size_t capacity, size_t max_length,
|
|
std::unique_ptr<internal::dom_parser_implementation> &dst)
|
|
const noexcept final;
|
|
simdjson_warn_unused error_code minify(const uint8_t *buf, size_t len,
|
|
uint8_t *dst,
|
|
size_t &dst_len) const noexcept final;
|
|
simdjson_warn_unused bool validate_utf8(const char *buf,
|
|
size_t len) const noexcept final;
|
|
};
|
|
|
|
} // namespace ppc64
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_PPC64_IMPLEMENTATION_H
|
|
/* end file simdjson/ppc64/implementation.h */
|
|
namespace simdjson {
|
|
namespace internal {
|
|
// Returns the address of a function-local static ppc64::implementation,
// which is constructed the first time this function is called.
static const ppc64::implementation* get_ppc64_singleton() {
  static const ppc64::implementation instance{};
  return &instance;
}
|
|
} // namespace internal
|
|
} // namespace simdjson
|
|
#endif // SIMDJSON_IMPLEMENTATION_PPC64
|
|
|
|
#if SIMDJSON_IMPLEMENTATION_WESTMERE
|
|
/* including simdjson/westmere/implementation.h: #include <simdjson/westmere/implementation.h> */
|
|
/* begin file simdjson/westmere/implementation.h */
|
|
#ifndef SIMDJSON_WESTMERE_IMPLEMENTATION_H
|
|
#define SIMDJSON_WESTMERE_IMPLEMENTATION_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/westmere/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/implementation.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/instruction_set.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
// The constructor may be executed on any host, so we take care not to use SIMDJSON_TARGET_WESTMERE
|
|
namespace simdjson {
|
|
namespace westmere {
|
|
|
|
/**
|
|
* @private
|
|
*/
|
|
class implementation final : public simdjson::implementation {
|
|
public:
|
|
simdjson_inline implementation() : simdjson::implementation("westmere", "Intel/AMD SSE4.2", internal::instruction_set::SSE42 | internal::instruction_set::PCLMULQDQ) {}
|
|
simdjson_warn_unused error_code create_dom_parser_implementation(
|
|
size_t capacity,
|
|
size_t max_length,
|
|
std::unique_ptr<internal::dom_parser_implementation>& dst
|
|
) const noexcept final;
|
|
simdjson_warn_unused error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final;
|
|
simdjson_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final;
|
|
};
|
|
|
|
} // namespace westmere
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_WESTMERE_IMPLEMENTATION_H
|
|
/* end file simdjson/westmere/implementation.h */
|
|
namespace simdjson {
|
|
namespace internal {
|
|
// Returns the address of a function-local static westmere::implementation,
// which is constructed the first time this function is called.
static const simdjson::westmere::implementation* get_westmere_singleton() {
  static const simdjson::westmere::implementation instance{};
  return &instance;
}
|
|
} // namespace internal
|
|
} // namespace simdjson
|
|
#endif // SIMDJSON_IMPLEMENTATION_WESTMERE
|
|
|
|
#if SIMDJSON_IMPLEMENTATION_LSX
|
|
/* including simdjson/lsx/implementation.h: #include <simdjson/lsx/implementation.h> */
|
|
/* begin file simdjson/lsx/implementation.h */
|
|
#ifndef SIMDJSON_LSX_IMPLEMENTATION_H
|
|
#define SIMDJSON_LSX_IMPLEMENTATION_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/implementation.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/instruction_set.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace lsx {
|
|
|
|
/**
|
|
* @private
|
|
*/
|
|
class implementation final : public simdjson::implementation {
|
|
public:
|
|
simdjson_inline implementation() : simdjson::implementation("lsx", "LoongArch SX", internal::instruction_set::LSX) {}
|
|
simdjson_warn_unused error_code create_dom_parser_implementation(
|
|
size_t capacity,
|
|
size_t max_length,
|
|
std::unique_ptr<internal::dom_parser_implementation>& dst
|
|
) const noexcept final;
|
|
simdjson_warn_unused error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final;
|
|
simdjson_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final;
|
|
};
|
|
|
|
} // namespace lsx
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_LSX_IMPLEMENTATION_H
|
|
/* end file simdjson/lsx/implementation.h */
|
|
namespace simdjson {
|
|
namespace internal {
|
|
// Returns the address of a function-local static lsx::implementation, which
// is constructed the first time this function is called.
static const simdjson::lsx::implementation* get_lsx_singleton() {
  static const simdjson::lsx::implementation instance{};
  return &instance;
}
|
|
} // namespace internal
|
|
} // namespace simdjson
|
|
#endif // SIMDJSON_IMPLEMENTATION_LSX
|
|
|
|
#if SIMDJSON_IMPLEMENTATION_LASX
|
|
/* including simdjson/lasx/implementation.h: #include <simdjson/lasx/implementation.h> */
|
|
/* begin file simdjson/lasx/implementation.h */
|
|
#ifndef SIMDJSON_LASX_IMPLEMENTATION_H
|
|
#define SIMDJSON_LASX_IMPLEMENTATION_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/implementation.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/instruction_set.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace lasx {
|
|
|
|
/**
|
|
* @private
|
|
*/
|
|
class implementation final : public simdjson::implementation {
|
|
public:
|
|
simdjson_inline implementation() : simdjson::implementation("lasx", "LoongArch ASX", internal::instruction_set::LASX) {}
|
|
simdjson_warn_unused error_code create_dom_parser_implementation(
|
|
size_t capacity,
|
|
size_t max_length,
|
|
std::unique_ptr<internal::dom_parser_implementation>& dst
|
|
) const noexcept final;
|
|
simdjson_warn_unused error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final;
|
|
simdjson_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final;
|
|
};
|
|
|
|
} // namespace lasx
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_LASX_IMPLEMENTATION_H
|
|
/* end file simdjson/lasx/implementation.h */
|
|
namespace simdjson {
|
|
namespace internal {
|
|
// Returns the address of a function-local static lasx::implementation, which
// is constructed the first time this function is called.
static const simdjson::lasx::implementation* get_lasx_singleton() {
  static const simdjson::lasx::implementation instance{};
  return &instance;
}
|
|
} // namespace internal
|
|
} // namespace simdjson
|
|
#endif // SIMDJSON_IMPLEMENTATION_LASX
|
|
|
|
/* undefining SIMDJSON_CONDITIONAL_INCLUDE */
|
|
#undef SIMDJSON_CONDITIONAL_INCLUDE
|
|
|
|
namespace simdjson {
|
|
namespace internal {
|
|
|
|
// Compile-time check: evaluates to true when exactly one of the
// SIMDJSON_IMPLEMENTATION_* switches is enabled. In that case there is nothing
// to choose between, so we should not pay a price for dispatching to the best
// implementation; we just use the one we have.
#define SIMDJSON_SINGLE_IMPLEMENTATION ( \
  SIMDJSON_IMPLEMENTATION_ICELAKE + SIMDJSON_IMPLEMENTATION_HASWELL \
  + SIMDJSON_IMPLEMENTATION_WESTMERE + SIMDJSON_IMPLEMENTATION_ARM64 \
  + SIMDJSON_IMPLEMENTATION_PPC64 + SIMDJSON_IMPLEMENTATION_LSX \
  + SIMDJSON_IMPLEMENTATION_LASX + SIMDJSON_IMPLEMENTATION_FALLBACK == 1)
|
|
|
|
#if SIMDJSON_SINGLE_IMPLEMENTATION
|
|
// Returns the singleton of the one implementation that is compiled in. This
// function only exists when SIMDJSON_SINGLE_IMPLEMENTATION holds, i.e. when
// exactly one of the branches below is active.
static const implementation* get_single_implementation() {
#if SIMDJSON_IMPLEMENTATION_ICELAKE
  return get_icelake_singleton();
#endif
#if SIMDJSON_IMPLEMENTATION_HASWELL
  return get_haswell_singleton();
#endif
#if SIMDJSON_IMPLEMENTATION_WESTMERE
  return get_westmere_singleton();
#endif
#if SIMDJSON_IMPLEMENTATION_ARM64
  return get_arm64_singleton();
#endif
#if SIMDJSON_IMPLEMENTATION_PPC64
  return get_ppc64_singleton();
#endif
#if SIMDJSON_IMPLEMENTATION_LSX
  return get_lsx_singleton();
#endif
#if SIMDJSON_IMPLEMENTATION_LASX
  return get_lasx_singleton();
#endif
#if SIMDJSON_IMPLEMENTATION_FALLBACK
  return get_fallback_singleton();
#endif
}
|
|
#endif
|
|
|
|
// Static array of known implementations. We're hoping these get baked into the executable
|
|
// without requiring a static initializer.
|
|
|
|
/**
|
|
* @private Detects best supported implementation on first use, and sets it
|
|
*/
|
|
class detect_best_supported_implementation_on_first_use final : public implementation {
|
|
public:
|
|
std::string name() const noexcept final { return set_best()->name(); }
|
|
std::string description() const noexcept final { return set_best()->description(); }
|
|
uint32_t required_instruction_sets() const noexcept final { return set_best()->required_instruction_sets(); }
|
|
simdjson_warn_unused error_code create_dom_parser_implementation(
|
|
size_t capacity,
|
|
size_t max_length,
|
|
std::unique_ptr<internal::dom_parser_implementation>& dst
|
|
) const noexcept final {
|
|
return set_best()->create_dom_parser_implementation(capacity, max_length, dst);
|
|
}
|
|
simdjson_warn_unused error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final {
|
|
return set_best()->minify(buf, len, dst, dst_len);
|
|
}
|
|
simdjson_warn_unused bool validate_utf8(const char * buf, size_t len) const noexcept final override {
|
|
return set_best()->validate_utf8(buf, len);
|
|
}
|
|
simdjson_inline detect_best_supported_implementation_on_first_use() noexcept : implementation("best_supported_detector", "Detects the best supported implementation and sets it", 0) {}
|
|
private:
|
|
const implementation *set_best() const noexcept;
|
|
};
|
|
|
|
static_assert(std::is_trivially_destructible<detect_best_supported_implementation_on_first_use>::value, "detect_best_supported_implementation_on_first_use should be trivially destructible");
|
|
|
|
// Returns the list of singletons for every implementation compiled in. The
// entries appear in the order written below; detect_best_supported() walks
// the list front to back.
static const std::initializer_list<const implementation *>& get_available_implementation_pointers() {
  static const std::initializer_list<const implementation *> compiled_in {
#if SIMDJSON_IMPLEMENTATION_ICELAKE
    get_icelake_singleton(),
#endif
#if SIMDJSON_IMPLEMENTATION_HASWELL
    get_haswell_singleton(),
#endif
#if SIMDJSON_IMPLEMENTATION_WESTMERE
    get_westmere_singleton(),
#endif
#if SIMDJSON_IMPLEMENTATION_ARM64
    get_arm64_singleton(),
#endif
#if SIMDJSON_IMPLEMENTATION_PPC64
    get_ppc64_singleton(),
#endif
#if SIMDJSON_IMPLEMENTATION_LSX
    get_lsx_singleton(),
#endif
#if SIMDJSON_IMPLEMENTATION_LASX
    get_lasx_singleton(),
#endif
#if SIMDJSON_IMPLEMENTATION_FALLBACK
    get_fallback_singleton(),
#endif
  }; // compiled_in
  return compiled_in;
}
|
|
|
|
// So we can return UNSUPPORTED_ARCHITECTURE from the parser when there is no support
|
|
class unsupported_implementation final : public implementation {
|
|
public:
|
|
simdjson_warn_unused error_code create_dom_parser_implementation(
|
|
size_t,
|
|
size_t,
|
|
std::unique_ptr<internal::dom_parser_implementation>&
|
|
) const noexcept final {
|
|
return UNSUPPORTED_ARCHITECTURE;
|
|
}
|
|
simdjson_warn_unused error_code minify(const uint8_t *, size_t, uint8_t *, size_t &) const noexcept final override {
|
|
return UNSUPPORTED_ARCHITECTURE;
|
|
}
|
|
simdjson_warn_unused bool validate_utf8(const char *, size_t) const noexcept final override {
|
|
return false; // Just refuse to validate. Given that we have a fallback implementation
|
|
// it seems unlikely that unsupported_implementation will ever be used. If it is used,
|
|
// then it will flag all strings as invalid. The alternative is to return an error_code
|
|
// from which the user has to figure out whether the string is valid UTF-8... which seems
|
|
// like a lot of work just to handle the very unlikely case that we have an unsupported
|
|
// implementation. And, when it does happen (that we have an unsupported implementation),
|
|
// what are the chances that the programmer has a fallback? Given that *we* provide the
|
|
// fallback, it implies that the programmer would need a fallback for our fallback.
|
|
}
|
|
unsupported_implementation() : implementation("unsupported", "Unsupported CPU (no detected SIMD instructions)", 0) {}
|
|
};
|
|
|
|
static_assert(std::is_trivially_destructible<unsupported_implementation>::value, "unsupported_singleton should be trivially destructible");
|
|
|
|
// Returns the address of a function-local static unsupported_implementation,
// which is constructed the first time this function is called.
const unsupported_implementation* get_unsupported_singleton() {
  static const unsupported_implementation instance{};
  return &instance;
}
|
|
|
|
size_t available_implementation_list::size() const noexcept {
|
|
return internal::get_available_implementation_pointers().size();
|
|
}
|
|
// Pointer to the first entry of the list of compiled-in implementations.
const implementation * const *available_implementation_list::begin() const noexcept {
  const auto &compiled_in = internal::get_available_implementation_pointers();
  return compiled_in.begin();
}
|
|
// Pointer one past the last entry of the list of compiled-in implementations.
const implementation * const *available_implementation_list::end() const noexcept {
  const auto &compiled_in = internal::get_available_implementation_pointers();
  return compiled_in.end();
}
|
|
// Returns the first compiled-in implementation whose required instruction
// sets are all present on the host, or the "unsupported" singleton when none
// qualifies.
const implementation *available_implementation_list::detect_best_supported() const noexcept {
  const uint32_t host_isa = internal::detect_supported_architectures();
  // The candidates are prelisted in priority order, so the first match wins.
  for (const implementation *candidate : internal::get_available_implementation_pointers()) {
    const uint32_t needed = candidate->required_instruction_sets();
    if ((needed & ~host_isa) != 0) { continue; } // some required bit is missing
    return candidate;
  }
  return get_unsupported_singleton(); // this should never happen?
}
|
|
|
|
// Chooses an implementation, stores it through get_active_implementation()
// and returns it.
//
// When the SIMDJSON_FORCE_IMPLEMENTATION environment variable is set, the
// implementation with that name is chosen; an unknown name selects the
// "unsupported" singleton. Otherwise the choice is left to
// detect_best_supported().
const implementation *detect_best_supported_implementation_on_first_use::set_best() const noexcept {
  SIMDJSON_PUSH_DISABLE_WARNINGS
  SIMDJSON_DISABLE_DEPRECATED_WARNING // Disable CRT_SECURE warning on MSVC: manually verified this is safe
  char *forced_name = getenv("SIMDJSON_FORCE_IMPLEMENTATION");
  SIMDJSON_POP_DISABLE_WARNINGS

  if (!forced_name) {
    return get_active_implementation() = get_available_implementations().detect_best_supported();
  }

  auto forced = get_available_implementations()[forced_name];
  if (!forced) {
    // Note: abort() and stderr usage within the library is forbidden.
    return get_active_implementation() = get_unsupported_singleton();
  }
  return get_active_implementation() = forced;
}
|
|
|
|
} // namespace internal
|
|
|
|
// Returns a reference to a function-local static available_implementation_list,
// which is constructed the first time this function is called.
SIMDJSON_DLLIMPORTEXPORT const internal::available_implementation_list& get_available_implementations() {
  static const internal::available_implementation_list list_instance{};
  return list_instance;
}
|
|
|
|
// Returns the function-local static pointer to the implementation in use.
//
// Its initial value is the single compiled-in implementation when
// SIMDJSON_SINGLE_IMPLEMENTATION holds, and otherwise a
// detect_best_supported_implementation_on_first_use object.
SIMDJSON_DLLIMPORTEXPORT internal::atomic_ptr<const implementation>& get_active_implementation() {
#if SIMDJSON_SINGLE_IMPLEMENTATION
  // We immediately select the only implementation we have, skipping the
  // detector object.
  static internal::atomic_ptr<const implementation> active{internal::get_single_implementation()};
#else
  static const internal::detect_best_supported_implementation_on_first_use detector;
  static internal::atomic_ptr<const implementation> active{&detector};
#endif
  return active;
}
|
|
|
|
// Forwards to the active implementation's minify(), reinterpreting the char
// buffers as uint8_t buffers.
simdjson_warn_unused error_code minify(const char *buf, size_t len, char *dst, size_t &dst_len) noexcept {
  const uint8_t *input = reinterpret_cast<const uint8_t *>(buf);
  uint8_t *output = reinterpret_cast<uint8_t *>(dst);
  return get_active_implementation()->minify(input, len, output, dst_len);
}
|
|
// Forwards to the active implementation's validate_utf8().
simdjson_warn_unused bool validate_utf8(const char *buf, size_t len) noexcept {
  auto &active = get_active_implementation();
  return active->validate_utf8(buf, len);
}
|
|
// Returns the implementation named by SIMDJSON_BUILTIN_IMPLEMENTATION. The
// lookup result is kept in a function-local static, and an assert checks
// that it is non-null.
const implementation * builtin_implementation() {
  static const implementation * builtin_impl =
    get_available_implementations()[SIMDJSON_STRINGIFY(SIMDJSON_BUILTIN_IMPLEMENTATION)];
  assert(builtin_impl);
  return builtin_impl;
}
|
|
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_IMPLEMENTATION_CPP
|
|
/* end file implementation.cpp */
|
|
|
|
/* defining SIMDJSON_CONDITIONAL_INCLUDE */
|
|
#define SIMDJSON_CONDITIONAL_INCLUDE
|
|
|
|
#if SIMDJSON_IMPLEMENTATION_ARM64
|
|
/* including arm64.cpp: #include <arm64.cpp> */
|
|
/* begin file arm64.cpp */
|
|
#ifndef SIMDJSON_SRC_ARM64_CPP
|
|
#define SIMDJSON_SRC_ARM64_CPP
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include <base.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
/* including simdjson/arm64.h: #include <simdjson/arm64.h> */
|
|
/* begin file simdjson/arm64.h */
|
|
#ifndef SIMDJSON_ARM64_H
|
|
#define SIMDJSON_ARM64_H
|
|
|
|
/* including simdjson/arm64/begin.h: #include "simdjson/arm64/begin.h" */
|
|
/* begin file simdjson/arm64/begin.h */
|
|
/* defining SIMDJSON_IMPLEMENTATION to "arm64" */
|
|
#define SIMDJSON_IMPLEMENTATION arm64
|
|
/* including simdjson/arm64/base.h: #include "simdjson/arm64/base.h" */
|
|
/* begin file simdjson/arm64/base.h */
|
|
#ifndef SIMDJSON_ARM64_BASE_H
|
|
#define SIMDJSON_ARM64_BASE_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
/**
 * NEON (ARMv8) implementation.
 */
namespace arm64 {

// Forward declaration of the arm64 implementation class.
class implementation;

namespace {
namespace simd {

// Forward declarations of the SIMD wrapper templates (one register / one 64-byte block).
template <typename T> struct simd8;
template <typename T> struct simd8x64;

} // namespace simd
} // unnamed namespace

} // namespace arm64
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_ARM64_BASE_H
|
|
/* end file simdjson/arm64/base.h */
|
|
/* including simdjson/arm64/intrinsics.h: #include "simdjson/arm64/intrinsics.h" */
|
|
/* begin file simdjson/arm64/intrinsics.h */
|
|
#ifndef SIMDJSON_ARM64_INTRINSICS_H
|
|
#define SIMDJSON_ARM64_INTRINSICS_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/arm64/base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
// This should be the correct header whether
|
|
// you use visual studio or other compilers.
|
|
#include <arm_neon.h>
|
|
|
|
// Compile-time check: one NEON register (uint8x16_t) must not be larger than
// SIMDJSON_PADDING, otherwise the padding is insufficient for the arm64 kernel.
static_assert(sizeof(uint8x16_t) <= simdjson::SIMDJSON_PADDING, "insufficient padding for arm64");
|
|
|
|
#endif // SIMDJSON_ARM64_INTRINSICS_H
|
|
/* end file simdjson/arm64/intrinsics.h */
|
|
/* including simdjson/arm64/bitmanipulation.h: #include "simdjson/arm64/bitmanipulation.h" */
|
|
/* begin file simdjson/arm64/bitmanipulation.h */
|
|
#ifndef SIMDJSON_ARM64_BITMANIPULATION_H
|
|
#define SIMDJSON_ARM64_BITMANIPULATION_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/arm64/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/arm64/intrinsics.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace arm64 {
|
|
namespace {
|
|
|
|
// We sometimes call trailing_zero on inputs that are zero,
|
|
// but the algorithms do not end up using the returned value.
|
|
// Sadly, sanitizers are not smart enough to figure it out.
|
|
SIMDJSON_NO_SANITIZE_UNDEFINED
|
|
// This function can be used safely even if not all bytes have been
|
|
// initialized.
|
|
// See issue https://github.com/simdjson/simdjson/issues/1965
|
|
SIMDJSON_NO_SANITIZE_MEMORY
|
|
simdjson_inline int trailing_zeroes(uint64_t input_num) {
|
|
#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
|
|
unsigned long ret;
|
|
// Search the mask data from least significant bit (LSB)
|
|
// to the most significant bit (MSB) for a set bit (1).
|
|
_BitScanForward64(&ret, input_num);
|
|
return (int)ret;
|
|
#else // SIMDJSON_REGULAR_VISUAL_STUDIO
|
|
return __builtin_ctzll(input_num);
|
|
#endif // SIMDJSON_REGULAR_VISUAL_STUDIO
|
|
}
|
|
|
|
/* result might be undefined when input_num is zero */
|
|
simdjson_inline uint64_t clear_lowest_bit(uint64_t input_num) {
|
|
return input_num & (input_num-1);
|
|
}
|
|
|
|
/* result might be undefined when input_num is zero */
|
|
simdjson_inline int leading_zeroes(uint64_t input_num) {
|
|
#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
|
|
unsigned long leading_zero = 0;
|
|
// Search the mask data from most significant bit (MSB)
|
|
// to least significant bit (LSB) for a set bit (1).
|
|
if (_BitScanReverse64(&leading_zero, input_num))
|
|
return (int)(63 - leading_zero);
|
|
else
|
|
return 64;
|
|
#else
|
|
return __builtin_clzll(input_num);
|
|
#endif// SIMDJSON_REGULAR_VISUAL_STUDIO
|
|
}
|
|
|
|
/* result might be undefined when input_num is zero */
|
|
simdjson_inline int count_ones(uint64_t input_num) {
|
|
return vaddv_u8(vcnt_u8(vcreate_u8(input_num)));
|
|
}
|
|
|
|
|
|
#if defined(__GNUC__) // catches clang and gcc
|
|
/**
|
|
* ARM has a fast 64-bit "bit reversal function" that is handy. However,
|
|
* it is not generally available as an intrinsic function under Visual
|
|
* Studio (though this might be changing). Even under clang/gcc, we
|
|
* apparently need to invoke inline assembly.
|
|
*/
|
|
/*
|
|
* We use SIMDJSON_PREFER_REVERSE_BITS as a hint that algorithms that
|
|
* work well with bit reversal may use it.
|
|
*/
|
|
#define SIMDJSON_PREFER_REVERSE_BITS 1
|
|
|
|
/* reverse the bits */
|
|
simdjson_inline uint64_t reverse_bits(uint64_t input_num) {
|
|
uint64_t rev_bits;
|
|
__asm("rbit %0, %1" : "=r"(rev_bits) : "r"(input_num));
|
|
return rev_bits;
|
|
}
|
|
|
|
/**
|
|
* Flips bit at index 63 - lz. Thus if you have 'leading_zeroes' leading zeroes,
|
|
* then this will set to zero the leading bit. It is possible for leading_zeroes to be
|
|
* greating or equal to 63 in which case we trigger undefined behavior, but the output
|
|
* of such undefined behavior is never used.
|
|
**/
|
|
SIMDJSON_NO_SANITIZE_UNDEFINED
|
|
simdjson_inline uint64_t zero_leading_bit(uint64_t rev_bits, int leading_zeroes) {
|
|
return rev_bits ^ (uint64_t(0x8000000000000000) >> leading_zeroes);
|
|
}
|
|
|
|
#endif
|
|
|
|
/**
 * Adds two unsigned 64-bit values and reports whether the sum wrapped around.
 *
 * @param value1 first operand
 * @param value2 second operand
 * @param result receives the (possibly wrapped) sum value1 + value2
 * @return true if the addition overflowed 64 bits, false otherwise
 */
simdjson_inline bool add_overflow(uint64_t value1, uint64_t value2, uint64_t *result) {
#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
  // Unsigned addition wraps; the sum is smaller than an operand exactly when it wrapped.
  *result = value1 + value2;
  return *result < value1;
#else
  // The type-generic builtin writes through the uint64_t pointer directly, so
  // there is no need to reinterpret it as 'unsigned long long *' (which is a
  // distinct type from uint64_t on platforms where uint64_t is 'unsigned long').
  return __builtin_add_overflow(value1, value2, result);
#endif
}
|
|
|
|
} // unnamed namespace
|
|
} // namespace arm64
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_ARM64_BITMANIPULATION_H
|
|
/* end file simdjson/arm64/bitmanipulation.h */
|
|
/* including simdjson/arm64/bitmask.h: #include "simdjson/arm64/bitmask.h" */
|
|
/* begin file simdjson/arm64/bitmask.h */
|
|
#ifndef SIMDJSON_ARM64_BITMASK_H
|
|
#define SIMDJSON_ARM64_BITMASK_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/arm64/base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace arm64 {
|
|
namespace {
|
|
|
|
//
|
|
// Perform a "cumulative bitwise xor," flipping bits each time a 1 is encountered.
|
|
//
|
|
// For example, prefix_xor(00100100) == 00011100
|
|
//
|
|
simdjson_inline uint64_t prefix_xor(uint64_t bitmask) {
|
|
/////////////
|
|
// We could do this with PMULL, but it is apparently slow.
|
|
//
|
|
//#ifdef __ARM_FEATURE_CRYPTO // some ARM processors lack this extension
|
|
//return vmull_p64(-1ULL, bitmask);
|
|
//#else
|
|
// Analysis by @sebpop:
|
|
// When diffing the assembly for src/stage1_find_marks.cpp I see that the eors are all spread out
|
|
// in between other vector code, so effectively the extra cycles of the sequence do not matter
|
|
// because the GPR units are idle otherwise and the critical path is on the FP side.
|
|
// Also the PMULL requires two extra fmovs: GPR->FP (3 cycles in N1, 5 cycles in A72 )
|
|
// and FP->GPR (2 cycles on N1 and 5 cycles on A72.)
|
|
///////////
|
|
bitmask ^= bitmask << 1;
|
|
bitmask ^= bitmask << 2;
|
|
bitmask ^= bitmask << 4;
|
|
bitmask ^= bitmask << 8;
|
|
bitmask ^= bitmask << 16;
|
|
bitmask ^= bitmask << 32;
|
|
return bitmask;
|
|
}
|
|
|
|
} // unnamed namespace
|
|
} // namespace arm64
|
|
} // namespace simdjson
|
|
|
|
#endif
|
|
/* end file simdjson/arm64/bitmask.h */
|
|
/* including simdjson/arm64/numberparsing_defs.h: #include "simdjson/arm64/numberparsing_defs.h" */
|
|
/* begin file simdjson/arm64/numberparsing_defs.h */
|
|
#ifndef SIMDJSON_ARM64_NUMBERPARSING_DEFS_H
|
|
#define SIMDJSON_ARM64_NUMBERPARSING_DEFS_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/arm64/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/arm64/intrinsics.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/numberparsing_tables.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
#include <cstring>
|
|
|
|
#if SIMDJSON_REGULAR_VISUAL_STUDIO && SIMDJSON_IS_ARM64
|
|
// __umulh requires intrin.h
|
|
#include <intrin.h>
|
|
#endif // SIMDJSON_REGULAR_VISUAL_STUDIO && SIMDJSON_IS_ARM64
|
|
|
|
namespace simdjson {
|
|
namespace arm64 {
|
|
namespace numberparsing {
|
|
|
|
// we don't have SSE, so let us use a scalar function
|
|
// credit: https://johnnylee-sde.github.io/Fast-numeric-string-to-int/
|
|
/** @private */
|
|
static simdjson_inline uint32_t parse_eight_digits_unrolled(const uint8_t *chars) {
|
|
uint64_t val;
|
|
std::memcpy(&val, chars, sizeof(uint64_t));
|
|
val = (val & 0x0F0F0F0F0F0F0F0F) * 2561 >> 8;
|
|
val = (val & 0x00FF00FF00FF00FF) * 6553601 >> 16;
|
|
return uint32_t((val & 0x0000FFFF0000FFFF) * 42949672960001 >> 32);
|
|
}
|
|
|
|
/**
 * Computes the full 128-bit product of two 64-bit values.
 *
 * @return an internal::value128 whose 'low' member holds the least significant
 *         64 bits of value1 * value2 and whose 'high' member holds the most
 *         significant 64 bits.
 */
simdjson_inline internal::value128 full_multiplication(uint64_t value1, uint64_t value2) {
  internal::value128 answer;
#if !(SIMDJSON_REGULAR_VISUAL_STUDIO || SIMDJSON_IS_32BITS)
  // A native 128-bit integer type is available: multiply once and split the result.
  const __uint128_t product = static_cast<__uint128_t>(value1) * value2;
  answer.low = uint64_t(product);
  answer.high = uint64_t(product >> 64);
#elif SIMDJSON_IS_ARM64
  // ARM64 has native support for 64-bit multiplications, no need to emulate
  answer.high = __umulh(value1, value2);
  answer.low = value1 * value2;
#else
  // _umul128 is not available on ARM64, so it is only used on this branch.
  answer.low = _umul128(value1, value2, &answer.high);
#endif
  return answer;
}
|
|
|
|
} // namespace numberparsing
|
|
} // namespace arm64
|
|
} // namespace simdjson
|
|
|
|
#define SIMDJSON_SWAR_NUMBER_PARSING 1
|
|
|
|
#endif // SIMDJSON_ARM64_NUMBERPARSING_DEFS_H
|
|
/* end file simdjson/arm64/numberparsing_defs.h */
|
|
/* including simdjson/arm64/simd.h: #include "simdjson/arm64/simd.h" */
|
|
/* begin file simdjson/arm64/simd.h */
|
|
#ifndef SIMDJSON_ARM64_SIMD_H
|
|
#define SIMDJSON_ARM64_SIMD_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/arm64/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/arm64/bitmanipulation.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/simdprune_tables.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace arm64 {
|
|
namespace {
|
|
namespace simd {
|
|
|
|
#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
namespace {
// Start of private section with Visual Studio workaround.
//
// Each simdjson_make_<type>(...) macro builds a NEON vector from individual
// lane values: an immediately-invoked lambda stores the values in a local
// array and loads that array into a register. Regular Visual Studio builds use
// these helpers where other compilers use a brace-initialized vector literal.
// Every macro is only defined if it is not defined already.

#ifndef simdjson_make_uint8x16_t
#define simdjson_make_uint8x16_t(x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, \
                                 x13, x14, x15, x16)                                  \
  ([=]() {                                                                            \
    uint8_t lanes[16] = {x1, x2,  x3,  x4,  x5,  x6,  x7,  x8,                        \
                         x9, x10, x11, x12, x13, x14, x15, x16};                      \
    return vld1q_u8(lanes);                                                           \
  }())
#endif

#ifndef simdjson_make_int8x16_t
#define simdjson_make_int8x16_t(x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, \
                                x13, x14, x15, x16)                                  \
  ([=]() {                                                                           \
    int8_t lanes[16] = {x1, x2,  x3,  x4,  x5,  x6,  x7,  x8,                        \
                        x9, x10, x11, x12, x13, x14, x15, x16};                      \
    return vld1q_s8(lanes);                                                          \
  }())
#endif

#ifndef simdjson_make_uint8x8_t
#define simdjson_make_uint8x8_t(x1, x2, x3, x4, x5, x6, x7, x8) \
  ([=]() {                                                       \
    uint8_t lanes[8] = {x1, x2, x3, x4, x5, x6, x7, x8};         \
    return vld1_u8(lanes);                                       \
  }())
#endif

#ifndef simdjson_make_int8x8_t
#define simdjson_make_int8x8_t(x1, x2, x3, x4, x5, x6, x7, x8) \
  ([=]() {                                                      \
    int8_t lanes[8] = {x1, x2, x3, x4, x5, x6, x7, x8};         \
    return vld1_s8(lanes);                                      \
  }())
#endif

#ifndef simdjson_make_uint16x8_t
#define simdjson_make_uint16x8_t(x1, x2, x3, x4, x5, x6, x7, x8) \
  ([=]() {                                                        \
    uint16_t lanes[8] = {x1, x2, x3, x4, x5, x6, x7, x8};         \
    return vld1q_u16(lanes);                                      \
  }())
#endif

#ifndef simdjson_make_int16x8_t
#define simdjson_make_int16x8_t(x1, x2, x3, x4, x5, x6, x7, x8) \
  ([=]() {                                                       \
    int16_t lanes[8] = {x1, x2, x3, x4, x5, x6, x7, x8};         \
    return vld1q_s16(lanes);                                     \
  }())
#endif

// End of private section with Visual Studio workaround
} // namespace
#endif // SIMDJSON_REGULAR_VISUAL_STUDIO
|
|
|
|
|
|
template<typename T>
|
|
struct simd8;
|
|
|
|
//
|
|
// Base class of simd8<uint8_t> and simd8<bool>, both of which use uint8x16_t internally.
|
|
//
|
|
template<typename T, typename Mask=simd8<bool>>
|
|
struct base_u8 {
|
|
uint8x16_t value;
|
|
static const int SIZE = sizeof(value);
|
|
|
|
// Conversion from/to SIMD register
|
|
simdjson_inline base_u8(const uint8x16_t _value) : value(_value) {}
|
|
simdjson_inline operator const uint8x16_t&() const { return this->value; }
|
|
simdjson_inline operator uint8x16_t&() { return this->value; }
|
|
|
|
// Bit operations
|
|
simdjson_inline simd8<T> operator|(const simd8<T> other) const { return vorrq_u8(*this, other); }
|
|
simdjson_inline simd8<T> operator&(const simd8<T> other) const { return vandq_u8(*this, other); }
|
|
simdjson_inline simd8<T> operator^(const simd8<T> other) const { return veorq_u8(*this, other); }
|
|
simdjson_inline simd8<T> bit_andnot(const simd8<T> other) const { return vbicq_u8(*this, other); }
|
|
simdjson_inline simd8<T> operator~() const { return *this ^ 0xFFu; }
|
|
simdjson_inline simd8<T>& operator|=(const simd8<T> other) { auto this_cast = static_cast<simd8<T>*>(this); *this_cast = *this_cast | other; return *this_cast; }
|
|
simdjson_inline simd8<T>& operator&=(const simd8<T> other) { auto this_cast = static_cast<simd8<T>*>(this); *this_cast = *this_cast & other; return *this_cast; }
|
|
simdjson_inline simd8<T>& operator^=(const simd8<T> other) { auto this_cast = static_cast<simd8<T>*>(this); *this_cast = *this_cast ^ other; return *this_cast; }
|
|
|
|
friend simdjson_inline Mask operator==(const simd8<T> lhs, const simd8<T> rhs) { return vceqq_u8(lhs, rhs); }
|
|
|
|
template<int N=1>
|
|
simdjson_inline simd8<T> prev(const simd8<T> prev_chunk) const {
|
|
return vextq_u8(prev_chunk, *this, 16 - N);
|
|
}
|
|
};
|
|
|
|
// SIMD byte mask type (returned by things like eq and gt)
|
|
template<>
|
|
struct simd8<bool>: base_u8<bool> {
|
|
typedef uint16_t bitmask_t;
|
|
typedef uint32_t bitmask2_t;
|
|
|
|
static simdjson_inline simd8<bool> splat(bool _value) { return vmovq_n_u8(uint8_t(-(!!_value))); }
|
|
|
|
simdjson_inline simd8(const uint8x16_t _value) : base_u8<bool>(_value) {}
|
|
// False constructor
|
|
simdjson_inline simd8() : simd8(vdupq_n_u8(0)) {}
|
|
// Splat constructor
|
|
simdjson_inline simd8(bool _value) : simd8(splat(_value)) {}
|
|
|
|
// We return uint32_t instead of uint16_t because that seems to be more efficient for most
|
|
// purposes (cutting it down to uint16_t costs performance in some compilers).
|
|
simdjson_inline uint32_t to_bitmask() const {
|
|
#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
|
|
const uint8x16_t bit_mask = simdjson_make_uint8x16_t(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
|
|
0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80);
|
|
#else
|
|
const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
|
|
0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
|
|
#endif
|
|
auto minput = *this & bit_mask;
|
|
uint8x16_t tmp = vpaddq_u8(minput, minput);
|
|
tmp = vpaddq_u8(tmp, tmp);
|
|
tmp = vpaddq_u8(tmp, tmp);
|
|
return vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0);
|
|
}
|
|
simdjson_inline bool any() const { return vmaxvq_u32(vreinterpretq_u32_u8(*this)) != 0; }
|
|
};
|
|
|
|
// Unsigned bytes
|
|
template<>
|
|
struct simd8<uint8_t>: base_u8<uint8_t> {
|
|
static simdjson_inline uint8x16_t splat(uint8_t _value) { return vmovq_n_u8(_value); }
|
|
static simdjson_inline uint8x16_t zero() { return vdupq_n_u8(0); }
|
|
static simdjson_inline uint8x16_t load(const uint8_t* values) { return vld1q_u8(values); }
|
|
|
|
simdjson_inline simd8(const uint8x16_t _value) : base_u8<uint8_t>(_value) {}
|
|
// Zero constructor
|
|
simdjson_inline simd8() : simd8(zero()) {}
|
|
// Array constructor
|
|
simdjson_inline simd8(const uint8_t values[16]) : simd8(load(values)) {}
|
|
// Splat constructor
|
|
simdjson_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
|
|
// Member-by-member initialization
|
|
#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
|
|
simdjson_inline simd8(
|
|
uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
|
|
uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
|
|
) : simd8(simdjson_make_uint8x16_t(
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15
|
|
)) {}
|
|
#else
|
|
simdjson_inline simd8(
|
|
uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
|
|
uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
|
|
) : simd8(uint8x16_t{
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15
|
|
}) {}
|
|
#endif
|
|
|
|
// Repeat 16 values as many times as necessary (usually for lookup tables)
|
|
simdjson_inline static simd8<uint8_t> repeat_16(
|
|
uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
|
|
uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
|
|
) {
|
|
return simd8<uint8_t>(
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15
|
|
);
|
|
}
|
|
|
|
// Store to array
|
|
simdjson_inline void store(uint8_t dst[16]) const { return vst1q_u8(dst, *this); }
|
|
|
|
// Saturated math
|
|
simdjson_inline simd8<uint8_t> saturating_add(const simd8<uint8_t> other) const { return vqaddq_u8(*this, other); }
|
|
simdjson_inline simd8<uint8_t> saturating_sub(const simd8<uint8_t> other) const { return vqsubq_u8(*this, other); }
|
|
|
|
// Addition/subtraction are the same for signed and unsigned
|
|
simdjson_inline simd8<uint8_t> operator+(const simd8<uint8_t> other) const { return vaddq_u8(*this, other); }
|
|
simdjson_inline simd8<uint8_t> operator-(const simd8<uint8_t> other) const { return vsubq_u8(*this, other); }
|
|
simdjson_inline simd8<uint8_t>& operator+=(const simd8<uint8_t> other) { *this = *this + other; return *this; }
|
|
simdjson_inline simd8<uint8_t>& operator-=(const simd8<uint8_t> other) { *this = *this - other; return *this; }
|
|
|
|
// Order-specific operations
|
|
simdjson_inline uint8_t max_val() const { return vmaxvq_u8(*this); }
|
|
simdjson_inline uint8_t min_val() const { return vminvq_u8(*this); }
|
|
simdjson_inline simd8<uint8_t> max_val(const simd8<uint8_t> other) const { return vmaxq_u8(*this, other); }
|
|
simdjson_inline simd8<uint8_t> min_val(const simd8<uint8_t> other) const { return vminq_u8(*this, other); }
|
|
simdjson_inline simd8<bool> operator<=(const simd8<uint8_t> other) const { return vcleq_u8(*this, other); }
|
|
simdjson_inline simd8<bool> operator>=(const simd8<uint8_t> other) const { return vcgeq_u8(*this, other); }
|
|
simdjson_inline simd8<bool> operator<(const simd8<uint8_t> other) const { return vcltq_u8(*this, other); }
|
|
simdjson_inline simd8<bool> operator>(const simd8<uint8_t> other) const { return vcgtq_u8(*this, other); }
|
|
// Same as >, but instead of guaranteeing all 1's == true, false = 0 and true = nonzero. For ARM, returns all 1's.
|
|
simdjson_inline simd8<uint8_t> gt_bits(const simd8<uint8_t> other) const { return simd8<uint8_t>(*this > other); }
|
|
// Same as <, but instead of guaranteeing all 1's == true, false = 0 and true = nonzero. For ARM, returns all 1's.
|
|
simdjson_inline simd8<uint8_t> lt_bits(const simd8<uint8_t> other) const { return simd8<uint8_t>(*this < other); }
|
|
|
|
// Bit-specific operations
|
|
simdjson_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const { return vtstq_u8(*this, bits); }
|
|
simdjson_inline bool any_bits_set_anywhere() const { return this->max_val() != 0; }
|
|
simdjson_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const { return (*this & bits).any_bits_set_anywhere(); }
|
|
template<int N>
|
|
simdjson_inline simd8<uint8_t> shr() const { return vshrq_n_u8(*this, N); }
|
|
template<int N>
|
|
simdjson_inline simd8<uint8_t> shl() const { return vshlq_n_u8(*this, N); }
|
|
|
|
// Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values)
|
|
template<typename L>
|
|
simdjson_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
|
|
return lookup_table.apply_lookup_16_to(*this);
|
|
}
|
|
|
|
|
|
// Copies to 'output" all bytes corresponding to a 0 in the mask (interpreted as a bitset).
|
|
// Passing a 0 value for mask would be equivalent to writing out every byte to output.
|
|
// Only the first 16 - count_ones(mask) bytes of the result are significant but 16 bytes
|
|
// get written.
|
|
// Design consideration: it seems like a function with the
|
|
// signature simd8<L> compress(uint16_t mask) would be
|
|
// sensible, but the AVX ISA makes this kind of approach difficult.
|
|
template<typename L>
|
|
simdjson_inline void compress(uint16_t mask, L * output) const {
|
|
using internal::thintable_epi8;
|
|
using internal::BitsSetTable256mul2;
|
|
using internal::pshufb_combine_table;
|
|
// this particular implementation was inspired by work done by @animetosho
|
|
// we do it in two steps, first 8 bytes and then second 8 bytes
|
|
uint8_t mask1 = uint8_t(mask); // least significant 8 bits
|
|
uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits
|
|
// next line just loads the 64-bit values thintable_epi8[mask1] and
|
|
// thintable_epi8[mask2] into a 128-bit register, using only
|
|
// two instructions on most compilers.
|
|
uint64x2_t shufmask64 = {thintable_epi8[mask1], thintable_epi8[mask2]};
|
|
uint8x16_t shufmask = vreinterpretq_u8_u64(shufmask64);
|
|
// we increment by 0x08 the second half of the mask
|
|
#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
|
|
uint8x16_t inc = simdjson_make_uint8x16_t(0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08);
|
|
#else
|
|
uint8x16_t inc = {0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08};
|
|
#endif
|
|
shufmask = vaddq_u8(shufmask, inc);
|
|
// this is the version "nearly pruned"
|
|
uint8x16_t pruned = vqtbl1q_u8(*this, shufmask);
|
|
// we still need to put the two halves together.
|
|
// we compute the popcount of the first half:
|
|
int pop1 = BitsSetTable256mul2[mask1];
|
|
// then load the corresponding mask, what it does is to write
|
|
// only the first pop1 bytes from the first 8 bytes, and then
|
|
// it fills in with the bytes from the second 8 bytes + some filling
|
|
// at the end.
|
|
uint8x16_t compactmask = vld1q_u8(reinterpret_cast<const uint8_t *>(pshufb_combine_table + pop1 * 8));
|
|
uint8x16_t answer = vqtbl1q_u8(pruned, compactmask);
|
|
vst1q_u8(reinterpret_cast<uint8_t*>(output), answer);
|
|
}
|
|
|
|
// Copies all bytes corresponding to a 0 in the low half of the mask (interpreted as a
|
|
// bitset) to output1, then those corresponding to a 0 in the high half to output2.
|
|
template<typename L>
|
|
simdjson_inline void compress_halves(uint16_t mask, L *output1, L *output2) const {
|
|
using internal::thintable_epi8;
|
|
uint8_t mask1 = uint8_t(mask); // least significant 8 bits
|
|
uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits
|
|
uint8x8_t compactmask1 = vcreate_u8(thintable_epi8[mask1]);
|
|
uint8x8_t compactmask2 = vcreate_u8(thintable_epi8[mask2]);
|
|
// we increment by 0x08 the second half of the mask
|
|
#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
|
|
uint8x8_t inc = simdjson_make_uint8x8_t(0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08);
|
|
#else
|
|
uint8x8_t inc = {0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08};
|
|
#endif
|
|
compactmask2 = vadd_u8(compactmask2, inc);
|
|
// store each result (with the second store possibly overlapping the first)
|
|
vst1_u8((uint8_t*)output1, vqtbl1_u8(*this, compactmask1));
|
|
vst1_u8((uint8_t*)output2, vqtbl1_u8(*this, compactmask2));
|
|
}
|
|
|
|
template<typename L>
|
|
simdjson_inline simd8<L> lookup_16(
|
|
L replace0, L replace1, L replace2, L replace3,
|
|
L replace4, L replace5, L replace6, L replace7,
|
|
L replace8, L replace9, L replace10, L replace11,
|
|
L replace12, L replace13, L replace14, L replace15) const {
|
|
return lookup_16(simd8<L>::repeat_16(
|
|
replace0, replace1, replace2, replace3,
|
|
replace4, replace5, replace6, replace7,
|
|
replace8, replace9, replace10, replace11,
|
|
replace12, replace13, replace14, replace15
|
|
));
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_inline simd8<uint8_t> apply_lookup_16_to(const simd8<T> original) {
|
|
return vqtbl1q_u8(*this, simd8<uint8_t>(original));
|
|
}
|
|
};
|
|
|
|
// Signed bytes
|
|
template<>
|
|
struct simd8<int8_t> {
|
|
int8x16_t value;
|
|
|
|
static simdjson_inline simd8<int8_t> splat(int8_t _value) { return vmovq_n_s8(_value); }
|
|
static simdjson_inline simd8<int8_t> zero() { return vdupq_n_s8(0); }
|
|
static simdjson_inline simd8<int8_t> load(const int8_t values[16]) { return vld1q_s8(values); }
|
|
|
|
// Conversion from/to SIMD register
|
|
simdjson_inline simd8(const int8x16_t _value) : value{_value} {}
|
|
simdjson_inline operator const int8x16_t&() const { return this->value; }
|
|
simdjson_inline operator int8x16_t&() { return this->value; }
|
|
|
|
// Zero constructor
|
|
simdjson_inline simd8() : simd8(zero()) {}
|
|
// Splat constructor
|
|
simdjson_inline simd8(int8_t _value) : simd8(splat(_value)) {}
|
|
// Array constructor
|
|
simdjson_inline simd8(const int8_t* values) : simd8(load(values)) {}
|
|
// Member-by-member initialization
|
|
#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
|
|
simdjson_inline simd8(
|
|
int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
|
|
int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
|
|
) : simd8(simdjson_make_int8x16_t(
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15
|
|
)) {}
|
|
#else
|
|
simdjson_inline simd8(
|
|
int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
|
|
int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
|
|
) : simd8(int8x16_t{
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15
|
|
}) {}
|
|
#endif
|
|
// Repeat 16 values as many times as necessary (usually for lookup tables)
|
|
simdjson_inline static simd8<int8_t> repeat_16(
|
|
int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
|
|
int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
|
|
) {
|
|
return simd8<int8_t>(
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15
|
|
);
|
|
}
|
|
|
|
// Store to array
|
|
simdjson_inline void store(int8_t dst[16]) const { return vst1q_s8(dst, *this); }
|
|
|
|
// Explicit conversion to/from unsigned
|
|
//
|
|
// Under Visual Studio/ARM64 uint8x16_t and int8x16_t are apparently the same type.
|
|
// In theory, we could check this occurrence with std::same_as and std::enabled_if but it is C++14
|
|
// and relatively ugly and hard to read.
|
|
#ifndef SIMDJSON_REGULAR_VISUAL_STUDIO
|
|
simdjson_inline explicit simd8(const uint8x16_t other): simd8(vreinterpretq_s8_u8(other)) {}
|
|
#endif
|
|
simdjson_inline explicit operator simd8<uint8_t>() const { return vreinterpretq_u8_s8(this->value); }
|
|
|
|
// Math
|
|
simdjson_inline simd8<int8_t> operator+(const simd8<int8_t> other) const { return vaddq_s8(*this, other); }
|
|
simdjson_inline simd8<int8_t> operator-(const simd8<int8_t> other) const { return vsubq_s8(*this, other); }
|
|
simdjson_inline simd8<int8_t>& operator+=(const simd8<int8_t> other) { *this = *this + other; return *this; }
|
|
simdjson_inline simd8<int8_t>& operator-=(const simd8<int8_t> other) { *this = *this - other; return *this; }
|
|
|
|
// Order-sensitive comparisons
|
|
simdjson_inline simd8<int8_t> max_val(const simd8<int8_t> other) const { return vmaxq_s8(*this, other); }
|
|
simdjson_inline simd8<int8_t> min_val(const simd8<int8_t> other) const { return vminq_s8(*this, other); }
|
|
simdjson_inline simd8<bool> operator>(const simd8<int8_t> other) const { return vcgtq_s8(*this, other); }
|
|
simdjson_inline simd8<bool> operator<(const simd8<int8_t> other) const { return vcltq_s8(*this, other); }
|
|
simdjson_inline simd8<bool> operator==(const simd8<int8_t> other) const { return vceqq_s8(*this, other); }
|
|
|
|
template<int N=1>
|
|
simdjson_inline simd8<int8_t> prev(const simd8<int8_t> prev_chunk) const {
|
|
return vextq_s8(prev_chunk, *this, 16 - N);
|
|
}
|
|
|
|
// Perform a lookup assuming no value is larger than 16
|
|
template<typename L>
|
|
simdjson_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
|
|
return lookup_table.apply_lookup_16_to(*this);
|
|
}
|
|
template<typename L>
|
|
simdjson_inline simd8<L> lookup_16(
|
|
L replace0, L replace1, L replace2, L replace3,
|
|
L replace4, L replace5, L replace6, L replace7,
|
|
L replace8, L replace9, L replace10, L replace11,
|
|
L replace12, L replace13, L replace14, L replace15) const {
|
|
return lookup_16(simd8<L>::repeat_16(
|
|
replace0, replace1, replace2, replace3,
|
|
replace4, replace5, replace6, replace7,
|
|
replace8, replace9, replace10, replace11,
|
|
replace12, replace13, replace14, replace15
|
|
));
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_inline simd8<int8_t> apply_lookup_16_to(const simd8<T> original) {
|
|
return vqtbl1q_s8(*this, simd8<uint8_t>(original));
|
|
}
|
|
};
|
|
|
|
template<typename T>
|
|
struct simd8x64 {
|
|
static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
|
|
static_assert(NUM_CHUNKS == 4, "ARM kernel should use four registers per 64-byte block.");
|
|
const simd8<T> chunks[NUM_CHUNKS];
|
|
|
|
simd8x64(const simd8x64<T>& o) = delete; // no copy allowed
|
|
simd8x64<T>& operator=(const simd8<T>& other) = delete; // no assignment allowed
|
|
simd8x64() = delete; // no default constructor allowed
|
|
|
|
simdjson_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1, const simd8<T> chunk2, const simd8<T> chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {}
|
|
simdjson_inline simd8x64(const T ptr[64]) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+16), simd8<T>::load(ptr+32), simd8<T>::load(ptr+48)} {}
|
|
|
|
simdjson_inline void store(T ptr[64]) const {
|
|
this->chunks[0].store(ptr+sizeof(simd8<T>)*0);
|
|
this->chunks[1].store(ptr+sizeof(simd8<T>)*1);
|
|
this->chunks[2].store(ptr+sizeof(simd8<T>)*2);
|
|
this->chunks[3].store(ptr+sizeof(simd8<T>)*3);
|
|
}
|
|
|
|
simdjson_inline simd8<T> reduce_or() const {
|
|
return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]);
|
|
}
|
|
|
|
|
|
simdjson_inline uint64_t compress(uint64_t mask, T * output) const {
|
|
uint64_t popcounts = vget_lane_u64(vreinterpret_u64_u8(vcnt_u8(vcreate_u8(~mask))), 0);
|
|
// compute the prefix sum of the popcounts of each byte
|
|
uint64_t offsets = popcounts * 0x0101010101010101;
|
|
this->chunks[0].compress_halves(uint16_t(mask), output, &output[popcounts & 0xFF]);
|
|
this->chunks[1].compress_halves(uint16_t(mask >> 16), &output[(offsets >> 8) & 0xFF], &output[(offsets >> 16) & 0xFF]);
|
|
this->chunks[2].compress_halves(uint16_t(mask >> 32), &output[(offsets >> 24) & 0xFF], &output[(offsets >> 32) & 0xFF]);
|
|
this->chunks[3].compress_halves(uint16_t(mask >> 48), &output[(offsets >> 40) & 0xFF], &output[(offsets >> 48) & 0xFF]);
|
|
return offsets >> 56;
|
|
}
|
|
|
|
simdjson_inline uint64_t to_bitmask() const {
|
|
#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
|
|
const uint8x16_t bit_mask = simdjson_make_uint8x16_t(
|
|
0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
|
|
0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
|
|
);
|
|
#else
|
|
const uint8x16_t bit_mask = {
|
|
0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
|
|
0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
|
|
};
|
|
#endif
|
|
// Add each of the elements next to each other, successively, to stuff each 8 byte mask into one.
|
|
uint8x16_t sum0 = vpaddq_u8(this->chunks[0] & bit_mask, this->chunks[1] & bit_mask);
|
|
uint8x16_t sum1 = vpaddq_u8(this->chunks[2] & bit_mask, this->chunks[3] & bit_mask);
|
|
sum0 = vpaddq_u8(sum0, sum1);
|
|
sum0 = vpaddq_u8(sum0, sum0);
|
|
return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0);
|
|
}
|
|
|
|
simdjson_inline uint64_t eq(const T m) const {
|
|
const simd8<T> mask = simd8<T>::splat(m);
|
|
return simd8x64<bool>(
|
|
this->chunks[0] == mask,
|
|
this->chunks[1] == mask,
|
|
this->chunks[2] == mask,
|
|
this->chunks[3] == mask
|
|
).to_bitmask();
|
|
}
|
|
|
|
simdjson_inline uint64_t lteq(const T m) const {
|
|
const simd8<T> mask = simd8<T>::splat(m);
|
|
return simd8x64<bool>(
|
|
this->chunks[0] <= mask,
|
|
this->chunks[1] <= mask,
|
|
this->chunks[2] <= mask,
|
|
this->chunks[3] <= mask
|
|
).to_bitmask();
|
|
}
|
|
}; // struct simd8x64<T>
|
|
|
|
} // namespace simd
|
|
} // unnamed namespace
|
|
} // namespace arm64
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_ARM64_SIMD_H
|
|
/* end file simdjson/arm64/simd.h */
|
|
/* including simdjson/arm64/stringparsing_defs.h: #include "simdjson/arm64/stringparsing_defs.h" */
|
|
/* begin file simdjson/arm64/stringparsing_defs.h */
|
|
#ifndef SIMDJSON_ARM64_STRINGPARSING_DEFS_H
|
|
#define SIMDJSON_ARM64_STRINGPARSING_DEFS_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/arm64/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/arm64/simd.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/arm64/bitmanipulation.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace arm64 {
|
|
namespace {
|
|
|
|
using namespace simd;
|
|
|
|
// Holds backslashes and quotes locations.
|
|
struct backslash_and_quote {
|
|
public:
|
|
static constexpr uint32_t BYTES_PROCESSED = 32;
|
|
simdjson_inline static backslash_and_quote copy_and_find(const uint8_t *src, uint8_t *dst);
|
|
|
|
simdjson_inline bool has_quote_first() { return ((bs_bits - 1) & quote_bits) != 0; }
|
|
simdjson_inline bool has_backslash() { return bs_bits != 0; }
|
|
simdjson_inline int quote_index() { return trailing_zeroes(quote_bits); }
|
|
simdjson_inline int backslash_index() { return trailing_zeroes(bs_bits); }
|
|
|
|
uint32_t bs_bits;
|
|
uint32_t quote_bits;
|
|
}; // struct backslash_and_quote
|
|
|
|
// Copies BYTES_PROCESSED bytes from src to dst and reports where the
// backslashes and quotes are within those bytes.
simdjson_inline backslash_and_quote backslash_and_quote::copy_and_find(const uint8_t *src, uint8_t *dst) {
  // The two loads below may touch up to 31 bytes past the end of the logical
  // input; that is covered by the SIMDJSON_PADDING requirement checked here.
  static_assert(SIMDJSON_PADDING >= (BYTES_PROCESSED - 1), "backslash and quote finder must process fewer than SIMDJSON_PADDING bytes");

  // Both halves are loaded before anything is written to dst.
  simd8<uint8_t> first_half(src);
  simd8<uint8_t> second_half(src + sizeof(first_half));
  first_half.store(dst);
  second_half.store(dst + sizeof(first_half));

  // One 64-bit bitmask is cheaper on ARM than several 16-bit ones, so the four
  // comparison results are packed into a single simd8x64<bool>:
  // bits 0..31 = backslash matches, bits 32..63 = quote matches.
  const uint64_t packed = simd8x64<bool>(
    first_half == '\\', second_half == '\\',
    first_half == '"', second_half == '"'
  ).to_bitmask();

  const uint32_t backslash_mask = uint32_t(packed);
  const uint32_t quote_mask = uint32_t(packed >> 32);
  return { backslash_mask, quote_mask };
}
|
|
|
|
} // unnamed namespace
|
|
} // namespace arm64
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_ARM64_STRINGPARSING_DEFS_H
|
|
/* end file simdjson/arm64/stringparsing_defs.h */
|
|
|
|
#define SIMDJSON_SKIP_BACKSLASH_SHORT_CIRCUIT 1
|
|
/* end file simdjson/arm64/begin.h */
|
|
/* including simdjson/generic/amalgamated.h for arm64: #include "simdjson/generic/amalgamated.h" */
|
|
/* begin file simdjson/generic/amalgamated.h for arm64 */
|
|
#if defined(SIMDJSON_CONDITIONAL_INCLUDE) && !defined(SIMDJSON_GENERIC_DEPENDENCIES_H)
|
|
#error simdjson/generic/dependencies.h must be included before simdjson/generic/amalgamated.h!
|
|
#endif
|
|
|
|
/* including simdjson/generic/base.h for arm64: #include "simdjson/generic/base.h" */
|
|
/* begin file simdjson/generic/base.h for arm64 */
|
|
#ifndef SIMDJSON_GENERIC_BASE_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_GENERIC_BASE_H */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/base.h" */
|
|
/* amalgamation skipped (editor-only): // If we haven't got an implementation yet, we're in the editor, editing a generic file! Just */
|
|
/* amalgamation skipped (editor-only): // use the most advanced one we can so the most possible stuff can be tested. */
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_IMPLEMENTATION */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/implementation_detection.h" */
|
|
/* amalgamation skipped (editor-only): #if SIMDJSON_IMPLEMENTATION_ICELAKE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/icelake/begin.h" */
|
|
/* amalgamation skipped (editor-only): #elif SIMDJSON_IMPLEMENTATION_HASWELL */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/haswell/begin.h" */
|
|
/* amalgamation skipped (editor-only): #elif SIMDJSON_IMPLEMENTATION_WESTMERE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/westmere/begin.h" */
|
|
/* amalgamation skipped (editor-only): #elif SIMDJSON_IMPLEMENTATION_ARM64 */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/arm64/begin.h" */
|
|
/* amalgamation skipped (editor-only): #elif SIMDJSON_IMPLEMENTATION_PPC64 */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/ppc64/begin.h" */
|
|
/* amalgamation skipped (editor-only): #elif SIMDJSON_IMPLEMENTATION_LSX */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/lsx/begin.h" */
|
|
/* amalgamation skipped (editor-only): #elif SIMDJSON_IMPLEMENTATION_LASX */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/lasx/begin.h" */
|
|
/* amalgamation skipped (editor-only): #elif SIMDJSON_IMPLEMENTATION_FALLBACK */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/fallback/begin.h" */
|
|
/* amalgamation skipped (editor-only): #else */
|
|
/* amalgamation skipped (editor-only): #error "All possible implementations (including fallback) have been disabled! simdjson will not run." */
|
|
/* amalgamation skipped (editor-only): #endif */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_IMPLEMENTATION */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace arm64 {
|
|
|
|
struct open_container;
|
|
class dom_parser_implementation;
|
|
|
|
/**
|
|
* The type of a JSON number
|
|
*/
|
|
enum class number_type {
  /// a binary64 number
  floating_point_number = 1,
  /// a signed integer that fits in a 64-bit word using two's complement
  signed_integer = 2,
  /// a positive integer larger or equal to 1<<63
  unsigned_integer = 3,
  /// a big integer that does not fit in a 64-bit word
  big_integer = 4
};
|
|
|
|
} // namespace arm64
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_GENERIC_BASE_H
|
|
/* end file simdjson/generic/base.h for arm64 */
|
|
/* including simdjson/generic/jsoncharutils.h for arm64: #include "simdjson/generic/jsoncharutils.h" */
|
|
/* begin file simdjson/generic/jsoncharutils.h for arm64 */
|
|
#ifndef SIMDJSON_GENERIC_JSONCHARUTILS_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_GENERIC_JSONCHARUTILS_H */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/generic/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/jsoncharutils_tables.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/numberparsing_tables.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace arm64 {
|
|
namespace {
|
|
namespace jsoncharutils {
|
|
|
|
// return non-zero if not a structural or whitespace char
|
|
// zero otherwise
|
|
simdjson_inline uint32_t is_not_structural_or_whitespace(uint8_t c) {
|
|
return internal::structural_or_whitespace_negated[c];
|
|
}
|
|
|
|
// Looks `c` up in internal::structural_or_whitespace and returns the entry.
simdjson_inline uint32_t is_structural_or_whitespace(uint8_t c) {
  const uint32_t table_entry = internal::structural_or_whitespace[c];
  return table_entry;
}
|
|
|
|
// returns a value with the high 16 bits set if not valid
|
|
// otherwise returns the conversion of the 4 hex digits at src into the bottom
|
|
// 16 bits of the 32-bit return register
|
|
//
|
|
// see
|
|
// https://lemire.me/blog/2019/04/17/parsing-short-hexadecimal-strings-efficiently/
|
|
static inline uint32_t hex_to_u32_nocheck(
|
|
const uint8_t *src) { // strictly speaking, static inline is a C-ism
|
|
uint32_t v1 = internal::digit_to_val32[630 + src[0]];
|
|
uint32_t v2 = internal::digit_to_val32[420 + src[1]];
|
|
uint32_t v3 = internal::digit_to_val32[210 + src[2]];
|
|
uint32_t v4 = internal::digit_to_val32[0 + src[3]];
|
|
return v1 | v2 | v3 | v4;
|
|
}
|
|
|
|
// Encodes the code point cp as UTF-8 into c and returns the number of bytes
// written (1 to 4). A return value of 0 means cp was larger than 0x10FFFF and
// nothing was written.
//
// This could possibly be made faster with pdep, clz and table lookups, but
// JSON documents contain few escaped code points and this routine is cheap.
//
// Note: surrogates (0xD800..0xDFFF) are NOT rejected here; they are assumed
// to be handled by the caller and are encoded like any other 3-byte value.
//
simdjson_inline size_t codepoint_to_utf8(uint32_t cp, uint8_t *c) {
  if (cp > 0x10FFFF) {
    return 0; // too large to be a code point
  }
  if (cp < 0x80) {
    // 0xxxxxxx
    c[0] = uint8_t(cp);
    return 1;
  }
  if (cp < 0x800) {
    // 110xxxxx 10xxxxxx
    c[0] = uint8_t(0xC0 | (cp >> 6));
    c[1] = uint8_t(0x80 | (cp & 0x3F));
    return 2;
  }
  if (cp < 0x10000) {
    // 1110xxxx 10xxxxxx 10xxxxxx
    c[0] = uint8_t(0xE0 | (cp >> 12));
    c[1] = uint8_t(0x80 | ((cp >> 6) & 0x3F));
    c[2] = uint8_t(0x80 | (cp & 0x3F));
    return 3;
  }
  // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  c[0] = uint8_t(0xF0 | (cp >> 18));
  c[1] = uint8_t(0x80 | ((cp >> 12) & 0x3F));
  c[2] = uint8_t(0x80 | ((cp >> 6) & 0x3F));
  c[3] = uint8_t(0x80 | (cp & 0x3F));
  return 4;
}
|
|
|
|
#if SIMDJSON_IS_32BITS // _umul128 for x86, arm
|
|
// this is a slow emulation routine for 32-bit
|
|
//
|
|
// 32 x 32 -> 64 bit unsigned multiplication.
static simdjson_inline uint64_t __emulu(uint32_t x, uint32_t y) {
  return uint64_t(x) * uint64_t(y);
}

// 64 x 64 -> 128 bit unsigned multiplication built from four 32-bit partial
// products. With ab = a*2^32 + b and cd = c*2^32 + d:
//   ab * cd = (a*c)*2^64 + (a*d + b*c)*2^32 + b*d
// Returns the low 64 bits and stores the high 64 bits in *hi.
static simdjson_inline uint64_t _umul128(uint64_t ab, uint64_t cd, uint64_t *hi) {
  const uint32_t a = uint32_t(ab >> 32);
  const uint32_t b = uint32_t(ab);
  const uint32_t c = uint32_t(cd >> 32);
  const uint32_t d = uint32_t(cd);

  const uint64_t a_times_d = __emulu(a, d);
  const uint64_t b_times_d = __emulu(b, d);

  // Middle term a*d + b*c; it can wrap around 2^64, in which case the lost
  // bit is worth 2^32 in the high word.
  const uint64_t middle = a_times_d + __emulu(b, c);
  const uint64_t middle_carry = (middle < a_times_d) ? 1 : 0;

  // Low word, with its own carry into the high word.
  const uint64_t low = b_times_d + (middle << 32);
  const uint64_t low_carry = (low < b_times_d) ? 1 : 0;

  *hi = __emulu(a, c) + (middle >> 32) + (middle_carry << 32) + low_carry;
  return low;
}
|
|
#endif
|
|
|
|
} // namespace jsoncharutils
|
|
} // unnamed namespace
|
|
} // namespace arm64
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_GENERIC_JSONCHARUTILS_H
|
|
/* end file simdjson/generic/jsoncharutils.h for arm64 */
|
|
/* including simdjson/generic/atomparsing.h for arm64: #include "simdjson/generic/atomparsing.h" */
|
|
/* begin file simdjson/generic/atomparsing.h for arm64 */
|
|
#ifndef SIMDJSON_GENERIC_ATOMPARSING_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_GENERIC_ATOMPARSING_H */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/generic/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/generic/jsoncharutils.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
#include <cstring>
|
|
|
|
namespace simdjson {
|
|
namespace arm64 {
|
|
namespace {
|
|
/// @private
|
|
namespace atomparsing {
|
|
|
|
// Maps the first four characters of a literal string to a 32-bit value.
// The bytes are copied with memcpy rather than read through a pointer cast,
// because the character pointer is not guaranteed to be suitably aligned and
// such a cast would be undefined behavior.
// The memcpy is not a cost in practice: decent optimizing compilers (GCC,
// clang, Visual Studio) fold string_to_uint32("false") into the compile-time
// constant 1936482662.
simdjson_inline uint32_t string_to_uint32(const char* str) {
  uint32_t packed;
  std::memcpy(&packed, str, sizeof(packed));
  return packed;
}
|
|
|
|
|
|
// Again in str4ncmp we use a memcpy to avoid undefined behavior. The memcpy may appear expensive.
|
|
// Yet all decent optimizing compilers will compile memcpy to a single instruction, just about.
|
|
simdjson_warn_unused
|
|
simdjson_inline uint32_t str4ncmp(const uint8_t *src, const char* atom) {
|
|
uint32_t srcval; // we want to avoid unaligned 32-bit loads (undefined in C/C++)
|
|
static_assert(sizeof(uint32_t) <= SIMDJSON_PADDING, "SIMDJSON_PADDING must be larger than 4 bytes");
|
|
std::memcpy(&srcval, src, sizeof(uint32_t));
|
|
return srcval ^ string_to_uint32(atom);
|
|
}
|
|
|
|
// True when src starts with "true" and src[4] is a structural or whitespace
// character. Reads src[0..4]; both checks are always evaluated.
simdjson_warn_unused
simdjson_inline bool is_valid_true_atom(const uint8_t *src) {
  const uint32_t mismatch = str4ncmp(src, "true");
  const uint32_t bad_terminator = jsoncharutils::is_not_structural_or_whitespace(src[4]);
  return (mismatch | bad_terminator) == 0;
}
|
|
|
|
// Length-aware variant: with fewer than 4 bytes the atom cannot match, with
// exactly 4 only the letters are compared, and with more than 4 the
// terminator after the atom is checked as well.
simdjson_warn_unused
simdjson_inline bool is_valid_true_atom(const uint8_t *src, size_t len) {
  if (len < 4) {
    return false;
  }
  if (len == 4) {
    return !str4ncmp(src, "true");
  }
  return is_valid_true_atom(src);
}
|
|
|
|
// True when src[1..4] is "alse" and src[5] is a structural or whitespace
// character. src[0] is not examined by this function. Both checks are always
// evaluated.
simdjson_warn_unused
simdjson_inline bool is_valid_false_atom(const uint8_t *src) {
  const uint32_t mismatch = str4ncmp(src+1, "alse");
  const uint32_t bad_terminator = jsoncharutils::is_not_structural_or_whitespace(src[5]);
  return (mismatch | bad_terminator) == 0;
}
|
|
|
|
// Length-aware variant: with fewer than 5 bytes the atom cannot match, with
// exactly 5 only the letters "alse" at src[1..4] are compared, and with more
// than 5 the terminator after the atom is checked as well.
simdjson_warn_unused
simdjson_inline bool is_valid_false_atom(const uint8_t *src, size_t len) {
  if (len < 5) {
    return false;
  }
  if (len == 5) {
    return !str4ncmp(src+1, "alse");
  }
  return is_valid_false_atom(src);
}
|
|
|
|
// True when src starts with "null" and src[4] is a structural or whitespace
// character. Reads src[0..4]; both checks are always evaluated.
simdjson_warn_unused
simdjson_inline bool is_valid_null_atom(const uint8_t *src) {
  const uint32_t mismatch = str4ncmp(src, "null");
  const uint32_t bad_terminator = jsoncharutils::is_not_structural_or_whitespace(src[4]);
  return (mismatch | bad_terminator) == 0;
}
|
|
|
|
// Length-aware variant: with fewer than 4 bytes the atom cannot match, with
// exactly 4 only the letters are compared, and with more than 4 the
// terminator after the atom is checked as well.
simdjson_warn_unused
simdjson_inline bool is_valid_null_atom(const uint8_t *src, size_t len) {
  if (len < 4) {
    return false;
  }
  if (len == 4) {
    return !str4ncmp(src, "null");
  }
  return is_valid_null_atom(src);
}
|
|
|
|
} // namespace atomparsing
|
|
} // unnamed namespace
|
|
} // namespace arm64
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_GENERIC_ATOMPARSING_H
|
|
/* end file simdjson/generic/atomparsing.h for arm64 */
|
|
/* including simdjson/generic/dom_parser_implementation.h for arm64: #include "simdjson/generic/dom_parser_implementation.h" */
|
|
/* begin file simdjson/generic/dom_parser_implementation.h for arm64 */
|
|
#ifndef SIMDJSON_GENERIC_DOM_PARSER_IMPLEMENTATION_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_GENERIC_DOM_PARSER_IMPLEMENTATION_H */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/generic/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/dom_parser_implementation.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace arm64 {
|
|
|
|
// Bookkeeping for one open scope. Expected to occupy exactly 64 bits
// (two 32-bit fields).
struct open_container {
  /** where, on the tape, the scope ([ or {) begins */
  uint32_t tape_index;
  /** how many elements are in the scope */
  uint32_t count;
}; // struct open_container
|
|
|
|
static_assert(sizeof(open_container) == 64/8, "Open container must be 64 bits");
|
|
|
|
class dom_parser_implementation final : public internal::dom_parser_implementation {
|
|
public:
|
|
/** Tape location of each open { or [ */
|
|
std::unique_ptr<open_container[]> open_containers{};
|
|
/** Whether each open container is a [ or { */
|
|
std::unique_ptr<bool[]> is_array{};
|
|
/** Buffer passed to stage 1 */
|
|
const uint8_t *buf{};
|
|
/** Length passed to stage 1 */
|
|
size_t len{0};
|
|
/** Document passed to stage 2 */
|
|
dom::document *doc{};
|
|
|
|
inline dom_parser_implementation() noexcept;
|
|
inline dom_parser_implementation(dom_parser_implementation &&other) noexcept;
|
|
inline dom_parser_implementation &operator=(dom_parser_implementation &&other) noexcept;
|
|
dom_parser_implementation(const dom_parser_implementation &) = delete;
|
|
dom_parser_implementation &operator=(const dom_parser_implementation &) = delete;
|
|
|
|
simdjson_warn_unused error_code parse(const uint8_t *buf, size_t len, dom::document &doc) noexcept final;
|
|
simdjson_warn_unused error_code stage1(const uint8_t *buf, size_t len, stage1_mode partial) noexcept final;
|
|
simdjson_warn_unused error_code stage2(dom::document &doc) noexcept final;
|
|
simdjson_warn_unused error_code stage2_next(dom::document &doc) noexcept final;
|
|
simdjson_warn_unused uint8_t *parse_string(const uint8_t *src, uint8_t *dst, bool allow_replacement) const noexcept final;
|
|
simdjson_warn_unused uint8_t *parse_wobbly_string(const uint8_t *src, uint8_t *dst) const noexcept final;
|
|
inline simdjson_warn_unused error_code set_capacity(size_t capacity) noexcept final;
|
|
inline simdjson_warn_unused error_code set_max_depth(size_t max_depth) noexcept final;
|
|
private:
|
|
simdjson_inline simdjson_warn_unused error_code set_capacity_stage1(size_t capacity);
|
|
|
|
};
|
|
|
|
} // namespace arm64
|
|
} // namespace simdjson
|
|
|
|
namespace simdjson {
|
|
namespace arm64 {
|
|
|
|
// Default constructor, move constructor and move assignment are all
// compiler-generated; they are only defaulted out of line.
inline dom_parser_implementation::dom_parser_implementation() noexcept = default;

inline dom_parser_implementation::dom_parser_implementation(dom_parser_implementation &&) noexcept = default;

inline dom_parser_implementation &dom_parser_implementation::operator=(dom_parser_implementation &&) noexcept = default;
|
|
|
|
// Leaving these here so they can be inlined if so desired
|
|
// Allocates the stage 1 structural index buffer for documents of up to
// `capacity` bytes.
// Returns CAPACITY when the request exceeds SIMDJSON_MAXSIZE_BYTES (nothing is
// changed in that case), MEMALLOC when the allocation fails (_capacity is then
// reset to 0), and SUCCESS otherwise.
inline simdjson_warn_unused error_code dom_parser_implementation::set_capacity(size_t capacity) noexcept {
  if (capacity > SIMDJSON_MAXSIZE_BYTES) {
    return CAPACITY;
  }

  // Stage 1 index output: capacity rounded up to a multiple of 64, plus 2 + 7
  // extra entries.
  const size_t index_count = SIMDJSON_ROUNDUP_N(capacity, 64) + 2 + 7;
  structural_indexes.reset( new (std::nothrow) uint32_t[index_count] );

  if (structural_indexes) {
    structural_indexes[0] = 0;
    n_structural_indexes = 0;
    _capacity = capacity;
    return SUCCESS;
  }

  // Allocation failed: advertise that no document fits.
  _capacity = 0;
  return MEMALLOC;
}
|
|
|
|
// Allocates the two stage 2 stacks (open_containers and is_array) with
// `max_depth` entries each.
// Returns MEMALLOC and resets _max_depth to 0 if either allocation fails,
// SUCCESS otherwise. Both allocations are always attempted.
inline simdjson_warn_unused error_code dom_parser_implementation::set_max_depth(size_t max_depth) noexcept {
  // Stage 2 stacks
  open_containers.reset(new (std::nothrow) open_container[max_depth]);
  is_array.reset(new (std::nothrow) bool[max_depth]);

  if (is_array && open_containers) {
    _max_depth = max_depth;
    return SUCCESS;
  }

  _max_depth = 0;
  return MEMALLOC;
}
|
|
|
|
} // namespace arm64
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_GENERIC_DOM_PARSER_IMPLEMENTATION_H
|
|
/* end file simdjson/generic/dom_parser_implementation.h for arm64 */
|
|
/* including simdjson/generic/implementation_simdjson_result_base.h for arm64: #include "simdjson/generic/implementation_simdjson_result_base.h" */
|
|
/* begin file simdjson/generic/implementation_simdjson_result_base.h for arm64 */
|
|
#ifndef SIMDJSON_GENERIC_IMPLEMENTATION_SIMDJSON_RESULT_BASE_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_GENERIC_IMPLEMENTATION_SIMDJSON_RESULT_BASE_H */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/generic/base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace arm64 {
|
|
|
|
// This is a near copy of include/error.h's implementation_simdjson_result_base, except it doesn't use std::pair
|
|
// so we can avoid inlining errors
|
|
// TODO reconcile these!
|
|
/**
|
|
* The result of a simdjson operation that could fail.
|
|
*
|
|
* Gives the option of reading error codes, or throwing an exception by casting to the desired result.
|
|
*
|
|
* This is a base class for implementations that want to add functions to the result type for
|
|
* chaining.
|
|
*
|
|
* Override like:
|
|
*
|
|
* struct simdjson_result<T> : public internal::implementation_simdjson_result_base<T> {
|
|
* simdjson_result() noexcept : internal::implementation_simdjson_result_base<T>() {}
|
|
* simdjson_result(error_code error) noexcept : internal::implementation_simdjson_result_base<T>(error) {}
|
|
* simdjson_result(T &&value) noexcept : internal::implementation_simdjson_result_base<T>(std::forward(value)) {}
|
|
* simdjson_result(T &&value, error_code error) noexcept : internal::implementation_simdjson_result_base<T>(value, error) {}
|
|
* // Your extra methods here
|
|
* }
|
|
*
|
|
* Then any method returning simdjson_result<T> will be chainable with your methods.
|
|
*/
|
|
template<typename T>
|
|
struct implementation_simdjson_result_base {
|
|
|
|
/**
|
|
* Create a new empty result with error = UNINITIALIZED.
|
|
*/
|
|
simdjson_inline implementation_simdjson_result_base() noexcept = default;
|
|
|
|
/**
|
|
* Create a new error result.
|
|
*/
|
|
simdjson_inline implementation_simdjson_result_base(error_code error) noexcept;
|
|
|
|
/**
|
|
* Create a new successful result.
|
|
*/
|
|
simdjson_inline implementation_simdjson_result_base(T &&value) noexcept;
|
|
|
|
/**
|
|
* Create a new result with both things (use if you don't want to branch when creating the result).
|
|
*/
|
|
simdjson_inline implementation_simdjson_result_base(T &&value, error_code error) noexcept;
|
|
|
|
/**
|
|
* Move the value and the error to the provided variables.
|
|
*
|
|
* @param value The variable to assign the value to. May not be set if there is an error.
|
|
* @param error The variable to assign the error to. Set to SUCCESS if there is no error.
|
|
*/
|
|
simdjson_inline void tie(T &value, error_code &error) && noexcept;
|
|
|
|
/**
|
|
* Move the value to the provided variable.
|
|
*
|
|
* @param value The variable to assign the value to. May not be set if there is an error.
|
|
*/
|
|
simdjson_inline error_code get(T &value) && noexcept;
|
|
|
|
/**
|
|
* The error.
|
|
*/
|
|
simdjson_inline error_code error() const noexcept;
|
|
|
|
#if SIMDJSON_EXCEPTIONS
|
|
|
|
/**
|
|
* Get the result value.
|
|
*
|
|
* @throw simdjson_error if there was an error.
|
|
*/
|
|
simdjson_inline T& value() & noexcept(false);
|
|
|
|
/**
|
|
* Take the result value (move it).
|
|
*
|
|
* @throw simdjson_error if there was an error.
|
|
*/
|
|
simdjson_inline T&& value() && noexcept(false);
|
|
|
|
/**
|
|
* Take the result value (move it).
|
|
*
|
|
* @throw simdjson_error if there was an error.
|
|
*/
|
|
simdjson_inline T&& take_value() && noexcept(false);
|
|
|
|
/**
|
|
* Cast to the value (will throw on error).
|
|
*
|
|
* @throw simdjson_error if there was an error.
|
|
*/
|
|
simdjson_inline operator T&&() && noexcept(false);
|
|
|
|
|
|
#endif // SIMDJSON_EXCEPTIONS
|
|
|
|
/**
|
|
* Get the result value. This function is safe if and only
|
|
* the error() method returns a value that evaluates to false.
|
|
*/
|
|
simdjson_inline const T& value_unsafe() const& noexcept;
|
|
/**
|
|
* Get the result value. This function is safe if and only
|
|
* the error() method returns a value that evaluates to false.
|
|
*/
|
|
simdjson_inline T& value_unsafe() & noexcept;
|
|
/**
|
|
* Take the result value (move it). This function is safe if and only
|
|
* the error() method returns a value that evaluates to false.
|
|
*/
|
|
simdjson_inline T&& value_unsafe() && noexcept;
|
|
protected:
|
|
/** users should never directly access first and second. **/
|
|
T first{}; /** Users should never directly access 'first'. **/
|
|
error_code second{UNINITIALIZED}; /** Users should never directly access 'second'. **/
|
|
}; // struct implementation_simdjson_result_base
|
|
|
|
} // namespace arm64
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_GENERIC_IMPLEMENTATION_SIMDJSON_RESULT_BASE_H
|
|
/* end file simdjson/generic/implementation_simdjson_result_base.h for arm64 */
|
|
/* including simdjson/generic/numberparsing.h for arm64: #include "simdjson/generic/numberparsing.h" */
|
|
/* begin file simdjson/generic/numberparsing.h for arm64 */
|
|
#ifndef SIMDJSON_GENERIC_NUMBERPARSING_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_GENERIC_NUMBERPARSING_H */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/generic/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/generic/jsoncharutils.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/numberparsing_tables.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
#include <limits>
|
|
#include <ostream>
|
|
#include <cstring>
|
|
|
|
namespace simdjson {
|
|
namespace arm64 {
|
|
namespace numberparsing {
|
|
|
|
#ifdef JSON_TEST_NUMBERS
|
|
#define INVALID_NUMBER(SRC) (found_invalid_number((SRC)), NUMBER_ERROR)
|
|
#define WRITE_INTEGER(VALUE, SRC, WRITER) (found_integer((VALUE), (SRC)), (WRITER).append_s64((VALUE)))
|
|
#define WRITE_UNSIGNED(VALUE, SRC, WRITER) (found_unsigned_integer((VALUE), (SRC)), (WRITER).append_u64((VALUE)))
|
|
#define WRITE_DOUBLE(VALUE, SRC, WRITER) (found_float((VALUE), (SRC)), (WRITER).append_double((VALUE)))
|
|
#define BIGINT_NUMBER(SRC) (found_invalid_number((SRC)), BIGINT_ERROR)
|
|
#else
|
|
#define INVALID_NUMBER(SRC) (NUMBER_ERROR)
|
|
#define WRITE_INTEGER(VALUE, SRC, WRITER) (WRITER).append_s64((VALUE))
|
|
#define WRITE_UNSIGNED(VALUE, SRC, WRITER) (WRITER).append_u64((VALUE))
|
|
#define WRITE_DOUBLE(VALUE, SRC, WRITER) (WRITER).append_double((VALUE))
|
|
#define BIGINT_NUMBER(SRC) (BIGINT_ERROR)
|
|
#endif
|
|
|
|
namespace {
|
|
|
|
// Assembles an ieee64 double from a mantissa, a biased exponent and a sign.
// real_exponent needs to be in [0, 2046] (technically real_exponent = 2047
// would be acceptable). The mantissa should be in [0, 1<<53); its bit at
// index 52 (1ULL << 52) will be zeroed before the exponent is merged in.
simdjson_inline double to_double(uint64_t mantissa, uint64_t real_exponent, bool negative) {
  const uint64_t fraction_bits = mantissa & ~(1ULL << 52);
  const uint64_t exponent_bits = real_exponent << 52;
  const uint64_t sign_bit = static_cast<uint64_t>(negative) << 63;
  const uint64_t bits = fraction_bits | exponent_bits | sign_bit;

  // Reinterpret the 64-bit pattern as a double without aliasing issues.
  double d;
  std::memcpy(&d, &bits, sizeof(d));
  return d;
}
|
|
|
|
// Attempts to compute i * 10^(power) exactly; and if "negative" is
|
|
// true, negate the result.
|
|
// This function will only work in some cases, when it does not work, success is
|
|
// set to false. This should work *most of the time* (like 99% of the time).
|
|
// We assume that power is in the [smallest_power,
|
|
// largest_power] interval: the caller is responsible for this check.
|
|
simdjson_inline bool compute_float_64(int64_t power, uint64_t i, bool negative, double &d) {
|
|
// we start with a fast path
|
|
// It was described in
|
|
// Clinger WD. How to read floating point numbers accurately.
|
|
// ACM SIGPLAN Notices. 1990
|
|
#ifndef FLT_EVAL_METHOD
|
|
#error "FLT_EVAL_METHOD should be defined, please include cfloat."
|
|
#endif
|
|
#if (FLT_EVAL_METHOD != 1) && (FLT_EVAL_METHOD != 0)
|
|
// We cannot be certain that x/y is rounded to nearest.
|
|
if (0 <= power && power <= 22 && i <= 9007199254740991)
|
|
#else
|
|
if (-22 <= power && power <= 22 && i <= 9007199254740991)
|
|
#endif
|
|
{
|
|
// convert the integer into a double. This is lossless since
|
|
// 0 <= i <= 2^53 - 1.
|
|
d = double(i);
|
|
//
|
|
// The general idea is as follows.
|
|
// If 0 <= s < 2^53 and if 10^0 <= p <= 10^22 then
|
|
// 1) Both s and p can be represented exactly as 64-bit floating-point
|
|
// values
|
|
// (binary64).
|
|
// 2) Because s and p can be represented exactly as floating-point values,
|
|
// then s * p
|
|
// and s / p will produce correctly rounded values.
|
|
//
|
|
if (power < 0) {
|
|
d = d / simdjson::internal::power_of_ten[-power];
|
|
} else {
|
|
d = d * simdjson::internal::power_of_ten[power];
|
|
}
|
|
if (negative) {
|
|
d = -d;
|
|
}
|
|
return true;
|
|
}
|
|
// When 22 < power && power < 22 + 16, we could
|
|
// hope for another, secondary fast path. It was
|
|
// described by David M. Gay in "Correctly rounded
|
|
// binary-decimal and decimal-binary conversions." (1990)
|
|
// If you need to compute i * 10^(22 + x) for x < 16,
|
|
// first compute i * 10^x, if you know that result is exact
|
|
// (e.g., when i * 10^x < 2^53),
|
|
// then you can still proceed and do (i * 10^x) * 10^22.
|
|
// Is this worth your time?
|
|
// You need 22 < power *and* power < 22 + 16 *and* (i * 10^(x-22) < 2^53)
|
|
// for this second fast path to work.
|
|
// If you you have 22 < power *and* power < 22 + 16, and then you
|
|
// optimistically compute "i * 10^(x-22)", there is still a chance that you
|
|
// have wasted your time if i * 10^(x-22) >= 2^53. It makes the use cases of
|
|
// this optimization maybe less common than we would like. Source:
|
|
// http://www.exploringbinary.com/fast-path-decimal-to-floating-point-conversion/
|
|
// also used in RapidJSON: https://rapidjson.org/strtod_8h_source.html
|
|
|
|
// The fast path has now failed, so we are failing back on the slower path.
|
|
|
|
// In the slow path, we need to adjust i so that it is > 1<<63 which is always
|
|
// possible, except if i == 0, so we handle i == 0 separately.
|
|
if(i == 0) {
|
|
d = negative ? -0.0 : 0.0;
|
|
return true;
|
|
}
|
|
|
|
|
|
// The exponent is 1024 + 63 + power
|
|
// + floor(log(5**power)/log(2)).
|
|
// The 1024 comes from the ieee64 standard.
|
|
// The 63 comes from the fact that we use a 64-bit word.
|
|
//
|
|
// Computing floor(log(5**power)/log(2)) could be
|
|
// slow. Instead we use a fast function.
|
|
//
|
|
// For power in (-400,350), we have that
|
|
// (((152170 + 65536) * power ) >> 16);
|
|
// is equal to
|
|
// floor(log(5**power)/log(2)) + power when power >= 0
|
|
// and it is equal to
|
|
// ceil(log(5**-power)/log(2)) + power when power < 0
|
|
//
|
|
// The 65536 is (1<<16) and corresponds to
|
|
// (65536 * power) >> 16 ---> power
|
|
//
|
|
// ((152170 * power ) >> 16) is equal to
|
|
// floor(log(5**power)/log(2))
|
|
//
|
|
// Note that this is not magic: 152170/(1<<16) is
|
|
// approximately equal to log(5)/log(2).
|
|
// The 1<<16 value is a power of two; we could use a
|
|
// larger power of 2 if we wanted to.
|
|
//
|
|
int64_t exponent = (((152170 + 65536) * power) >> 16) + 1024 + 63;
|
|
|
|
|
|
// We want the most significant bit of i to be 1. Shift if needed.
|
|
int lz = leading_zeroes(i);
|
|
i <<= lz;
|
|
|
|
|
|
// We are going to need to do some 64-bit arithmetic to get a precise product.
|
|
// We use a table lookup approach.
|
|
// It is safe because
|
|
// power >= smallest_power
|
|
// and power <= largest_power
|
|
// We recover the mantissa of the power, it has a leading 1. It is always
|
|
// rounded down.
|
|
//
|
|
// We want the most significant 64 bits of the product. We know
|
|
// this will be non-zero because the most significant bit of i is
|
|
// 1.
|
|
const uint32_t index = 2 * uint32_t(power - simdjson::internal::smallest_power);
|
|
// Optimization: It may be that materializing the index as a variable might confuse some compilers and prevent effective complex-addressing loads. (Done for code clarity.)
|
|
//
|
|
// The full_multiplication function computes the 128-bit product of two 64-bit words
|
|
// with a returned value of type value128 with a "low component" corresponding to the
|
|
// 64-bit least significant bits of the product and with a "high component" corresponding
|
|
// to the 64-bit most significant bits of the product.
|
|
simdjson::internal::value128 firstproduct = full_multiplication(i, simdjson::internal::power_of_five_128[index]);
|
|
// Both i and power_of_five_128[index] have their most significant bit set to 1 which
|
|
// implies that the either the most or the second most significant bit of the product
|
|
// is 1. We pack values in this manner for efficiency reasons: it maximizes the use
|
|
// we make of the product. It also makes it easy to reason about the product: there
|
|
// is 0 or 1 leading zero in the product.
|
|
|
|
// Unless the least significant 9 bits of the high (64-bit) part of the full
|
|
// product are all 1s, then we know that the most significant 55 bits are
|
|
// exact and no further work is needed. Having 55 bits is necessary because
|
|
// we need 53 bits for the mantissa but we have to have one rounding bit and
|
|
// we can waste a bit if the most significant bit of the product is zero.
|
|
if((firstproduct.high & 0x1FF) == 0x1FF) {
|
|
// We want to compute i * 5^q, but only care about the top 55 bits at most.
|
|
// Consider the scenario where q>=0. Then 5^q may not fit in 64-bits. Doing
|
|
// the full computation is wasteful. So we do what is called a "truncated
|
|
// multiplication".
|
|
// We take the most significant 64-bits, and we put them in
|
|
// power_of_five_128[index]. Usually, that's good enough to approximate i * 5^q
|
|
// to the desired approximation using one multiplication. Sometimes it does not suffice.
|
|
// Then we store the next most significant 64 bits in power_of_five_128[index + 1], and
|
|
// then we get a better approximation to i * 5^q.
|
|
//
|
|
// That's for when q>=0. The logic for q<0 is somewhat similar but it is somewhat
|
|
// more complicated.
|
|
//
|
|
// There is an extra layer of complexity in that we need more than 55 bits of
|
|
// accuracy in the round-to-even scenario.
|
|
//
|
|
// The full_multiplication function computes the 128-bit product of two 64-bit words
|
|
// with a returned value of type value128 with a "low component" corresponding to the
|
|
// 64-bit least significant bits of the product and with a "high component" corresponding
|
|
// to the 64-bit most significant bits of the product.
|
|
simdjson::internal::value128 secondproduct = full_multiplication(i, simdjson::internal::power_of_five_128[index + 1]);
|
|
firstproduct.low += secondproduct.high;
|
|
if(secondproduct.high > firstproduct.low) { firstproduct.high++; }
|
|
// As it has been proven by Noble Mushtak and Daniel Lemire in "Fast Number Parsing Without
|
|
// Fallback" (https://arxiv.org/abs/2212.06644), at this point we are sure that the product
|
|
// is sufficiently accurate, and more computation is not needed.
|
|
}
|
|
uint64_t lower = firstproduct.low;
|
|
uint64_t upper = firstproduct.high;
|
|
// The final mantissa should be 53 bits with a leading 1.
|
|
// We shift it so that it occupies 54 bits with a leading 1.
|
|
///////
|
|
uint64_t upperbit = upper >> 63;
|
|
uint64_t mantissa = upper >> (upperbit + 9);
|
|
lz += int(1 ^ upperbit);
|
|
|
|
// Here we have mantissa < (1<<54).
|
|
int64_t real_exponent = exponent - lz;
|
|
if (simdjson_unlikely(real_exponent <= 0)) { // we have a subnormal?
|
|
// Here have that real_exponent <= 0 so -real_exponent >= 0
|
|
if(-real_exponent + 1 >= 64) { // if we have more than 64 bits below the minimum exponent, you have a zero for sure.
|
|
d = negative ? -0.0 : 0.0;
|
|
return true;
|
|
}
|
|
// next line is safe because 0 < -real_exponent + 1 < 64
|
|
mantissa >>= -real_exponent + 1;
|
|
// Thankfully, we can't have both "round-to-even" and subnormals because
|
|
// "round-to-even" only occurs for powers close to 0.
|
|
mantissa += (mantissa & 1); // round up
|
|
mantissa >>= 1;
|
|
// There is a weird scenario where we don't have a subnormal but just.
|
|
// Suppose we start with 2.2250738585072013e-308, we end up
|
|
// with 0x3fffffffffffff x 2^-1023-53 which is technically subnormal
|
|
// whereas 0x40000000000000 x 2^-1023-53 is normal. Now, we need to round
|
|
// up 0x3fffffffffffff x 2^-1023-53 and once we do, we are no longer
|
|
// subnormal, but we can only know this after rounding.
|
|
// So we only declare a subnormal if we are smaller than the threshold.
|
|
real_exponent = (mantissa < (uint64_t(1) << 52)) ? 0 : 1;
|
|
d = to_double(mantissa, real_exponent, negative);
|
|
return true;
|
|
}
|
|
// We have to round to even. The "to even" part
|
|
// is only a problem when we are right in between two floats
|
|
// which we guard against.
|
|
// If we have lots of trailing zeros, we may fall right between two
|
|
// floating-point values.
|
|
//
|
|
// The round-to-even cases take the form of a number 2m+1 which is in (2^53,2^54]
|
|
// times a power of two. That is, it is right between a number with binary significand
|
|
// m and another number with binary significand m+1; and it must be the case
|
|
// that it cannot be represented by a float itself.
|
|
//
|
|
// We must have that w * 10 ^q == (2m+1) * 2^p for some power of two 2^p.
|
|
// Recall that 10^q = 5^q * 2^q.
|
|
// When q >= 0, we must have that (2m+1) is divisible by 5^q, so 5^q <= 2^54. We have that
|
|
// 5^23 <= 2^54 and it is the last power of five to qualify, so q <= 23.
|
|
// When q<0, we have w >= (2m+1) x 5^{-q}. We must have that w<2^{64} so
|
|
// (2m+1) x 5^{-q} < 2^{64}. We have that 2m+1>2^{53}. Hence, we must have
|
|
// 2^{53} x 5^{-q} < 2^{64}.
|
|
// Hence we have 5^{-q} < 2^{11}$ or q>= -4.
|
|
//
|
|
// We require lower <= 1 and not lower == 0 because we could not prove that
|
|
// that lower == 0 is implied; but we could prove that lower <= 1 is a necessary and sufficient test.
|
|
if (simdjson_unlikely((lower <= 1) && (power >= -4) && (power <= 23) && ((mantissa & 3) == 1))) {
|
|
if((mantissa << (upperbit + 64 - 53 - 2)) == upper) {
|
|
mantissa &= ~1; // flip it so that we do not round up
|
|
}
|
|
}
|
|
|
|
mantissa += mantissa & 1;
|
|
mantissa >>= 1;
|
|
|
|
// Here we have mantissa < (1<<53), unless there was an overflow
|
|
if (mantissa >= (1ULL << 53)) {
|
|
//////////
|
|
// This will happen when parsing values such as 7.2057594037927933e+16
|
|
////////
|
|
mantissa = (1ULL << 52);
|
|
real_exponent++;
|
|
}
|
|
mantissa &= ~(1ULL << 52);
|
|
// we have to check that real_exponent is in range, otherwise we bail out
|
|
if (simdjson_unlikely(real_exponent > 2046)) {
|
|
// We have an infinite value!!! We could actually throw an error here if we could.
|
|
return false;
|
|
}
|
|
d = to_double(mantissa, real_exponent, negative);
|
|
return true;
|
|
}
|
|
|
|
// We call a fallback floating-point parser that might be slow. Note
|
|
// it will accept JSON numbers, but the JSON spec. is more restrictive so
|
|
// before you call parse_float_fallback, you need to have validated the input
|
|
// string with the JSON grammar.
|
|
// It will return an error (false) if the parsed number is infinite.
|
|
// The string parsing itself always succeeds. We know that there is at least
|
|
// one digit.
|
|
static bool parse_float_fallback(const uint8_t *ptr, double *outDouble) {
|
|
*outDouble = simdjson::internal::from_chars(reinterpret_cast<const char *>(ptr));
|
|
// We do not accept infinite values.
|
|
|
|
// Detecting finite values in a portable manner is ridiculously hard, ideally
|
|
// we would want to do:
|
|
// return !std::isfinite(*outDouble);
|
|
// but that mysteriously fails under legacy/old libc++ libraries, see
|
|
// https://github.com/simdjson/simdjson/issues/1286
|
|
//
|
|
// Therefore, fall back to this solution (the extra parens are there
|
|
// to handle that max may be a macro on windows).
|
|
return !(*outDouble > (std::numeric_limits<double>::max)() || *outDouble < std::numeric_limits<double>::lowest());
|
|
}
|
|
|
|
// Bounded variant of the fallback parser: the text to parse is delimited by
// ptr and end_ptr, both of which are forwarded to from_chars.
// Stores the parsed value in *outDouble and returns false if it is infinite.
static bool parse_float_fallback(const uint8_t *ptr, const uint8_t *end_ptr, double *outDouble) {
  const char *const first = reinterpret_cast<const char *>(ptr);
  const char *const last = reinterpret_cast<const char *>(end_ptr);
  *outDouble = simdjson::internal::from_chars(first, last);
  // We do not accept infinite values.
  //
  // Detecting finite values portably is surprisingly hard: std::isfinite
  // mysteriously fails under legacy/old libc++ libraries, see
  // https://github.com/simdjson/simdjson/issues/1286
  // so we compare against the finite range instead. The extra parentheses
  // around max are there because max may be a macro on windows.
  const bool above_range = *outDouble > (std::numeric_limits<double>::max)();
  const bool below_range = *outDouble < std::numeric_limits<double>::lowest();
  return !(above_range || below_range);
}
|
|
|
|
// check quickly whether the next 8 chars are made of digits
|
|
// at a glance, it looks better than Mula's
|
|
// http://0x80.pl/articles/swar-digits-validate.html
|
|
simdjson_inline bool is_made_of_eight_digits_fast(const uint8_t *chars) {
|
|
uint64_t val;
|
|
// this can read up to 7 bytes beyond the buffer size, but we require
|
|
// SIMDJSON_PADDING of padding
|
|
static_assert(7 <= SIMDJSON_PADDING, "SIMDJSON_PADDING must be bigger than 7");
|
|
std::memcpy(&val, chars, 8);
|
|
// a branchy method might be faster:
|
|
// return (( val & 0xF0F0F0F0F0F0F0F0 ) == 0x3030303030303030)
|
|
// && (( (val + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0 ) ==
|
|
// 0x3030303030303030);
|
|
return (((val & 0xF0F0F0F0F0F0F0F0) |
|
|
(((val + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0) >> 4)) ==
|
|
0x3333333333333333);
|
|
}
|
|
|
|
template<typename I>
|
|
SIMDJSON_NO_SANITIZE_UNDEFINED // We deliberately allow overflow here and check later
|
|
simdjson_inline bool parse_digit(const uint8_t c, I &i) {
|
|
const uint8_t digit = static_cast<uint8_t>(c - '0');
|
|
if (digit > 9) {
|
|
return false;
|
|
}
|
|
// PERF NOTE: multiplication by 10 is cheaper than arbitrary integer multiplication
|
|
i = 10 * i + digit; // might overflow, we will handle the overflow later
|
|
return true;
|
|
}
|
|
|
|
// Returns true when c is an ASCII decimal digit ('0' through '9').
simdjson_inline bool is_digit(const uint8_t c) {
  // Characters below '0' wrap around to large values, so they fail the test too.
  const uint8_t offset_from_zero = static_cast<uint8_t>(c - '0');
  return offset_from_zero <= 9;
}
|
|
|
|
// Parse the digits that follow the decimal separator. On entry p points at the
// first character after the separator; the digits are accumulated into i, p is
// advanced past them, and exponent is set to minus the number of digits read.
// Returns INVALID_NUMBER(src) when no digit follows the separator.
simdjson_inline error_code parse_decimal_after_separator(simdjson_unused const uint8_t *const src, const uint8_t *&p, uint64_t &i, int64_t &exponent) {
  // We continue with the fiction that we have an integer. If the
  // floating point number is representable as x * 10^z with an integer x
  // that fits in 53 bits, then we will be able to convert the integer
  // back into a float in a lossless manner.
  const uint8_t *const fraction_begin = p;

#ifdef SIMDJSON_SWAR_NUMBER_PARSING
#if SIMDJSON_SWAR_NUMBER_PARSING
  // This helps if we have lots of decimals, which turns out to be
  // frequent enough.
  if (is_made_of_eight_digits_fast(p)) {
    i = i * 100000000 + parse_eight_digits_unrolled(p);
    p += 8;
  }
#endif // SIMDJSON_SWAR_NUMBER_PARSING
#endif // #ifdef SIMDJSON_SWAR_NUMBER_PARSING
  // Unrolling the first digit makes a small difference on some implementations (e.g. westmere)
  if (parse_digit(*p, i)) { ++p; }
  while (parse_digit(*p, i)) { ++p; }
  exponent = fraction_begin - p;
  // Decimal without digits (123.) is illegal
  if (exponent == 0) {
    return INVALID_NUMBER(src);
  }
  return SUCCESS;
}
|
|
|
|
// Parse the exponent part of a number. On entry p points just past the 'e'/'E';
// an optional sign and the digits are consumed, and the signed value is added
// to exponent. Returns INVALID_NUMBER(src) when no digit is present.
simdjson_inline error_code parse_exponent(simdjson_unused const uint8_t *const src, const uint8_t *&p, int64_t &exponent) {
  // Exp Sign: -123.456e[-]78
  const bool neg_exp = ('-' == *p);
  if (neg_exp || '+' == *p) { ++p; } // Skip + as well

  // Exponent: -123.456e-[78]
  const uint8_t *first_digit = p;
  int64_t magnitude = 0;
  while (parse_digit(*p, magnitude)) { ++p; }
  // It is possible for parse_digit to overflow.
  // In particular, it could overflow to INT64_MIN, and we cannot do - INT64_MIN.
  // Thus we *must* check for possible overflow before we negate magnitude.

  // Performance notes: it may seem like combining the two "simdjson_unlikely checks" below into
  // a single simdjson_unlikely path would be faster. The reasoning is sound, but the compiler may
  // not oblige and may, in fact, generate two distinct paths in any case. It might be
  // possible to do uint64_t(p - first_digit - 1) >= 18 but it could end up trading off
  // instructions for a simdjson_likely branch, an inconclusive gain.

  // If there were no digits, it's an error.
  if (simdjson_unlikely(p == first_digit)) {
    return INVALID_NUMBER(src);
  }
  // We have a valid positive exponent in magnitude at this point, except that
  // it may have overflowed.

  // If there were more than 18 digits, we may have overflowed the integer.
  if (simdjson_unlikely((p - first_digit) > 18)) {
    // Skip leading zeroes: 1e000000000000000000001 is technically valid and does not overflow
    while (*first_digit == '0') { ++first_digit; }
    // 19 digits could overflow int64_t and is kind of absurd anyway. We don't
    // support exponents smaller than -999,999,999,999,999,999 and bigger
    // than 999,999,999,999,999,999.
    // We can truncate.
    // Note that 999999999999999999 is assuredly too large. The maximal ieee64 value before
    // infinity is ~1.8e308. The smallest subnormal is ~5e-324. So, actually, we could
    // truncate at 324.
    // Note that there is no reason to fail per se at this point in time.
    // E.g., 0e999999999999999999999 is a fine number.
    if ((p - first_digit) > 18) { magnitude = 999999999999999999; }
  }
  // At this point, we know that magnitude is a sane, positive, signed integer.
  // It is <= 999,999,999,999,999,999. As long as 'exponent' is in
  // [-8223372036854775808, 8223372036854775808], we won't overflow. Because 'exponent'
  // is bounded in magnitude by the size of the JSON input, we are fine in this universe.
  // To sum it up: the next line should never overflow.
  exponent += (neg_exp ? -magnitude : magnitude);
  return SUCCESS;
}
|
|
|
|
// Scan at most max_length bytes starting at src: skip an optional '-', then
// either a single leading '0' or a run of digits. Returns true when the scan
// reaches the end of the range or stops on a structural/whitespace character,
// and false otherwise (including when nothing follows the sign).
simdjson_inline bool check_if_integer(const uint8_t *const src, size_t max_length) {
  const uint8_t *const srcend = src + max_length;
  const bool negative = (*src == '-'); // we can always read at least one character after the '-'
  const uint8_t *p = src + uint8_t(negative);
  if (p == srcend) { return false; }
  if (*p == '0') {
    // A leading zero must stand alone: only look at the character after it.
    ++p;
  } else {
    while (p != srcend && is_digit(*p)) { ++p; }
  }
  if (p == srcend) { return true; }
  return !jsoncharutils::is_not_structural_or_whitespace(*p);
}
|
|
|
|
// Returns digit_count minus the number of leading '0' and '.' characters found
// at start_digits.
simdjson_inline size_t significant_digits(const uint8_t * start_digits, size_t digit_count) {
  // It is possible that the integer had an overflow.
  // We have to handle the case where we have 0.0000somenumber.
  size_t skipped = 0;
  while ((start_digits[skipped] == '0') || (start_digits[skipped] == '.')) { ++skipped; }
  // we over-decrement by one when there is a '.'
  return digit_count - skipped;
}
|
|
|
|
} // unnamed namespace
|
|
|
|
/** @private */
|
|
static error_code slow_float_parsing(simdjson_unused const uint8_t * src, double* answer) {
|
|
if (parse_float_fallback(src, answer)) {
|
|
return SUCCESS;
|
|
}
|
|
return INVALID_NUMBER(src);
|
|
}
|
|
|
|
/** @private */
|
|
template<typename W>
|
|
simdjson_inline error_code write_float(const uint8_t *const src, bool negative, uint64_t i, const uint8_t * start_digits, size_t digit_count, int64_t exponent, W &writer) {
|
|
// If we frequently had to deal with long strings of digits,
|
|
// we could extend our code by using a 128-bit integer instead
|
|
// of a 64-bit integer. However, this is uncommon in practice.
|
|
//
|
|
// 9999999999999999999 < 2**64 so we can accommodate 19 digits.
|
|
// If we have a decimal separator, then digit_count - 1 is the number of digits, but we
|
|
// may not have a decimal separator!
|
|
if (simdjson_unlikely(digit_count > 19 && significant_digits(start_digits, digit_count) > 19)) {
|
|
// Ok, chances are good that we had an overflow!
|
|
// this is almost never going to get called!!!
|
|
// we start anew, going slowly!!!
|
|
// This will happen in the following examples:
|
|
// 10000000000000000000000000000000000000000000e+308
|
|
// 3.1415926535897932384626433832795028841971693993751
|
|
//
|
|
// NOTE: We do not pass a reference to the to slow_float_parsing. If we passed our writer
|
|
// reference to it, it would force it to be stored in memory, preventing the compiler from
|
|
// picking it apart and putting into registers. i.e. if we pass it as reference,
|
|
// it gets slow.
|
|
double d;
|
|
error_code error = slow_float_parsing(src, &d);
|
|
writer.append_double(d);
|
|
return error;
|
|
}
|
|
// NOTE: it's weird that the simdjson_unlikely() only wraps half the if, but it seems to get slower any other
|
|
// way we've tried: https://github.com/simdjson/simdjson/pull/990#discussion_r448497331
|
|
// To future reader: we'd love if someone found a better way, or at least could explain this result!
|
|
if (simdjson_unlikely(exponent < simdjson::internal::smallest_power) || (exponent > simdjson::internal::largest_power)) {
|
|
//
|
|
// Important: smallest_power is such that it leads to a zero value.
|
|
// Observe that 18446744073709551615e-343 == 0, i.e. (2**64 - 1) e -343 is zero
|
|
// so something x 10^-343 goes to zero, but not so with something x 10^-342.
|
|
static_assert(simdjson::internal::smallest_power <= -342, "smallest_power is not small enough");
|
|
//
|
|
if((exponent < simdjson::internal::smallest_power) || (i == 0)) {
|
|
// E.g. Parse "-0.0e-999" into the same value as "-0.0". See https://en.wikipedia.org/wiki/Signed_zero
|
|
WRITE_DOUBLE(negative ? -0.0 : 0.0, src, writer);
|
|
return SUCCESS;
|
|
} else { // (exponent > largest_power) and (i != 0)
|
|
// We have, for sure, an infinite value and simdjson refuses to parse infinite values.
|
|
return INVALID_NUMBER(src);
|
|
}
|
|
}
|
|
double d;
|
|
if (!compute_float_64(exponent, i, negative, d)) {
|
|
// we are almost never going to get here.
|
|
if (!parse_float_fallback(src, &d)) { return INVALID_NUMBER(src); }
|
|
}
|
|
WRITE_DOUBLE(d, src, writer);
|
|
return SUCCESS;
|
|
}
|
|
|
|
// parse the number at src
|
|
// define JSON_TEST_NUMBERS for unit testing
|
|
//
|
|
// It is assumed that the number is followed by a structural ({,},],[) character
|
|
// or a white space character. If that is not the case (e.g., when the JSON
|
|
// document is made of a single number), then it is necessary to copy the
|
|
// content and append a space before calling this function.
|
|
//
|
|
// Our objective is accurate parsing (ULP of 0) at high speed.
|
|
template<typename W>
|
|
simdjson_inline error_code parse_number(const uint8_t *const src, W &writer);
|
|
|
|
// for performance analysis, it is sometimes useful to skip parsing
|
|
#ifdef SIMDJSON_SKIPNUMBERPARSING
|
|
|
|
template<typename W>
|
|
simdjson_inline error_code parse_number(const uint8_t *const, W &writer) {
|
|
writer.append_s64(0); // always write zero
|
|
return SUCCESS; // always succeeds
|
|
}
|
|
|
|
simdjson_unused simdjson_inline simdjson_result<uint64_t> parse_unsigned(const uint8_t * const src) noexcept { return 0; }
|
|
simdjson_unused simdjson_inline simdjson_result<int64_t> parse_integer(const uint8_t * const src) noexcept { return 0; }
|
|
simdjson_unused simdjson_inline simdjson_result<double> parse_double(const uint8_t * const src) noexcept { return 0; }
|
|
simdjson_unused simdjson_inline simdjson_result<uint64_t> parse_unsigned_in_string(const uint8_t * const src) noexcept { return 0; }
|
|
simdjson_unused simdjson_inline simdjson_result<int64_t> parse_integer_in_string(const uint8_t * const src) noexcept { return 0; }
|
|
simdjson_unused simdjson_inline simdjson_result<double> parse_double_in_string(const uint8_t * const src) noexcept { return 0; }
|
|
simdjson_unused simdjson_inline bool is_negative(const uint8_t * src) noexcept { return false; }
|
|
simdjson_unused simdjson_inline simdjson_result<bool> is_integer(const uint8_t * src) noexcept { return false; }
|
|
simdjson_unused simdjson_inline simdjson_result<number_type> get_number_type(const uint8_t * src) noexcept { return number_type::signed_integer; }
|
|
#else
|
|
|
|
// parse the number at src
|
|
// define JSON_TEST_NUMBERS for unit testing
|
|
//
|
|
// It is assumed that the number is followed by a structural ({,},],[) character
|
|
// or a white space character. If that is not the case (e.g., when the JSON
|
|
// document is made of a single number), then it is necessary to copy the
|
|
// content and append a space before calling this function.
|
|
//
|
|
// Our objective is accurate parsing (ULP of 0) at high speed.
|
|
template<typename W>
|
|
simdjson_inline error_code parse_number(const uint8_t *const src, W &writer) {
|
|
|
|
//
|
|
// Check for minus sign
|
|
//
|
|
bool negative = (*src == '-');
|
|
const uint8_t *p = src + uint8_t(negative);
|
|
|
|
//
|
|
// Parse the integer part.
|
|
//
|
|
// PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
|
|
const uint8_t *const start_digits = p;
|
|
uint64_t i = 0;
|
|
while (parse_digit(*p, i)) { p++; }
|
|
|
|
// If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
|
|
// Optimization note: size_t is expected to be unsigned.
|
|
size_t digit_count = size_t(p - start_digits);
|
|
if (digit_count == 0 || ('0' == *start_digits && digit_count > 1)) { return INVALID_NUMBER(src); }
|
|
|
|
//
|
|
// Handle floats if there is a . or e (or both)
|
|
//
|
|
int64_t exponent = 0;
|
|
bool is_float = false;
|
|
if ('.' == *p) {
|
|
is_float = true;
|
|
++p;
|
|
SIMDJSON_TRY( parse_decimal_after_separator(src, p, i, exponent) );
|
|
digit_count = int(p - start_digits); // used later to guard against overflows
|
|
}
|
|
if (('e' == *p) || ('E' == *p)) {
|
|
is_float = true;
|
|
++p;
|
|
SIMDJSON_TRY( parse_exponent(src, p, exponent) );
|
|
}
|
|
if (is_float) {
|
|
const bool dirty_end = jsoncharutils::is_not_structural_or_whitespace(*p);
|
|
SIMDJSON_TRY( write_float(src, negative, i, start_digits, digit_count, exponent, writer) );
|
|
if (dirty_end) { return INVALID_NUMBER(src); }
|
|
return SUCCESS;
|
|
}
|
|
|
|
// The longest negative 64-bit number is 19 digits.
|
|
// The longest positive 64-bit number is 20 digits.
|
|
// We do it this way so we don't trigger this branch unless we must.
|
|
size_t longest_digit_count = negative ? 19 : 20;
|
|
if (digit_count > longest_digit_count) { return BIGINT_NUMBER(src); }
|
|
if (digit_count == longest_digit_count) {
|
|
if (negative) {
|
|
// Anything negative above INT64_MAX+1 is invalid
|
|
if (i > uint64_t(INT64_MAX)+1) { return BIGINT_NUMBER(src); }
|
|
WRITE_INTEGER(~i+1, src, writer);
|
|
if (jsoncharutils::is_not_structural_or_whitespace(*p)) { return INVALID_NUMBER(src); }
|
|
return SUCCESS;
|
|
// Positive overflow check:
|
|
// - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the
|
|
// biggest uint64_t.
|
|
// - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX.
|
|
// If we got here, it's a 20 digit number starting with the digit "1".
|
|
// - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller
|
|
// than 1,553,255,926,290,448,384.
|
|
// - That is smaller than the smallest possible 20-digit number the user could write:
|
|
// 10,000,000,000,000,000,000.
|
|
// - Therefore, if the number is positive and lower than that, it's overflow.
|
|
// - The value we are looking at is less than or equal to INT64_MAX.
|
|
//
|
|
} else if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INVALID_NUMBER(src); }
|
|
}
|
|
|
|
// Write unsigned if it does not fit in a signed integer.
|
|
if (i > uint64_t(INT64_MAX)) {
|
|
WRITE_UNSIGNED(i, src, writer);
|
|
} else {
|
|
WRITE_INTEGER(negative ? (~i+1) : i, src, writer);
|
|
}
|
|
if (jsoncharutils::is_not_structural_or_whitespace(*p)) { return INVALID_NUMBER(src); }
|
|
return SUCCESS;
|
|
}
|
|
|
|
// Inlineable functions
|
|
namespace {
|
|
|
|
// This table can be used to characterize the final character of an integer
|
|
// string. For JSON structural character and allowable white space characters,
|
|
// we return SUCCESS. For 'e', '.' and 'E', we return INCORRECT_TYPE. Otherwise
|
|
// we return NUMBER_ERROR.
|
|
// Optimization note: we could easily reduce the size of the table by half (to 128)
|
|
// at the cost of an extra branch.
|
|
// Optimization note: we want the values to use at most 8 bits (not, e.g., 32 bits):
|
|
static_assert(error_code(uint8_t(NUMBER_ERROR))== NUMBER_ERROR, "bad NUMBER_ERROR cast");
|
|
static_assert(error_code(uint8_t(SUCCESS))== SUCCESS, "bad NUMBER_ERROR cast");
|
|
static_assert(error_code(uint8_t(INCORRECT_TYPE))== INCORRECT_TYPE, "bad NUMBER_ERROR cast");
|
|
|
|
// Indexed by the byte that follows an integer string; eight entries per row.
// SUCCESS:        '\t' '\n' '\r' ' ' ',' ':' '[' ']' '{' '}'
// INCORRECT_TYPE: '.' 'E' 'e'
// NUMBER_ERROR:   every other byte
const uint8_t integer_string_finisher[256] = {
    // 0x00 - 0x07
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    // 0x08 - 0x0F: '\t' (0x09), '\n' (0x0A), '\r' (0x0D)
    NUMBER_ERROR, SUCCESS, SUCCESS, NUMBER_ERROR, NUMBER_ERROR, SUCCESS, NUMBER_ERROR, NUMBER_ERROR,
    // 0x10 - 0x17
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    // 0x18 - 0x1F
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    // 0x20 - 0x27: ' ' (0x20)
    SUCCESS, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    // 0x28 - 0x2F: ',' (0x2C), '.' (0x2E)
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, SUCCESS, NUMBER_ERROR, INCORRECT_TYPE, NUMBER_ERROR,
    // 0x30 - 0x37
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    // 0x38 - 0x3F: ':' (0x3A)
    NUMBER_ERROR, NUMBER_ERROR, SUCCESS, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    // 0x40 - 0x47: 'E' (0x45)
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, INCORRECT_TYPE, NUMBER_ERROR, NUMBER_ERROR,
    // 0x48 - 0x4F
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    // 0x50 - 0x57
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    // 0x58 - 0x5F: '[' (0x5B), ']' (0x5D)
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, SUCCESS, NUMBER_ERROR, SUCCESS, NUMBER_ERROR, NUMBER_ERROR,
    // 0x60 - 0x67: 'e' (0x65)
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, INCORRECT_TYPE, NUMBER_ERROR, NUMBER_ERROR,
    // 0x68 - 0x6F
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    // 0x70 - 0x77
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    // 0x78 - 0x7F: '{' (0x7B), '}' (0x7D)
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, SUCCESS, NUMBER_ERROR, SUCCESS, NUMBER_ERROR, NUMBER_ERROR,
    // 0x80 - 0x87
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    // 0x88 - 0x8F
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    // 0x90 - 0x97
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    // 0x98 - 0x9F
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    // 0xA0 - 0xA7
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    // 0xA8 - 0xAF
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    // 0xB0 - 0xB7
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    // 0xB8 - 0xBF
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    // 0xC0 - 0xC7
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    // 0xC8 - 0xCF
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    // 0xD0 - 0xD7
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    // 0xD8 - 0xDF
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    // 0xE0 - 0xE7
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    // 0xE8 - 0xEF
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    // 0xF0 - 0xF7
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    // 0xF8 - 0xFF
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR};
|
|
|
|
// Parse any number from 0 to 18,446,744,073,709,551,615
|
|
simdjson_unused simdjson_inline simdjson_result<uint64_t> parse_unsigned(const uint8_t * const src) noexcept {
|
|
const uint8_t *p = src;
|
|
//
|
|
// Parse the integer part.
|
|
//
|
|
// PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
|
|
const uint8_t *const start_digits = p;
|
|
uint64_t i = 0;
|
|
while (parse_digit(*p, i)) { p++; }
|
|
|
|
// If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
|
|
// Optimization note: size_t is expected to be unsigned.
|
|
size_t digit_count = size_t(p - start_digits);
|
|
// The longest positive 64-bit number is 20 digits.
|
|
// We do it this way so we don't trigger this branch unless we must.
|
|
// Optimization note: the compiler can probably merge
|
|
// ((digit_count == 0) || (digit_count > 20))
|
|
// into a single branch since digit_count is unsigned.
|
|
if ((digit_count == 0) || (digit_count > 20)) { return INCORRECT_TYPE; }
|
|
// Here digit_count > 0.
|
|
if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; }
|
|
// We can do the following...
|
|
// if (!jsoncharutils::is_structural_or_whitespace(*p)) {
|
|
// return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR;
|
|
// }
|
|
// as a single table lookup:
|
|
if (integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); }
|
|
|
|
if (digit_count == 20) {
|
|
// Positive overflow check:
|
|
// - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the
|
|
// biggest uint64_t.
|
|
// - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX.
|
|
// If we got here, it's a 20 digit number starting with the digit "1".
|
|
// - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller
|
|
// than 1,553,255,926,290,448,384.
|
|
// - That is smaller than the smallest possible 20-digit number the user could write:
|
|
// 10,000,000,000,000,000,000.
|
|
// - Therefore, if the number is positive and lower than that, it's overflow.
|
|
// - The value we are looking at is less than or equal to INT64_MAX.
|
|
//
|
|
if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INCORRECT_TYPE; }
|
|
}
|
|
|
|
return i;
|
|
}
|
|
|
|
|
|
// Parse any number from 0 to 18,446,744,073,709,551,615
|
|
// Never read at src_end or beyond
|
|
simdjson_unused simdjson_inline simdjson_result<uint64_t> parse_unsigned(const uint8_t * const src, const uint8_t * const src_end) noexcept {
|
|
const uint8_t *p = src;
|
|
//
|
|
// Parse the integer part.
|
|
//
|
|
// PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
|
|
const uint8_t *const start_digits = p;
|
|
uint64_t i = 0;
|
|
while ((p != src_end) && parse_digit(*p, i)) { p++; }
|
|
|
|
// If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
|
|
// Optimization note: size_t is expected to be unsigned.
|
|
size_t digit_count = size_t(p - start_digits);
|
|
// The longest positive 64-bit number is 20 digits.
|
|
// We do it this way so we don't trigger this branch unless we must.
|
|
// Optimization note: the compiler can probably merge
|
|
// ((digit_count == 0) || (digit_count > 20))
|
|
// into a single branch since digit_count is unsigned.
|
|
if ((digit_count == 0) || (digit_count > 20)) { return INCORRECT_TYPE; }
|
|
// Here digit_count > 0.
|
|
if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; }
|
|
// We can do the following...
|
|
// if (!jsoncharutils::is_structural_or_whitespace(*p)) {
|
|
// return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR;
|
|
// }
|
|
// as a single table lookup:
|
|
if ((p != src_end) && integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); }
|
|
|
|
if (digit_count == 20) {
|
|
// Positive overflow check:
|
|
// - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the
|
|
// biggest uint64_t.
|
|
// - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX.
|
|
// If we got here, it's a 20 digit number starting with the digit "1".
|
|
// - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller
|
|
// than 1,553,255,926,290,448,384.
|
|
// - That is smaller than the smallest possible 20-digit number the user could write:
|
|
// 10,000,000,000,000,000,000.
|
|
// - Therefore, if the number is positive and lower than that, it's overflow.
|
|
// - The value we are looking at is less than or equal to INT64_MAX.
|
|
//
|
|
if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INCORRECT_TYPE; }
|
|
}
|
|
|
|
return i;
|
|
}
|
|
|
|
// Parse any number from 0 to 18,446,744,073,709,551,615
|
|
simdjson_unused simdjson_inline simdjson_result<uint64_t> parse_unsigned_in_string(const uint8_t * const src) noexcept {
|
|
const uint8_t *p = src + 1;
|
|
//
|
|
// Parse the integer part.
|
|
//
|
|
// PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
|
|
const uint8_t *const start_digits = p;
|
|
uint64_t i = 0;
|
|
while (parse_digit(*p, i)) { p++; }
|
|
|
|
// If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
|
|
// Optimization note: size_t is expected to be unsigned.
|
|
size_t digit_count = size_t(p - start_digits);
|
|
// The longest positive 64-bit number is 20 digits.
|
|
// We do it this way so we don't trigger this branch unless we must.
|
|
// Optimization note: the compiler can probably merge
|
|
// ((digit_count == 0) || (digit_count > 20))
|
|
// into a single branch since digit_count is unsigned.
|
|
if ((digit_count == 0) || (digit_count > 20)) { return INCORRECT_TYPE; }
|
|
// Here digit_count > 0.
|
|
if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; }
|
|
// We can do the following...
|
|
// if (!jsoncharutils::is_structural_or_whitespace(*p)) {
|
|
// return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR;
|
|
// }
|
|
// as a single table lookup:
|
|
if (*p != '"') { return NUMBER_ERROR; }
|
|
|
|
if (digit_count == 20) {
|
|
// Positive overflow check:
|
|
// - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the
|
|
// biggest uint64_t.
|
|
// - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX.
|
|
// If we got here, it's a 20 digit number starting with the digit "1".
|
|
// - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller
|
|
// than 1,553,255,926,290,448,384.
|
|
// - That is smaller than the smallest possible 20-digit number the user could write:
|
|
// 10,000,000,000,000,000,000.
|
|
// - Therefore, if the number is positive and lower than that, it's overflow.
|
|
// - The value we are looking at is less than or equal to INT64_MAX.
|
|
//
|
|
// Note: we use src[1] and not src[0] because src[0] is the quote character in this
|
|
// instance.
|
|
if (src[1] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INCORRECT_TYPE; }
|
|
}
|
|
|
|
return i;
|
|
}
|
|
|
|
// Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
|
|
simdjson_unused simdjson_inline simdjson_result<int64_t> parse_integer(const uint8_t *src) noexcept {
|
|
//
|
|
// Check for minus sign
|
|
//
|
|
bool negative = (*src == '-');
|
|
const uint8_t *p = src + uint8_t(negative);
|
|
|
|
//
|
|
// Parse the integer part.
|
|
//
|
|
// PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
|
|
const uint8_t *const start_digits = p;
|
|
uint64_t i = 0;
|
|
while (parse_digit(*p, i)) { p++; }
|
|
|
|
// If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
|
|
// Optimization note: size_t is expected to be unsigned.
|
|
size_t digit_count = size_t(p - start_digits);
|
|
// We go from
|
|
// -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
|
|
// so we can never represent numbers that have more than 19 digits.
|
|
size_t longest_digit_count = 19;
|
|
// Optimization note: the compiler can probably merge
|
|
// ((digit_count == 0) || (digit_count > longest_digit_count))
|
|
// into a single branch since digit_count is unsigned.
|
|
if ((digit_count == 0) || (digit_count > longest_digit_count)) { return INCORRECT_TYPE; }
|
|
// Here digit_count > 0.
|
|
if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; }
|
|
// We can do the following...
|
|
// if (!jsoncharutils::is_structural_or_whitespace(*p)) {
|
|
// return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR;
|
|
// }
|
|
// as a single table lookup:
|
|
if(integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); }
|
|
// Negative numbers have can go down to - INT64_MAX - 1 whereas positive numbers are limited to INT64_MAX.
|
|
// Performance note: This check is only needed when digit_count == longest_digit_count but it is
|
|
// so cheap that we might as well always make it.
|
|
if(i > uint64_t(INT64_MAX) + uint64_t(negative)) { return INCORRECT_TYPE; }
|
|
return negative ? (~i+1) : i;
|
|
}
|
|
|
|
// Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
|
|
// Never read at src_end or beyond
|
|
simdjson_unused simdjson_inline simdjson_result<int64_t> parse_integer(const uint8_t * const src, const uint8_t * const src_end) noexcept {
|
|
//
|
|
// Check for minus sign
|
|
//
|
|
if(src == src_end) { return NUMBER_ERROR; }
|
|
bool negative = (*src == '-');
|
|
const uint8_t *p = src + uint8_t(negative);
|
|
|
|
//
|
|
// Parse the integer part.
|
|
//
|
|
// PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
|
|
const uint8_t *const start_digits = p;
|
|
uint64_t i = 0;
|
|
while ((p != src_end) && parse_digit(*p, i)) { p++; }
|
|
|
|
// If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
|
|
// Optimization note: size_t is expected to be unsigned.
|
|
size_t digit_count = size_t(p - start_digits);
|
|
// We go from
|
|
// -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
|
|
// so we can never represent numbers that have more than 19 digits.
|
|
size_t longest_digit_count = 19;
|
|
// Optimization note: the compiler can probably merge
|
|
// ((digit_count == 0) || (digit_count > longest_digit_count))
|
|
// into a single branch since digit_count is unsigned.
|
|
if ((digit_count == 0) || (digit_count > longest_digit_count)) { return INCORRECT_TYPE; }
|
|
// Here digit_count > 0.
|
|
if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; }
|
|
// We can do the following...
|
|
// if (!jsoncharutils::is_structural_or_whitespace(*p)) {
|
|
// return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR;
|
|
// }
|
|
// as a single table lookup:
|
|
if((p != src_end) && integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); }
|
|
// Negative numbers have can go down to - INT64_MAX - 1 whereas positive numbers are limited to INT64_MAX.
|
|
// Performance note: This check is only needed when digit_count == longest_digit_count but it is
|
|
// so cheap that we might as well always make it.
|
|
if(i > uint64_t(INT64_MAX) + uint64_t(negative)) { return INCORRECT_TYPE; }
|
|
return negative ? (~i+1) : i;
|
|
}
|
|
|
|
// Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
|
|
simdjson_unused simdjson_inline simdjson_result<int64_t> parse_integer_in_string(const uint8_t *src) noexcept {
|
|
//
|
|
// Check for minus sign
|
|
//
|
|
bool negative = (*(src + 1) == '-');
|
|
src += uint8_t(negative) + 1;
|
|
|
|
//
|
|
// Parse the integer part.
|
|
//
|
|
// PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
|
|
const uint8_t *const start_digits = src;
|
|
uint64_t i = 0;
|
|
while (parse_digit(*src, i)) { src++; }
|
|
|
|
// If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
|
|
// Optimization note: size_t is expected to be unsigned.
|
|
size_t digit_count = size_t(src - start_digits);
|
|
// We go from
|
|
// -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
|
|
// so we can never represent numbers that have more than 19 digits.
|
|
size_t longest_digit_count = 19;
|
|
// Optimization note: the compiler can probably merge
|
|
// ((digit_count == 0) || (digit_count > longest_digit_count))
|
|
// into a single branch since digit_count is unsigned.
|
|
if ((digit_count == 0) || (digit_count > longest_digit_count)) { return INCORRECT_TYPE; }
|
|
// Here digit_count > 0.
|
|
if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; }
|
|
// We can do the following...
|
|
// if (!jsoncharutils::is_structural_or_whitespace(*src)) {
|
|
// return (*src == '.' || *src == 'e' || *src == 'E') ? INCORRECT_TYPE : NUMBER_ERROR;
|
|
// }
|
|
// as a single table lookup:
|
|
if(*src != '"') { return NUMBER_ERROR; }
|
|
// Negative numbers have can go down to - INT64_MAX - 1 whereas positive numbers are limited to INT64_MAX.
|
|
// Performance note: This check is only needed when digit_count == longest_digit_count but it is
|
|
// so cheap that we might as well always make it.
|
|
if(i > uint64_t(INT64_MAX) + uint64_t(negative)) { return INCORRECT_TYPE; }
|
|
return negative ? (~i+1) : i;
|
|
}
|
|
|
|
// Parses a JSON number starting at src as a double.
//
// The digits of the integer and fractional parts are accumulated into a 64-bit
// mantissa together with a decimal exponent; compute_float_64 assembles the
// result when the mantissa cannot have overflowed and the exponent lies within
// [smallest_power, largest_power], otherwise parse_float_fallback re-parses
// the text.
//
// Returns INCORRECT_TYPE when there are no integer digits; NUMBER_ERROR for a
// leading zero followed by more digits, a '.' without digits, an exponent with
// no digits or more than 19 digits, a terminating byte that is neither
// structural nor whitespace, or a fallback failure.
simdjson_unused simdjson_inline simdjson_result<double> parse_double(const uint8_t * src) noexcept {
  // Optional minus sign.
  const bool negative = (*src == '-');
  const uint8_t *const first_digit = src + uint8_t(negative);

  // Integer part.
  uint64_t mantissa = 0;
  const uint8_t *cursor = first_digit;
  cursor += parse_digit(*cursor, mantissa);
  const bool leading_zero = (mantissa == 0);
  while (parse_digit(*cursor, mantissa)) { ++cursor; }
  // No integer digits at all.
  if (cursor == first_digit) { return INCORRECT_TYPE; }
  // 0123 is invalid: a leading zero must stand alone.
  if (leading_zero && (cursor != first_digit + 1)) { return NUMBER_ERROR; }

  // Fractional part.
  int64_t exponent = 0;
  bool overflow;
  if (simdjson_likely(*cursor == '.')) {
    ++cursor;
    const uint8_t *const fraction_start = cursor;
    if (!parse_digit(*cursor, mantissa)) { return NUMBER_ERROR; } // no decimal digits
    ++cursor;
    while (parse_digit(*cursor, mantissa)) { ++cursor; }
    // Each fractional digit lowers the decimal exponent by one.
    exponent = fraction_start - cursor;

    // More than 19 digits (not counting the decimal point) may have
    // overflowed the mantissa.
    overflow = (cursor - first_digit - 1) > 19;
    if (simdjson_unlikely(overflow && leading_zero)) {
      // Leading zeros ("0.00000...") contribute nothing to the mantissa:
      // skip them and check whether the remaining digits still overflow.
      const uint8_t *significant = first_digit + 2;
      while (*significant == '0') { ++significant; }
      overflow = (cursor - significant) > 19;
    }
  } else {
    overflow = (cursor - first_digit) > 19;
  }

  // Exponent part.
  if ((*cursor == 'e') || (*cursor == 'E')) {
    ++cursor;
    const bool exp_negative = (*cursor == '-');
    cursor += (exp_negative || (*cursor == '+'));

    uint64_t exp_value = 0;
    const uint8_t *const exp_start = cursor;
    while (parse_digit(*cursor, exp_value)) { ++cursor; }
    // No exponent digits, or 20+ exponent digits.
    const auto exp_digits = cursor - exp_start;
    if ((exp_digits == 0) || (exp_digits > 19)) { return NUMBER_ERROR; }

    exponent += exp_negative ? 0-exp_value : exp_value;
  }

  if (jsoncharutils::is_not_structural_or_whitespace(*cursor)) { return NUMBER_ERROR; }

  overflow = overflow || (exponent < simdjson::internal::smallest_power) || (exponent > simdjson::internal::largest_power);

  // Assemble the double on the fast path, or fall back to the slow parser.
  double d;
  if (simdjson_likely(!overflow)) {
    if (compute_float_64(exponent, mantissa, negative, d)) { return d; }
  }
  // The fallback re-parses from the very beginning, sign included.
  if (!parse_float_fallback(src, &d)) {
    return NUMBER_ERROR;
  }
  return d;
}
|
|
|
|
// Reports whether the number text at src begins with a minus sign.
simdjson_unused simdjson_inline bool is_negative(const uint8_t * src) noexcept {
  return src[0] == '-';
}
|
|
|
|
// Reports whether the number text at src is an integer: an optional '-',
// one or more digits, then a structural or whitespace byte.
//
// Returns NUMBER_ERROR when there are no digits, true when the digits are
// followed by a structural or whitespace byte, and false otherwise.
simdjson_unused simdjson_inline simdjson_result<bool> is_integer(const uint8_t * src) noexcept {
  const bool negative = (*src == '-');
  const uint8_t *const first_digit = src + uint8_t(negative);

  // A byte is a digit exactly when (byte - '0'), reduced to uint8_t, is <= 9.
  const uint8_t *cursor = first_digit;
  while (static_cast<uint8_t>(*cursor - '0') <= 9) { ++cursor; }

  if (cursor == first_digit) { return NUMBER_ERROR; }
  if (!jsoncharutils::is_structural_or_whitespace(*cursor)) { return false; }
  return true;
}
|
|
|
|
// Classifies the number text at src without computing its value.
//
// Returns NUMBER_ERROR when there are no digits after the optional '-'.
// When the digits are followed by a structural or whitespace byte the number
// is an integer, classified by comparing its decimal text with 2**63 and
// 2**64:
//  - signed_integer:   fits in int64_t;
//  - unsigned_integer: positive, >= 2**63 and < 2**64;
//  - big_integer:      anything outside those ranges.
// Any other terminating byte yields floating_point_number.
simdjson_unused simdjson_inline simdjson_result<number_type> get_number_type(const uint8_t * src) noexcept {
  // Decimal text of 2**63 (19 digits) and 2**64 (20 digits). For digit runs of
  // equal length, memcmp order matches numeric order.
  const char *const two_to_sixtythree = "9223372036854775808";
  const char *const two_to_sixtyfour = "18446744073709551616";

  const bool negative = (*src == '-');
  const uint8_t *const first_digit = src + uint8_t(negative);

  // A byte is a digit exactly when (byte - '0'), reduced to uint8_t, is <= 9.
  const uint8_t *cursor = first_digit;
  while (static_cast<uint8_t>(*cursor - '0') <= 9) { ++cursor; }
  const size_t digit_count = size_t(cursor - first_digit);
  if (cursor == first_digit) { return NUMBER_ERROR; }

  // Hopefully, we have 'e' or 'E' or '.'.
  if (!jsoncharutils::is_structural_or_whitespace(*cursor)) {
    return number_type::floating_point_number;
  }

  // We have an integer. More than 20 digits never fits in 64 bits.
  if (simdjson_unlikely(digit_count > 20)) {
    return number_type::big_integer;
  }

  // A negative number that is valid must be a signed integer; its magnitude
  // may be at most 2**63.
  if (negative) {
    if (simdjson_unlikely(digit_count > 19)) { return number_type::big_integer; }
    if (simdjson_unlikely(digit_count == 19 && memcmp(first_digit, two_to_sixtythree, 19) > 0)) {
      return number_type::big_integer;
    }
    return number_type::signed_integer;
  }

  // Positive numbers >= 2**64 are big integers.
  if ((digit_count == 20) && (memcmp(first_digit, two_to_sixtyfour, 20) >= 0)) {
    return number_type::big_integer;
  }

  // The number is positive and smaller than 2**64. Values >= 2**63 are
  // unsigned integers; everything below is a signed integer.
  if ((digit_count == 20) || (digit_count >= 19 && memcmp(first_digit, two_to_sixtythree, 19) >= 0)) {
    return number_type::unsigned_integer;
  }
  return number_type::signed_integer;
}
|
|
|
|
// Never read at src_end or beyond
|
|
simdjson_unused simdjson_inline simdjson_result<double> parse_double(const uint8_t * src, const uint8_t * const src_end) noexcept {
|
|
if(src == src_end) { return NUMBER_ERROR; }
|
|
//
|
|
// Check for minus sign
|
|
//
|
|
bool negative = (*src == '-');
|
|
src += uint8_t(negative);
|
|
|
|
//
|
|
// Parse the integer part.
|
|
//
|
|
uint64_t i = 0;
|
|
const uint8_t *p = src;
|
|
if(p == src_end) { return NUMBER_ERROR; }
|
|
p += parse_digit(*p, i);
|
|
bool leading_zero = (i == 0);
|
|
while ((p != src_end) && parse_digit(*p, i)) { p++; }
|
|
// no integer digits, or 0123 (zero must be solo)
|
|
if ( p == src ) { return INCORRECT_TYPE; }
|
|
if ( (leading_zero && p != src+1)) { return NUMBER_ERROR; }
|
|
|
|
//
|
|
// Parse the decimal part.
|
|
//
|
|
int64_t exponent = 0;
|
|
bool overflow;
|
|
if (simdjson_likely((p != src_end) && (*p == '.'))) {
|
|
p++;
|
|
const uint8_t *start_decimal_digits = p;
|
|
if ((p == src_end) || !parse_digit(*p, i)) { return NUMBER_ERROR; } // no decimal digits
|
|
p++;
|
|
while ((p != src_end) && parse_digit(*p, i)) { p++; }
|
|
exponent = -(p - start_decimal_digits);
|
|
|
|
// Overflow check. More than 19 digits (minus the decimal) may be overflow.
|
|
overflow = p-src-1 > 19;
|
|
if (simdjson_unlikely(overflow && leading_zero)) {
|
|
// Skip leading 0.00000 and see if it still overflows
|
|
const uint8_t *start_digits = src + 2;
|
|
while (*start_digits == '0') { start_digits++; }
|
|
overflow = start_digits-src > 19;
|
|
}
|
|
} else {
|
|
overflow = p-src > 19;
|
|
}
|
|
|
|
//
|
|
// Parse the exponent
|
|
//
|
|
if ((p != src_end) && (*p == 'e' || *p == 'E')) {
|
|
p++;
|
|
if(p == src_end) { return NUMBER_ERROR; }
|
|
bool exp_neg = *p == '-';
|
|
p += exp_neg || *p == '+';
|
|
|
|
uint64_t exp = 0;
|
|
const uint8_t *start_exp_digits = p;
|
|
while ((p != src_end) && parse_digit(*p, exp)) { p++; }
|
|
// no exp digits, or 20+ exp digits
|
|
if (p-start_exp_digits == 0 || p-start_exp_digits > 19) { return NUMBER_ERROR; }
|
|
|
|
exponent += exp_neg ? 0-exp : exp;
|
|
}
|
|
|
|
if ((p != src_end) && jsoncharutils::is_not_structural_or_whitespace(*p)) { return NUMBER_ERROR; }
|
|
|
|
overflow = overflow || exponent < simdjson::internal::smallest_power || exponent > simdjson::internal::largest_power;
|
|
|
|
//
|
|
// Assemble (or slow-parse) the float
|
|
//
|
|
double d;
|
|
if (simdjson_likely(!overflow)) {
|
|
if (compute_float_64(exponent, i, negative, d)) { return d; }
|
|
}
|
|
if (!parse_float_fallback(src - uint8_t(negative), src_end, &d)) {
|
|
return NUMBER_ERROR;
|
|
}
|
|
return d;
|
|
}
|
|
|
|
// Parses a JSON number written inside a JSON string as a double. src[0] is
// skipped (it is the opening quote character) and the number must be followed
// immediately by '"'.
//
// The digits of the integer and fractional parts are accumulated into a 64-bit
// mantissa together with a decimal exponent; compute_float_64 assembles the
// result when the mantissa cannot have overflowed and the exponent lies within
// [smallest_power, largest_power], otherwise parse_float_fallback re-parses
// the text.
//
// Returns INCORRECT_TYPE when there are no integer digits; NUMBER_ERROR for a
// leading zero followed by more digits, a '.' without digits, an exponent with
// no digits or more than 19 digits, a number not followed by '"', or a
// fallback failure.
simdjson_unused simdjson_inline simdjson_result<double> parse_double_in_string(const uint8_t * src) noexcept {
  // Optional minus sign right after the opening quote.
  const uint8_t *const number_start = src + 1;
  const bool negative = (*number_start == '-');
  const uint8_t *const first_digit = number_start + uint8_t(negative);

  // Integer part.
  uint64_t mantissa = 0;
  const uint8_t *cursor = first_digit;
  cursor += parse_digit(*cursor, mantissa);
  const bool leading_zero = (mantissa == 0);
  while (parse_digit(*cursor, mantissa)) { ++cursor; }
  // No integer digits at all.
  if (cursor == first_digit) { return INCORRECT_TYPE; }
  // 0123 is invalid: a leading zero must stand alone.
  if (leading_zero && (cursor != first_digit + 1)) { return NUMBER_ERROR; }

  // Fractional part.
  int64_t exponent = 0;
  bool overflow;
  if (simdjson_likely(*cursor == '.')) {
    ++cursor;
    const uint8_t *const fraction_start = cursor;
    if (!parse_digit(*cursor, mantissa)) { return NUMBER_ERROR; } // no decimal digits
    ++cursor;
    while (parse_digit(*cursor, mantissa)) { ++cursor; }
    // Each fractional digit lowers the decimal exponent by one.
    exponent = fraction_start - cursor;

    // More than 19 digits (not counting the decimal point) may have
    // overflowed the mantissa.
    overflow = (cursor - first_digit - 1) > 19;
    if (simdjson_unlikely(overflow && leading_zero)) {
      // Leading zeros ("0.00000...") contribute nothing to the mantissa:
      // skip them and check whether the remaining digits still overflow.
      const uint8_t *significant = first_digit + 2;
      while (*significant == '0') { ++significant; }
      overflow = (cursor - significant) > 19;
    }
  } else {
    overflow = (cursor - first_digit) > 19;
  }

  // Exponent part.
  if ((*cursor == 'e') || (*cursor == 'E')) {
    ++cursor;
    const bool exp_negative = (*cursor == '-');
    cursor += (exp_negative || (*cursor == '+'));

    uint64_t exp_value = 0;
    const uint8_t *const exp_start = cursor;
    while (parse_digit(*cursor, exp_value)) { ++cursor; }
    // No exponent digits, or 20+ exponent digits.
    const auto exp_digits = cursor - exp_start;
    if ((exp_digits == 0) || (exp_digits > 19)) { return NUMBER_ERROR; }

    exponent += exp_negative ? 0-exp_value : exp_value;
  }

  // Inside a string the only acceptable terminator is the closing quote.
  if (*cursor != '"') { return NUMBER_ERROR; }

  overflow = overflow || (exponent < simdjson::internal::smallest_power) || (exponent > simdjson::internal::largest_power);

  // Assemble the double on the fast path, or fall back to the slow parser.
  double d;
  if (simdjson_likely(!overflow)) {
    if (compute_float_64(exponent, mantissa, negative, d)) { return d; }
  }
  // The fallback re-parses from just after the opening quote, sign included.
  if (!parse_float_fallback(number_start, &d)) {
    return NUMBER_ERROR;
  }
  return d;
}
|
|
|
|
} // unnamed namespace
|
|
#endif // SIMDJSON_SKIPNUMBERPARSING
|
|
|
|
} // namespace numberparsing
|
|
|
|
inline std::ostream& operator<<(std::ostream& out, number_type type) noexcept {
|
|
switch (type) {
|
|
case number_type::signed_integer: out << "integer in [-9223372036854775808,9223372036854775808)"; break;
|
|
case number_type::unsigned_integer: out << "unsigned integer in [9223372036854775808,18446744073709551616)"; break;
|
|
case number_type::floating_point_number: out << "floating-point number (binary64)"; break;
|
|
case number_type::big_integer: out << "big integer"; break;
|
|
default: SIMDJSON_UNREACHABLE();
|
|
}
|
|
return out;
|
|
}
|
|
|
|
} // namespace arm64
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_GENERIC_NUMBERPARSING_H
|
|
/* end file simdjson/generic/numberparsing.h for arm64 */
|
|
|
|
/* including simdjson/generic/implementation_simdjson_result_base-inl.h for arm64: #include "simdjson/generic/implementation_simdjson_result_base-inl.h" */
|
|
/* begin file simdjson/generic/implementation_simdjson_result_base-inl.h for arm64 */
|
|
#ifndef SIMDJSON_GENERIC_IMPLEMENTATION_SIMDJSON_RESULT_BASE_INL_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_GENERIC_IMPLEMENTATION_SIMDJSON_RESULT_BASE_INL_H */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/generic/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/generic/implementation_simdjson_result_base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace arm64 {
|
|
|
|
//
|
|
// internal::implementation_simdjson_result_base<T> inline implementation
|
|
//
|
|
|
|
template<typename T>
|
|
simdjson_inline void implementation_simdjson_result_base<T>::tie(T &value, error_code &error) && noexcept {
|
|
error = this->second;
|
|
if (!error) {
|
|
value = std::forward<implementation_simdjson_result_base<T>>(*this).first;
|
|
}
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_warn_unused simdjson_inline error_code implementation_simdjson_result_base<T>::get(T &value) && noexcept {
|
|
error_code error;
|
|
std::forward<implementation_simdjson_result_base<T>>(*this).tie(value, error);
|
|
return error;
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_inline error_code implementation_simdjson_result_base<T>::error() const noexcept {
|
|
return this->second;
|
|
}
|
|
|
|
#if SIMDJSON_EXCEPTIONS
|
|
|
|
template<typename T>
|
|
simdjson_inline T& implementation_simdjson_result_base<T>::value() & noexcept(false) {
|
|
if (error()) { throw simdjson_error(error()); }
|
|
return this->first;
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_inline T&& implementation_simdjson_result_base<T>::value() && noexcept(false) {
|
|
return std::forward<implementation_simdjson_result_base<T>>(*this).take_value();
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_inline T&& implementation_simdjson_result_base<T>::take_value() && noexcept(false) {
|
|
if (error()) { throw simdjson_error(error()); }
|
|
return std::forward<T>(this->first);
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_inline implementation_simdjson_result_base<T>::operator T&&() && noexcept(false) {
|
|
return std::forward<implementation_simdjson_result_base<T>>(*this).take_value();
|
|
}
|
|
|
|
#endif // SIMDJSON_EXCEPTIONS
|
|
|
|
template<typename T>
|
|
simdjson_inline const T& implementation_simdjson_result_base<T>::value_unsafe() const& noexcept {
|
|
return this->first;
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_inline T& implementation_simdjson_result_base<T>::value_unsafe() & noexcept {
|
|
return this->first;
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_inline T&& implementation_simdjson_result_base<T>::value_unsafe() && noexcept {
|
|
return std::forward<T>(this->first);
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_inline implementation_simdjson_result_base<T>::implementation_simdjson_result_base(T &&value, error_code error) noexcept
|
|
: first{std::forward<T>(value)}, second{error} {}
|
|
template<typename T>
|
|
simdjson_inline implementation_simdjson_result_base<T>::implementation_simdjson_result_base(error_code error) noexcept
|
|
: implementation_simdjson_result_base(T{}, error) {}
|
|
template<typename T>
|
|
simdjson_inline implementation_simdjson_result_base<T>::implementation_simdjson_result_base(T &&value) noexcept
|
|
: implementation_simdjson_result_base(std::forward<T>(value), SUCCESS) {}
|
|
|
|
} // namespace arm64
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_GENERIC_IMPLEMENTATION_SIMDJSON_RESULT_BASE_INL_H
|
|
/* end file simdjson/generic/implementation_simdjson_result_base-inl.h for arm64 */
|
|
/* end file simdjson/generic/amalgamated.h for arm64 */
|
|
/* including simdjson/arm64/end.h: #include "simdjson/arm64/end.h" */
|
|
/* begin file simdjson/arm64/end.h */
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/arm64/base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
#undef SIMDJSON_SKIP_BACKSLASH_SHORT_CIRCUIT
|
|
/* undefining SIMDJSON_IMPLEMENTATION from "arm64" */
|
|
#undef SIMDJSON_IMPLEMENTATION
|
|
/* end file simdjson/arm64/end.h */
|
|
|
|
#endif // SIMDJSON_ARM64_H
|
|
/* end file simdjson/arm64.h */
|
|
/* including simdjson/arm64/implementation.h: #include <simdjson/arm64/implementation.h> */
|
|
/* begin file simdjson/arm64/implementation.h */
|
|
#ifndef SIMDJSON_ARM64_IMPLEMENTATION_H
|
|
#define SIMDJSON_ARM64_IMPLEMENTATION_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/implementation.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/instruction_set.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace arm64 {
|
|
|
|
/**
|
|
* @private
|
|
*/
|
|
class implementation final : public simdjson::implementation {
|
|
public:
|
|
simdjson_inline implementation() : simdjson::implementation("arm64", "ARM NEON", internal::instruction_set::NEON) {}
|
|
simdjson_warn_unused error_code create_dom_parser_implementation(
|
|
size_t capacity,
|
|
size_t max_length,
|
|
std::unique_ptr<internal::dom_parser_implementation>& dst
|
|
) const noexcept final;
|
|
simdjson_warn_unused error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final;
|
|
simdjson_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final;
|
|
};
|
|
|
|
} // namespace arm64
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_ARM64_IMPLEMENTATION_H
|
|
/* end file simdjson/arm64/implementation.h */
|
|
|
|
/* including simdjson/arm64/begin.h: #include <simdjson/arm64/begin.h> */
|
|
/* begin file simdjson/arm64/begin.h */
|
|
/* defining SIMDJSON_IMPLEMENTATION to "arm64" */
|
|
#define SIMDJSON_IMPLEMENTATION arm64
|
|
/* including simdjson/arm64/base.h: #include "simdjson/arm64/base.h" */
|
|
/* begin file simdjson/arm64/base.h */
|
|
#ifndef SIMDJSON_ARM64_BASE_H
|
|
#define SIMDJSON_ARM64_BASE_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
/**
 * Implementation for NEON (ARMv8).
 */
namespace arm64 {

// Forward declaration only.
class implementation;

namespace {
namespace simd {

// Forward declarations of the SIMD wrapper templates.
template <class T> struct simd8;
template <class T> struct simd8x64;

} // namespace simd
} // unnamed namespace

} // namespace arm64
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_ARM64_BASE_H
|
|
/* end file simdjson/arm64/base.h */
|
|
/* including simdjson/arm64/intrinsics.h: #include "simdjson/arm64/intrinsics.h" */
|
|
/* begin file simdjson/arm64/intrinsics.h */
|
|
#ifndef SIMDJSON_ARM64_INTRINSICS_H
|
|
#define SIMDJSON_ARM64_INTRINSICS_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/arm64/base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
// This should be the correct header whether
|
|
// you use visual studio or other compilers.
|
|
#include <arm_neon.h>
|
|
|
|
static_assert(sizeof(uint8x16_t) <= simdjson::SIMDJSON_PADDING, "insufficient padding for arm64");
|
|
|
|
#endif // SIMDJSON_ARM64_INTRINSICS_H
|
|
/* end file simdjson/arm64/intrinsics.h */
|
|
/* including simdjson/arm64/bitmanipulation.h: #include "simdjson/arm64/bitmanipulation.h" */
|
|
/* begin file simdjson/arm64/bitmanipulation.h */
|
|
#ifndef SIMDJSON_ARM64_BITMANIPULATION_H
|
|
#define SIMDJSON_ARM64_BITMANIPULATION_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/arm64/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/arm64/intrinsics.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace arm64 {
|
|
namespace {
|
|
|
|
// Returns the number of trailing zero bits of input_num, i.e. the index of
// its lowest set bit.
//
// We sometimes call trailing_zeroes on inputs that are zero,
// but the algorithms do not end up using the returned value.
// Sadly, sanitizers are not smart enough to figure it out.
SIMDJSON_NO_SANITIZE_UNDEFINED
// This function can be used safely even if not all bytes have been
// initialized.
// See issue https://github.com/simdjson/simdjson/issues/1965
SIMDJSON_NO_SANITIZE_MEMORY
simdjson_inline int trailing_zeroes(uint64_t input_num) {
#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
  // Initialized so that the variable is never read while indeterminate should
  // the intrinsic not write an index for a zero input (leading_zeroes()
  // initializes its index the same way).
  unsigned long ret = 0;
  // Search the mask data from least significant bit (LSB)
  // to the most significant bit (MSB) for a set bit (1).
  _BitScanForward64(&ret, input_num);
  return static_cast<int>(ret);
#else // SIMDJSON_REGULAR_VISUAL_STUDIO
  return __builtin_ctzll(input_num);
#endif // SIMDJSON_REGULAR_VISUAL_STUDIO
}
|
|
|
|
/* Clears the lowest set bit of input_num; a zero input yields zero. */
simdjson_inline uint64_t clear_lowest_bit(uint64_t input_num) {
  // Subtracting one flips the lowest set bit and every bit below it, so the
  // AND keeps only the bits above it.
  const uint64_t minus_one = input_num - 1;
  return minus_one & input_num;
}
|
|
|
|
/* Counts the leading zero bits of input_num.
 * The result might be undefined when input_num is zero. */
simdjson_inline int leading_zeroes(uint64_t input_num) {
#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
  // Scan from the most significant bit (MSB) down to the
  // least significant bit (LSB) looking for a set bit (1).
  unsigned long highest_set = 0;
  if (!_BitScanReverse64(&highest_set, input_num)) {
    // No bit is set at all.
    return 64;
  }
  return (int)(63 - highest_set);
#else
  return __builtin_clzll(input_num);
#endif// SIMDJSON_REGULAR_VISUAL_STUDIO
}
|
|
|
|
/* result might be undefined when input_num is zero */
|
|
simdjson_inline int count_ones(uint64_t input_num) {
|
|
return vaddv_u8(vcnt_u8(vcreate_u8(input_num)));
|
|
}
|
|
|
|
|
|
#if defined(__GNUC__) // catches clang and gcc
|
|
/**
|
|
* ARM has a fast 64-bit "bit reversal function" that is handy. However,
|
|
* it is not generally available as an intrinsic function under Visual
|
|
* Studio (though this might be changing). Even under clang/gcc, we
|
|
* apparently need to invoke inline assembly.
|
|
*/
|
|
/*
|
|
* We use SIMDJSON_PREFER_REVERSE_BITS as a hint that algorithms that
|
|
* work well with bit reversal may use it.
|
|
*/
|
|
#define SIMDJSON_PREFER_REVERSE_BITS 1
|
|
|
|
/* reverse the bits */
|
|
simdjson_inline uint64_t reverse_bits(uint64_t input_num) {
|
|
uint64_t rev_bits;
|
|
__asm("rbit %0, %1" : "=r"(rev_bits) : "r"(input_num));
|
|
return rev_bits;
|
|
}
|
|
|
|
/**
 * Flips the bit at index 63 - leading_zeroes. Thus if rev_bits has
 * 'leading_zeroes' leading zeroes, this sets its leading bit to zero.
 * leading_zeroes can be large enough for the shift below to be undefined
 * behavior, but the output of such undefined behavior is never used.
 **/
SIMDJSON_NO_SANITIZE_UNDEFINED
simdjson_inline uint64_t zero_leading_bit(uint64_t rev_bits, int leading_zeroes) {
  const uint64_t top_bit = uint64_t(0x8000000000000000);
  return (top_bit >> leading_zeroes) ^ rev_bits;
}
|
|
|
|
#endif
|
|
|
|
// Stores value1 + value2 (wrapped to 64 bits) in *result and returns true
// when the addition overflowed.
simdjson_inline bool add_overflow(uint64_t value1, uint64_t value2, uint64_t *result) {
#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
  const uint64_t wrapped_sum = value1 + value2;
  *result = wrapped_sum;
  // An unsigned sum that wrapped around is smaller than its operands.
  return wrapped_sum < value1;
#else
  unsigned long long *const sum_out = reinterpret_cast<unsigned long long *>(result);
  return __builtin_uaddll_overflow(value1, value2, sum_out);
#endif
}
|
|
|
|
} // unnamed namespace
|
|
} // namespace arm64
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_ARM64_BITMANIPULATION_H
|
|
/* end file simdjson/arm64/bitmanipulation.h */
|
|
/* including simdjson/arm64/bitmask.h: #include "simdjson/arm64/bitmask.h" */
|
|
/* begin file simdjson/arm64/bitmask.h */
|
|
#ifndef SIMDJSON_ARM64_BITMASK_H
|
|
#define SIMDJSON_ARM64_BITMASK_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/arm64/base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace arm64 {
|
|
namespace {
|
|
|
|
//
// Computes a "cumulative bitwise xor": every output bit is the xor of the
// input bit at that position and of all lower input bits, so the output flips
// each time a 1 is encountered.
//
// For example, prefix_xor(00100100) == 00011100
//
simdjson_inline uint64_t prefix_xor(uint64_t bitmask) {
  /////////////
  // PMULL could do this, but it is apparently slow:
  //
  //   #ifdef __ARM_FEATURE_CRYPTO // some ARM processors lack this extension
  //   return vmull_p64(-1ULL, bitmask);
  //   #else
  //
  // Analysis by @sebpop:
  // When diffing the assembly for src/stage1_find_marks.cpp I see that the eors are all spread out
  // in between other vector code, so effectively the extra cycles of the sequence do not matter
  // because the GPR units are idle otherwise and the critical path is on the FP side.
  // Also the PMULL requires two extra fmovs: GPR->FP (3 cycles in N1, 5 cycles in A72 )
  // and FP->GPR (2 cycles on N1 and 5 cycles on A72.)
  ///////////
  // Each step doubles the span of lower bits folded into every position.
  uint64_t folded = bitmask;
  folded ^= folded << 1;
  folded ^= folded << 2;
  folded ^= folded << 4;
  folded ^= folded << 8;
  folded ^= folded << 16;
  folded ^= folded << 32;
  return folded;
}
|
|
|
|
} // unnamed namespace
|
|
} // namespace arm64
|
|
} // namespace simdjson
|
|
|
|
#endif
|
|
/* end file simdjson/arm64/bitmask.h */
|
|
/* including simdjson/arm64/numberparsing_defs.h: #include "simdjson/arm64/numberparsing_defs.h" */
|
|
/* begin file simdjson/arm64/numberparsing_defs.h */
|
|
#ifndef SIMDJSON_ARM64_NUMBERPARSING_DEFS_H
|
|
#define SIMDJSON_ARM64_NUMBERPARSING_DEFS_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/arm64/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/arm64/intrinsics.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/numberparsing_tables.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
#include <cstring>
|
|
|
|
#if SIMDJSON_REGULAR_VISUAL_STUDIO && SIMDJSON_IS_ARM64
|
|
// __umulh requires intrin.h
|
|
#include <intrin.h>
|
|
#endif // SIMDJSON_REGULAR_VISUAL_STUDIO && SIMDJSON_IS_ARM64
|
|
|
|
namespace simdjson {
|
|
namespace arm64 {
|
|
namespace numberparsing {
|
|
|
|
// There is no SSE here, so a scalar routine is used instead.
// credit: https://johnnylee-sde.github.io/Fast-numeric-string-to-int/
// NOTE(review): presumably expects eight ASCII digits at `chars` and a
// little-endian byte order -- confirm against callers.
/** @private */
static simdjson_inline uint32_t parse_eight_digits_unrolled(const uint8_t *chars) {
  uint64_t packed;
  std::memcpy(&packed, chars, sizeof(packed));
  // Keep the low nibble of every byte, then merge neighbours pairwise:
  //   2561           == 10    * 2^8  + 1
  //   6553601        == 100   * 2^16 + 1
  //   42949672960001 == 10000 * 2^32 + 1
  const uint64_t pairs = ((packed & 0x0F0F0F0F0F0F0F0F) * 2561) >> 8;
  const uint64_t quads = ((pairs & 0x00FF00FF00FF00FF) * 6553601) >> 16;
  return uint32_t(((quads & 0x0000FFFF0000FFFF) * 42949672960001) >> 32);
}
|
|
|
|
// Computes the full 128-bit product of value1 and value2 and returns it
// split into its low and high 64-bit halves.
simdjson_inline internal::value128 full_multiplication(uint64_t value1, uint64_t value2) {
  internal::value128 answer;
#if SIMDJSON_REGULAR_VISUAL_STUDIO || SIMDJSON_IS_32BITS
#if SIMDJSON_IS_ARM64
  // ARM64 has native support for 64-bit multiplications, no need to emulate
  answer.low = value1 * value2;
  answer.high = __umulh(value1, value2);
#else
  answer.low = _umul128(value1, value2, &answer.high); // _umul128 not available on ARM64
#endif // SIMDJSON_IS_ARM64
#else // SIMDJSON_REGULAR_VISUAL_STUDIO || SIMDJSON_IS_32BITS
  const __uint128_t product = static_cast<__uint128_t>(value1) * value2;
  answer.high = uint64_t(product >> 64);
  answer.low = uint64_t(product);
#endif
  return answer;
}
|
|
|
|
} // namespace numberparsing
|
|
} // namespace arm64
|
|
} // namespace simdjson
|
|
|
|
#define SIMDJSON_SWAR_NUMBER_PARSING 1
|
|
|
|
#endif // SIMDJSON_ARM64_NUMBERPARSING_DEFS_H
|
|
/* end file simdjson/arm64/numberparsing_defs.h */
|
|
/* including simdjson/arm64/simd.h: #include "simdjson/arm64/simd.h" */
|
|
/* begin file simdjson/arm64/simd.h */
|
|
#ifndef SIMDJSON_ARM64_SIMD_H
|
|
#define SIMDJSON_ARM64_SIMD_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/arm64/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/arm64/bitmanipulation.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/simdprune_tables.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace arm64 {
|
|
namespace {
|
|
namespace simd {
|
|
|
|
#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
namespace {
// Start of private section with Visual Studio workaround
//
// Each simdjson_make_* macro builds a NEON vector from individual lane values:
// an immediately-invoked lambda fills a temporary array with the arguments and
// loads the vector from it. The code paths for other compilers brace-initialize
// the vector types directly instead.

#ifndef simdjson_make_uint8x16_t
#define simdjson_make_uint8x16_t(x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, \
                                 x13, x14, x15, x16)                                  \
  ([=]() {                                                                            \
    uint8_t lane_values[16] = {x1, x2,  x3,  x4,  x5,  x6,  x7,  x8,                  \
                               x9, x10, x11, x12, x13, x14, x15, x16};                \
    return vld1q_u8(lane_values);                                                     \
  }())
#endif
#ifndef simdjson_make_int8x16_t
#define simdjson_make_int8x16_t(x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, \
                                x13, x14, x15, x16)                                  \
  ([=]() {                                                                           \
    int8_t lane_values[16] = {x1, x2,  x3,  x4,  x5,  x6,  x7,  x8,                  \
                              x9, x10, x11, x12, x13, x14, x15, x16};                \
    return vld1q_s8(lane_values);                                                    \
  }())
#endif

#ifndef simdjson_make_uint8x8_t
#define simdjson_make_uint8x8_t(x1, x2, x3, x4, x5, x6, x7, x8) \
  ([=]() {                                                       \
    uint8_t lane_values[8] = {x1, x2, x3, x4, x5, x6, x7, x8};   \
    return vld1_u8(lane_values);                                 \
  }())
#endif
#ifndef simdjson_make_int8x8_t
#define simdjson_make_int8x8_t(x1, x2, x3, x4, x5, x6, x7, x8) \
  ([=]() {                                                      \
    int8_t lane_values[8] = {x1, x2, x3, x4, x5, x6, x7, x8};   \
    return vld1_s8(lane_values);                                \
  }())
#endif
#ifndef simdjson_make_uint16x8_t
#define simdjson_make_uint16x8_t(x1, x2, x3, x4, x5, x6, x7, x8) \
  ([=]() {                                                        \
    uint16_t lane_values[8] = {x1, x2, x3, x4, x5, x6, x7, x8};   \
    return vld1q_u16(lane_values);                                \
  }())
#endif
#ifndef simdjson_make_int16x8_t
#define simdjson_make_int16x8_t(x1, x2, x3, x4, x5, x6, x7, x8) \
  ([=]() {                                                       \
    int16_t lane_values[8] = {x1, x2, x3, x4, x5, x6, x7, x8};   \
    return vld1q_s16(lane_values);                               \
  }())
#endif

// End of private section with Visual Studio workaround
} // namespace
#endif // SIMDJSON_REGULAR_VISUAL_STUDIO
|
|
|
|
|
|
template<typename T>
|
|
struct simd8;
|
|
|
|
//
|
|
// Base class of simd8<uint8_t> and simd8<bool>, both of which use uint8x16_t internally.
|
|
//
|
|
template<typename T, typename Mask=simd8<bool>>
|
|
struct base_u8 {
|
|
uint8x16_t value;
|
|
static const int SIZE = sizeof(value);
|
|
|
|
// Conversion from/to SIMD register
|
|
simdjson_inline base_u8(const uint8x16_t _value) : value(_value) {}
|
|
simdjson_inline operator const uint8x16_t&() const { return this->value; }
|
|
simdjson_inline operator uint8x16_t&() { return this->value; }
|
|
|
|
// Bit operations
|
|
simdjson_inline simd8<T> operator|(const simd8<T> other) const { return vorrq_u8(*this, other); }
|
|
simdjson_inline simd8<T> operator&(const simd8<T> other) const { return vandq_u8(*this, other); }
|
|
simdjson_inline simd8<T> operator^(const simd8<T> other) const { return veorq_u8(*this, other); }
|
|
simdjson_inline simd8<T> bit_andnot(const simd8<T> other) const { return vbicq_u8(*this, other); }
|
|
simdjson_inline simd8<T> operator~() const { return *this ^ 0xFFu; }
|
|
simdjson_inline simd8<T>& operator|=(const simd8<T> other) { auto this_cast = static_cast<simd8<T>*>(this); *this_cast = *this_cast | other; return *this_cast; }
|
|
simdjson_inline simd8<T>& operator&=(const simd8<T> other) { auto this_cast = static_cast<simd8<T>*>(this); *this_cast = *this_cast & other; return *this_cast; }
|
|
simdjson_inline simd8<T>& operator^=(const simd8<T> other) { auto this_cast = static_cast<simd8<T>*>(this); *this_cast = *this_cast ^ other; return *this_cast; }
|
|
|
|
friend simdjson_inline Mask operator==(const simd8<T> lhs, const simd8<T> rhs) { return vceqq_u8(lhs, rhs); }
|
|
|
|
template<int N=1>
|
|
simdjson_inline simd8<T> prev(const simd8<T> prev_chunk) const {
|
|
return vextq_u8(prev_chunk, *this, 16 - N);
|
|
}
|
|
};
|
|
|
|
// SIMD byte mask type (returned by things like eq and gt)
|
|
template<>
|
|
struct simd8<bool>: base_u8<bool> {
|
|
typedef uint16_t bitmask_t;
|
|
typedef uint32_t bitmask2_t;
|
|
|
|
static simdjson_inline simd8<bool> splat(bool _value) { return vmovq_n_u8(uint8_t(-(!!_value))); }
|
|
|
|
simdjson_inline simd8(const uint8x16_t _value) : base_u8<bool>(_value) {}
|
|
// False constructor
|
|
simdjson_inline simd8() : simd8(vdupq_n_u8(0)) {}
|
|
// Splat constructor
|
|
simdjson_inline simd8(bool _value) : simd8(splat(_value)) {}
|
|
|
|
// We return uint32_t instead of uint16_t because that seems to be more efficient for most
|
|
// purposes (cutting it down to uint16_t costs performance in some compilers).
|
|
simdjson_inline uint32_t to_bitmask() const {
|
|
#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
|
|
const uint8x16_t bit_mask = simdjson_make_uint8x16_t(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
|
|
0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80);
|
|
#else
|
|
const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
|
|
0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
|
|
#endif
|
|
auto minput = *this & bit_mask;
|
|
uint8x16_t tmp = vpaddq_u8(minput, minput);
|
|
tmp = vpaddq_u8(tmp, tmp);
|
|
tmp = vpaddq_u8(tmp, tmp);
|
|
return vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0);
|
|
}
|
|
simdjson_inline bool any() const { return vmaxvq_u32(vreinterpretq_u32_u8(*this)) != 0; }
|
|
};
|
|
|
|
// Unsigned bytes
|
|
template<>
|
|
struct simd8<uint8_t>: base_u8<uint8_t> {
|
|
static simdjson_inline uint8x16_t splat(uint8_t _value) { return vmovq_n_u8(_value); }
|
|
static simdjson_inline uint8x16_t zero() { return vdupq_n_u8(0); }
|
|
static simdjson_inline uint8x16_t load(const uint8_t* values) { return vld1q_u8(values); }
|
|
|
|
simdjson_inline simd8(const uint8x16_t _value) : base_u8<uint8_t>(_value) {}
|
|
// Zero constructor
|
|
simdjson_inline simd8() : simd8(zero()) {}
|
|
// Array constructor
|
|
simdjson_inline simd8(const uint8_t values[16]) : simd8(load(values)) {}
|
|
// Splat constructor
|
|
simdjson_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
|
|
// Member-by-member initialization
|
|
#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
|
|
simdjson_inline simd8(
|
|
uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
|
|
uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
|
|
) : simd8(simdjson_make_uint8x16_t(
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15
|
|
)) {}
|
|
#else
|
|
simdjson_inline simd8(
|
|
uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
|
|
uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
|
|
) : simd8(uint8x16_t{
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15
|
|
}) {}
|
|
#endif
|
|
|
|
// Repeat 16 values as many times as necessary (usually for lookup tables)
|
|
simdjson_inline static simd8<uint8_t> repeat_16(
|
|
uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
|
|
uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
|
|
) {
|
|
return simd8<uint8_t>(
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15
|
|
);
|
|
}
|
|
|
|
// Store to array
|
|
simdjson_inline void store(uint8_t dst[16]) const { return vst1q_u8(dst, *this); }
|
|
|
|
// Saturated math
|
|
simdjson_inline simd8<uint8_t> saturating_add(const simd8<uint8_t> other) const { return vqaddq_u8(*this, other); }
|
|
simdjson_inline simd8<uint8_t> saturating_sub(const simd8<uint8_t> other) const { return vqsubq_u8(*this, other); }
|
|
|
|
// Addition/subtraction are the same for signed and unsigned
|
|
simdjson_inline simd8<uint8_t> operator+(const simd8<uint8_t> other) const { return vaddq_u8(*this, other); }
|
|
simdjson_inline simd8<uint8_t> operator-(const simd8<uint8_t> other) const { return vsubq_u8(*this, other); }
|
|
simdjson_inline simd8<uint8_t>& operator+=(const simd8<uint8_t> other) { *this = *this + other; return *this; }
|
|
simdjson_inline simd8<uint8_t>& operator-=(const simd8<uint8_t> other) { *this = *this - other; return *this; }
|
|
|
|
// Order-specific operations
|
|
simdjson_inline uint8_t max_val() const { return vmaxvq_u8(*this); }
|
|
simdjson_inline uint8_t min_val() const { return vminvq_u8(*this); }
|
|
simdjson_inline simd8<uint8_t> max_val(const simd8<uint8_t> other) const { return vmaxq_u8(*this, other); }
|
|
simdjson_inline simd8<uint8_t> min_val(const simd8<uint8_t> other) const { return vminq_u8(*this, other); }
|
|
simdjson_inline simd8<bool> operator<=(const simd8<uint8_t> other) const { return vcleq_u8(*this, other); }
|
|
simdjson_inline simd8<bool> operator>=(const simd8<uint8_t> other) const { return vcgeq_u8(*this, other); }
|
|
simdjson_inline simd8<bool> operator<(const simd8<uint8_t> other) const { return vcltq_u8(*this, other); }
|
|
simdjson_inline simd8<bool> operator>(const simd8<uint8_t> other) const { return vcgtq_u8(*this, other); }
|
|
// Same as >, but instead of guaranteeing all 1's == true, false = 0 and true = nonzero. For ARM, returns all 1's.
|
|
simdjson_inline simd8<uint8_t> gt_bits(const simd8<uint8_t> other) const { return simd8<uint8_t>(*this > other); }
|
|
// Same as <, but instead of guaranteeing all 1's == true, false = 0 and true = nonzero. For ARM, returns all 1's.
|
|
simdjson_inline simd8<uint8_t> lt_bits(const simd8<uint8_t> other) const { return simd8<uint8_t>(*this < other); }
|
|
|
|
// Bit-specific operations
|
|
simdjson_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const { return vtstq_u8(*this, bits); }
|
|
simdjson_inline bool any_bits_set_anywhere() const { return this->max_val() != 0; }
|
|
simdjson_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const { return (*this & bits).any_bits_set_anywhere(); }
|
|
template<int N>
|
|
simdjson_inline simd8<uint8_t> shr() const { return vshrq_n_u8(*this, N); }
|
|
template<int N>
|
|
simdjson_inline simd8<uint8_t> shl() const { return vshlq_n_u8(*this, N); }
|
|
|
|
// Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values)
|
|
template<typename L>
|
|
simdjson_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
|
|
return lookup_table.apply_lookup_16_to(*this);
|
|
}
|
|
|
|
|
|
// Copies to 'output" all bytes corresponding to a 0 in the mask (interpreted as a bitset).
|
|
// Passing a 0 value for mask would be equivalent to writing out every byte to output.
|
|
// Only the first 16 - count_ones(mask) bytes of the result are significant but 16 bytes
|
|
// get written.
|
|
// Design consideration: it seems like a function with the
|
|
// signature simd8<L> compress(uint16_t mask) would be
|
|
// sensible, but the AVX ISA makes this kind of approach difficult.
|
|
template<typename L>
|
|
simdjson_inline void compress(uint16_t mask, L * output) const {
|
|
using internal::thintable_epi8;
|
|
using internal::BitsSetTable256mul2;
|
|
using internal::pshufb_combine_table;
|
|
// this particular implementation was inspired by work done by @animetosho
|
|
// we do it in two steps, first 8 bytes and then second 8 bytes
|
|
uint8_t mask1 = uint8_t(mask); // least significant 8 bits
|
|
uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits
|
|
// next line just loads the 64-bit values thintable_epi8[mask1] and
|
|
// thintable_epi8[mask2] into a 128-bit register, using only
|
|
// two instructions on most compilers.
|
|
uint64x2_t shufmask64 = {thintable_epi8[mask1], thintable_epi8[mask2]};
|
|
uint8x16_t shufmask = vreinterpretq_u8_u64(shufmask64);
|
|
// we increment by 0x08 the second half of the mask
|
|
#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
|
|
uint8x16_t inc = simdjson_make_uint8x16_t(0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08);
|
|
#else
|
|
uint8x16_t inc = {0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08};
|
|
#endif
|
|
shufmask = vaddq_u8(shufmask, inc);
|
|
// this is the version "nearly pruned"
|
|
uint8x16_t pruned = vqtbl1q_u8(*this, shufmask);
|
|
// we still need to put the two halves together.
|
|
// we compute the popcount of the first half:
|
|
int pop1 = BitsSetTable256mul2[mask1];
|
|
// then load the corresponding mask, what it does is to write
|
|
// only the first pop1 bytes from the first 8 bytes, and then
|
|
// it fills in with the bytes from the second 8 bytes + some filling
|
|
// at the end.
|
|
uint8x16_t compactmask = vld1q_u8(reinterpret_cast<const uint8_t *>(pshufb_combine_table + pop1 * 8));
|
|
uint8x16_t answer = vqtbl1q_u8(pruned, compactmask);
|
|
vst1q_u8(reinterpret_cast<uint8_t*>(output), answer);
|
|
}
|
|
|
|
// Copies all bytes corresponding to a 0 in the low half of the mask (interpreted as a
|
|
// bitset) to output1, then those corresponding to a 0 in the high half to output2.
|
|
template<typename L>
|
|
simdjson_inline void compress_halves(uint16_t mask, L *output1, L *output2) const {
|
|
using internal::thintable_epi8;
|
|
uint8_t mask1 = uint8_t(mask); // least significant 8 bits
|
|
uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits
|
|
uint8x8_t compactmask1 = vcreate_u8(thintable_epi8[mask1]);
|
|
uint8x8_t compactmask2 = vcreate_u8(thintable_epi8[mask2]);
|
|
// we increment by 0x08 the second half of the mask
|
|
#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
|
|
uint8x8_t inc = simdjson_make_uint8x8_t(0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08);
|
|
#else
|
|
uint8x8_t inc = {0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08};
|
|
#endif
|
|
compactmask2 = vadd_u8(compactmask2, inc);
|
|
// store each result (with the second store possibly overlapping the first)
|
|
vst1_u8((uint8_t*)output1, vqtbl1_u8(*this, compactmask1));
|
|
vst1_u8((uint8_t*)output2, vqtbl1_u8(*this, compactmask2));
|
|
}
|
|
|
|
template<typename L>
|
|
simdjson_inline simd8<L> lookup_16(
|
|
L replace0, L replace1, L replace2, L replace3,
|
|
L replace4, L replace5, L replace6, L replace7,
|
|
L replace8, L replace9, L replace10, L replace11,
|
|
L replace12, L replace13, L replace14, L replace15) const {
|
|
return lookup_16(simd8<L>::repeat_16(
|
|
replace0, replace1, replace2, replace3,
|
|
replace4, replace5, replace6, replace7,
|
|
replace8, replace9, replace10, replace11,
|
|
replace12, replace13, replace14, replace15
|
|
));
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_inline simd8<uint8_t> apply_lookup_16_to(const simd8<T> original) {
|
|
return vqtbl1q_u8(*this, simd8<uint8_t>(original));
|
|
}
|
|
};
|
|
|
|
// Signed bytes
|
|
template<>
|
|
struct simd8<int8_t> {
|
|
int8x16_t value;
|
|
|
|
static simdjson_inline simd8<int8_t> splat(int8_t _value) { return vmovq_n_s8(_value); }
|
|
static simdjson_inline simd8<int8_t> zero() { return vdupq_n_s8(0); }
|
|
static simdjson_inline simd8<int8_t> load(const int8_t values[16]) { return vld1q_s8(values); }
|
|
|
|
// Conversion from/to SIMD register
|
|
simdjson_inline simd8(const int8x16_t _value) : value{_value} {}
|
|
simdjson_inline operator const int8x16_t&() const { return this->value; }
|
|
simdjson_inline operator int8x16_t&() { return this->value; }
|
|
|
|
// Zero constructor
|
|
simdjson_inline simd8() : simd8(zero()) {}
|
|
// Splat constructor
|
|
simdjson_inline simd8(int8_t _value) : simd8(splat(_value)) {}
|
|
// Array constructor
|
|
simdjson_inline simd8(const int8_t* values) : simd8(load(values)) {}
|
|
// Member-by-member initialization
|
|
#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
|
|
simdjson_inline simd8(
|
|
int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
|
|
int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
|
|
) : simd8(simdjson_make_int8x16_t(
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15
|
|
)) {}
|
|
#else
|
|
simdjson_inline simd8(
|
|
int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
|
|
int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
|
|
) : simd8(int8x16_t{
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15
|
|
}) {}
|
|
#endif
|
|
// Repeat 16 values as many times as necessary (usually for lookup tables)
|
|
simdjson_inline static simd8<int8_t> repeat_16(
|
|
int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
|
|
int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
|
|
) {
|
|
return simd8<int8_t>(
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15
|
|
);
|
|
}
|
|
|
|
// Store to array
|
|
simdjson_inline void store(int8_t dst[16]) const { return vst1q_s8(dst, *this); }
|
|
|
|
// Explicit conversion to/from unsigned
|
|
//
|
|
// Under Visual Studio/ARM64 uint8x16_t and int8x16_t are apparently the same type.
|
|
// In theory, we could check this occurrence with std::same_as and std::enabled_if but it is C++14
|
|
// and relatively ugly and hard to read.
|
|
#ifndef SIMDJSON_REGULAR_VISUAL_STUDIO
|
|
simdjson_inline explicit simd8(const uint8x16_t other): simd8(vreinterpretq_s8_u8(other)) {}
|
|
#endif
|
|
simdjson_inline explicit operator simd8<uint8_t>() const { return vreinterpretq_u8_s8(this->value); }
|
|
|
|
// Math
|
|
simdjson_inline simd8<int8_t> operator+(const simd8<int8_t> other) const { return vaddq_s8(*this, other); }
|
|
simdjson_inline simd8<int8_t> operator-(const simd8<int8_t> other) const { return vsubq_s8(*this, other); }
|
|
simdjson_inline simd8<int8_t>& operator+=(const simd8<int8_t> other) { *this = *this + other; return *this; }
|
|
simdjson_inline simd8<int8_t>& operator-=(const simd8<int8_t> other) { *this = *this - other; return *this; }
|
|
|
|
// Order-sensitive comparisons
|
|
simdjson_inline simd8<int8_t> max_val(const simd8<int8_t> other) const { return vmaxq_s8(*this, other); }
|
|
simdjson_inline simd8<int8_t> min_val(const simd8<int8_t> other) const { return vminq_s8(*this, other); }
|
|
simdjson_inline simd8<bool> operator>(const simd8<int8_t> other) const { return vcgtq_s8(*this, other); }
|
|
simdjson_inline simd8<bool> operator<(const simd8<int8_t> other) const { return vcltq_s8(*this, other); }
|
|
simdjson_inline simd8<bool> operator==(const simd8<int8_t> other) const { return vceqq_s8(*this, other); }
|
|
|
|
template<int N=1>
|
|
simdjson_inline simd8<int8_t> prev(const simd8<int8_t> prev_chunk) const {
|
|
return vextq_s8(prev_chunk, *this, 16 - N);
|
|
}
|
|
|
|
// Perform a lookup assuming no value is larger than 16
|
|
template<typename L>
|
|
simdjson_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
|
|
return lookup_table.apply_lookup_16_to(*this);
|
|
}
|
|
template<typename L>
|
|
simdjson_inline simd8<L> lookup_16(
|
|
L replace0, L replace1, L replace2, L replace3,
|
|
L replace4, L replace5, L replace6, L replace7,
|
|
L replace8, L replace9, L replace10, L replace11,
|
|
L replace12, L replace13, L replace14, L replace15) const {
|
|
return lookup_16(simd8<L>::repeat_16(
|
|
replace0, replace1, replace2, replace3,
|
|
replace4, replace5, replace6, replace7,
|
|
replace8, replace9, replace10, replace11,
|
|
replace12, replace13, replace14, replace15
|
|
));
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_inline simd8<int8_t> apply_lookup_16_to(const simd8<T> original) {
|
|
return vqtbl1q_s8(*this, simd8<uint8_t>(original));
|
|
}
|
|
};
|
|
|
|
template<typename T>
|
|
struct simd8x64 {
|
|
static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
|
|
static_assert(NUM_CHUNKS == 4, "ARM kernel should use four registers per 64-byte block.");
|
|
const simd8<T> chunks[NUM_CHUNKS];
|
|
|
|
simd8x64(const simd8x64<T>& o) = delete; // no copy allowed
|
|
simd8x64<T>& operator=(const simd8<T>& other) = delete; // no assignment allowed
|
|
simd8x64() = delete; // no default constructor allowed
|
|
|
|
simdjson_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1, const simd8<T> chunk2, const simd8<T> chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {}
|
|
simdjson_inline simd8x64(const T ptr[64]) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+16), simd8<T>::load(ptr+32), simd8<T>::load(ptr+48)} {}
|
|
|
|
simdjson_inline void store(T ptr[64]) const {
|
|
this->chunks[0].store(ptr+sizeof(simd8<T>)*0);
|
|
this->chunks[1].store(ptr+sizeof(simd8<T>)*1);
|
|
this->chunks[2].store(ptr+sizeof(simd8<T>)*2);
|
|
this->chunks[3].store(ptr+sizeof(simd8<T>)*3);
|
|
}
|
|
|
|
simdjson_inline simd8<T> reduce_or() const {
|
|
return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]);
|
|
}
|
|
|
|
|
|
simdjson_inline uint64_t compress(uint64_t mask, T * output) const {
|
|
uint64_t popcounts = vget_lane_u64(vreinterpret_u64_u8(vcnt_u8(vcreate_u8(~mask))), 0);
|
|
// compute the prefix sum of the popcounts of each byte
|
|
uint64_t offsets = popcounts * 0x0101010101010101;
|
|
this->chunks[0].compress_halves(uint16_t(mask), output, &output[popcounts & 0xFF]);
|
|
this->chunks[1].compress_halves(uint16_t(mask >> 16), &output[(offsets >> 8) & 0xFF], &output[(offsets >> 16) & 0xFF]);
|
|
this->chunks[2].compress_halves(uint16_t(mask >> 32), &output[(offsets >> 24) & 0xFF], &output[(offsets >> 32) & 0xFF]);
|
|
this->chunks[3].compress_halves(uint16_t(mask >> 48), &output[(offsets >> 40) & 0xFF], &output[(offsets >> 48) & 0xFF]);
|
|
return offsets >> 56;
|
|
}
|
|
|
|
simdjson_inline uint64_t to_bitmask() const {
|
|
#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
|
|
const uint8x16_t bit_mask = simdjson_make_uint8x16_t(
|
|
0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
|
|
0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
|
|
);
|
|
#else
|
|
const uint8x16_t bit_mask = {
|
|
0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
|
|
0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
|
|
};
|
|
#endif
|
|
// Add each of the elements next to each other, successively, to stuff each 8 byte mask into one.
|
|
uint8x16_t sum0 = vpaddq_u8(this->chunks[0] & bit_mask, this->chunks[1] & bit_mask);
|
|
uint8x16_t sum1 = vpaddq_u8(this->chunks[2] & bit_mask, this->chunks[3] & bit_mask);
|
|
sum0 = vpaddq_u8(sum0, sum1);
|
|
sum0 = vpaddq_u8(sum0, sum0);
|
|
return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0);
|
|
}
|
|
|
|
simdjson_inline uint64_t eq(const T m) const {
|
|
const simd8<T> mask = simd8<T>::splat(m);
|
|
return simd8x64<bool>(
|
|
this->chunks[0] == mask,
|
|
this->chunks[1] == mask,
|
|
this->chunks[2] == mask,
|
|
this->chunks[3] == mask
|
|
).to_bitmask();
|
|
}
|
|
|
|
simdjson_inline uint64_t lteq(const T m) const {
|
|
const simd8<T> mask = simd8<T>::splat(m);
|
|
return simd8x64<bool>(
|
|
this->chunks[0] <= mask,
|
|
this->chunks[1] <= mask,
|
|
this->chunks[2] <= mask,
|
|
this->chunks[3] <= mask
|
|
).to_bitmask();
|
|
}
|
|
}; // struct simd8x64<T>
|
|
|
|
} // namespace simd
|
|
} // unnamed namespace
|
|
} // namespace arm64
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_ARM64_SIMD_H
|
|
/* end file simdjson/arm64/simd.h */
|
|
/* including simdjson/arm64/stringparsing_defs.h: #include "simdjson/arm64/stringparsing_defs.h" */
|
|
/* begin file simdjson/arm64/stringparsing_defs.h */
|
|
#ifndef SIMDJSON_ARM64_STRINGPARSING_DEFS_H
|
|
#define SIMDJSON_ARM64_STRINGPARSING_DEFS_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/arm64/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/arm64/simd.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/arm64/bitmanipulation.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace arm64 {
|
|
namespace {
|
|
|
|
using namespace simd;
|
|
|
|
// Holds backslashes and quotes locations.
|
|
struct backslash_and_quote {
|
|
public:
|
|
static constexpr uint32_t BYTES_PROCESSED = 32;
|
|
simdjson_inline static backslash_and_quote copy_and_find(const uint8_t *src, uint8_t *dst);
|
|
|
|
simdjson_inline bool has_quote_first() { return ((bs_bits - 1) & quote_bits) != 0; }
|
|
simdjson_inline bool has_backslash() { return bs_bits != 0; }
|
|
simdjson_inline int quote_index() { return trailing_zeroes(quote_bits); }
|
|
simdjson_inline int backslash_index() { return trailing_zeroes(bs_bits); }
|
|
|
|
uint32_t bs_bits;
|
|
uint32_t quote_bits;
|
|
}; // struct backslash_and_quote
|
|
|
|
// Copies BYTES_PROCESSED (32) bytes from src to dst and reports which of them
// are backslashes and which are quotes.
simdjson_inline backslash_and_quote backslash_and_quote::copy_and_find(const uint8_t *src, uint8_t *dst) {
  // this can read up to 31 bytes beyond the buffer size, but we require
  // SIMDJSON_PADDING of padding
  static_assert(SIMDJSON_PADDING >= (BYTES_PROCESSED - 1), "backslash and quote finder must process fewer than SIMDJSON_PADDING bytes");
  simd8<uint8_t> first_half(src);
  simd8<uint8_t> second_half(src + sizeof(first_half));
  first_half.store(dst);
  second_half.store(dst + sizeof(first_half));

  // Getting a 64-bit bitmask is much cheaper than multiple 16-bit bitmasks on ARM; therefore, we
  // smash them together into a 64-byte mask and get the bitmask from there.
  const uint64_t bs_and_quote = simd8x64<bool>(
    first_half == '\\', second_half == '\\',
    first_half == '"',  second_half == '"'
  ).to_bitmask();
  const uint32_t backslashes = uint32_t(bs_and_quote);    // low 32 bits
  const uint32_t quotes = uint32_t(bs_and_quote >> 32);   // high 32 bits
  return {
    backslashes, // bs_bits
    quotes       // quote_bits
  };
}
|
|
|
|
} // unnamed namespace
|
|
} // namespace arm64
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_ARM64_STRINGPARSING_DEFS_H
|
|
/* end file simdjson/arm64/stringparsing_defs.h */
|
|
|
|
#define SIMDJSON_SKIP_BACKSLASH_SHORT_CIRCUIT 1
|
|
/* end file simdjson/arm64/begin.h */
|
|
/* including generic/amalgamated.h for arm64: #include <generic/amalgamated.h> */
|
|
/* begin file generic/amalgamated.h for arm64 */
|
|
#if defined(SIMDJSON_CONDITIONAL_INCLUDE) && !defined(SIMDJSON_SRC_GENERIC_DEPENDENCIES_H)
|
|
#error generic/dependencies.h must be included before generic/amalgamated.h!
|
|
#endif
|
|
|
|
/* including generic/base.h for arm64: #include <generic/base.h> */
|
|
/* begin file generic/base.h for arm64 */
|
|
#ifndef SIMDJSON_SRC_GENERIC_BASE_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_BASE_H */
|
|
/* amalgamation skipped (editor-only): #include <base.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/generic/base.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace arm64 {
|
|
namespace {
|
|
|
|
struct json_character_block;
|
|
|
|
} // unnamed namespace
|
|
} // namespace arm64
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_BASE_H
|
|
/* end file generic/base.h for arm64 */
|
|
/* including generic/dom_parser_implementation.h for arm64: #include <generic/dom_parser_implementation.h> */
|
|
/* begin file generic/dom_parser_implementation.h for arm64 */
|
|
#ifndef SIMDJSON_SRC_GENERIC_DOM_PARSER_IMPLEMENTATION_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_DOM_PARSER_IMPLEMENTATION_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/generic/dom_parser_implementation.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
// Interface a dom parser implementation must fulfill
|
|
namespace simdjson {
|
|
namespace arm64 {
|
|
namespace {
|
|
|
|
simdjson_inline simd8<uint8_t> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3);
|
|
simdjson_inline bool is_ascii(const simd8x64<uint8_t>& input);
|
|
|
|
} // unnamed namespace
|
|
} // namespace arm64
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_DOM_PARSER_IMPLEMENTATION_H
|
|
/* end file generic/dom_parser_implementation.h for arm64 */
|
|
/* including generic/json_character_block.h for arm64: #include <generic/json_character_block.h> */
|
|
/* begin file generic/json_character_block.h for arm64 */
|
|
#ifndef SIMDJSON_SRC_GENERIC_JSON_CHARACTER_BLOCK_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_JSON_CHARACTER_BLOCK_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/base.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace arm64 {
|
|
namespace {
|
|
|
|
struct json_character_block {
|
|
static simdjson_inline json_character_block classify(const simd::simd8x64<uint8_t>& in);
|
|
|
|
simdjson_inline uint64_t whitespace() const noexcept { return _whitespace; }
|
|
simdjson_inline uint64_t op() const noexcept { return _op; }
|
|
simdjson_inline uint64_t scalar() const noexcept { return ~(op() | whitespace()); }
|
|
|
|
uint64_t _whitespace;
|
|
uint64_t _op;
|
|
};
|
|
|
|
} // unnamed namespace
|
|
} // namespace arm64
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_JSON_CHARACTER_BLOCK_H
|
|
/* end file generic/json_character_block.h for arm64 */
|
|
/* end file generic/amalgamated.h for arm64 */
|
|
/* including generic/stage1/amalgamated.h for arm64: #include <generic/stage1/amalgamated.h> */
|
|
/* begin file generic/stage1/amalgamated.h for arm64 */
|
|
// Stuff other things depend on
|
|
/* including generic/stage1/base.h for arm64: #include <generic/stage1/base.h> */
|
|
/* begin file generic/stage1/base.h for arm64 */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_BASE_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_BASE_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/base.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace arm64 {
|
|
namespace {
|
|
namespace stage1 {
|
|
|
|
class bit_indexer;
|
|
template<size_t STEP_SIZE>
|
|
struct buf_block_reader;
|
|
struct json_block;
|
|
class json_minifier;
|
|
class json_scanner;
|
|
struct json_string_block;
|
|
class json_string_scanner;
|
|
class json_structural_indexer;
|
|
|
|
} // namespace stage1
|
|
|
|
namespace utf8_validation {
|
|
struct utf8_checker;
|
|
} // namespace utf8_validation
|
|
|
|
using utf8_validation::utf8_checker;
|
|
|
|
} // unnamed namespace
|
|
} // namespace arm64
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE1_BASE_H
|
|
/* end file generic/stage1/base.h for arm64 */
|
|
/* including generic/stage1/buf_block_reader.h for arm64: #include <generic/stage1/buf_block_reader.h> */
|
|
/* begin file generic/stage1/buf_block_reader.h for arm64 */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_BUF_BLOCK_READER_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_BUF_BLOCK_READER_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/base.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
#include <cstring>
|
|
|
|
namespace simdjson {
|
|
namespace arm64 {
|
|
namespace {
|
|
namespace stage1 {
|
|
|
|
// Walks through a buffer in block-sized increments, loading the last part with spaces
|
|
template<size_t STEP_SIZE>
|
|
struct buf_block_reader {
|
|
public:
|
|
simdjson_inline buf_block_reader(const uint8_t *_buf, size_t _len);
|
|
simdjson_inline size_t block_index();
|
|
simdjson_inline bool has_full_block() const;
|
|
simdjson_inline const uint8_t *full_block() const;
|
|
/**
|
|
* Get the last block, padded with spaces.
|
|
*
|
|
* There will always be a last block, with at least 1 byte, unless len == 0 (in which case this
|
|
* function fills the buffer with spaces and returns 0. In particular, if len == STEP_SIZE there
|
|
* will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding.
|
|
*
|
|
* @return the number of effective characters in the last block.
|
|
*/
|
|
simdjson_inline size_t get_remainder(uint8_t *dst) const;
|
|
simdjson_inline void advance();
|
|
private:
|
|
const uint8_t *buf;
|
|
const size_t len;
|
|
const size_t lenminusstep;
|
|
size_t idx;
|
|
};
|
|
|
|
// Routines to print masks and text for debugging bitmask operations
|
|
simdjson_unused static char * format_input_text_64(const uint8_t *text) {
|
|
static char buf[sizeof(simd8x64<uint8_t>) + 1];
|
|
for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
|
|
buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]);
|
|
}
|
|
buf[sizeof(simd8x64<uint8_t>)] = '\0';
|
|
return buf;
|
|
}
|
|
|
|
// Routines to print masks and text for debugging bitmask operations
|
|
simdjson_unused static char * format_input_text(const simd8x64<uint8_t>& in) {
|
|
static char buf[sizeof(simd8x64<uint8_t>) + 1];
|
|
in.store(reinterpret_cast<uint8_t*>(buf));
|
|
for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
|
|
if (buf[i] < ' ') { buf[i] = '_'; }
|
|
}
|
|
buf[sizeof(simd8x64<uint8_t>)] = '\0';
|
|
return buf;
|
|
}
|
|
|
|
// Debugging aid: renders only the bytes of a 64-byte SIMD block that are selected by mask.
// Selected bytes that compare at or below ' ' as a signed 8-bit value (space, control characters
// and bytes >= 0x80) are shown as '_'; bytes whose mask bit is clear are shown as ' '.
// The result lives in a function-local static buffer that the next call overwrites.
simdjson_unused static char * format_input_text(const simd8x64<uint8_t>& in, uint64_t mask) {
  static char buf[sizeof(simd8x64<uint8_t>) + 1];
  in.store(reinterpret_cast<uint8_t*>(buf));
  for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
    // Compare as int8_t so the result does not depend on whether plain char is signed.
    if (int8_t(buf[i]) <= ' ') { buf[i] = '_'; }
    // Shift a 64-bit one: size_t(1) << i is undefined for i >= 32 where size_t is 32 bits wide.
    if (!(mask & (uint64_t(1) << i))) { buf[i] = ' '; }
  }
  buf[sizeof(simd8x64<uint8_t>)] = '\0';
  return buf;
}
|
|
|
|
// Debugging aid: renders a 64-bit mask as 64 characters, 'X' for a set bit and ' ' for a clear
// bit, with bit 0 first. The result lives in a function-local static buffer that the next call
// overwrites.
simdjson_unused static char * format_mask(uint64_t mask) {
  // The buffer is sized from the number of mask bits, the same bound the loop uses.
  constexpr size_t MASK_BITS = 64;
  static char buf[MASK_BITS + 1];
  for (size_t i=0; i<MASK_BITS; i++) {
    // Shift a 64-bit one: size_t(1) << i is undefined for i >= 32 where size_t is 32 bits wide.
    buf[i] = (mask & (uint64_t(1) << i)) ? 'X' : ' ';
  }
  buf[MASK_BITS] = '\0';
  return buf;
}
|
|
|
|
template<size_t STEP_SIZE>
|
|
simdjson_inline buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {}
|
|
|
|
template<size_t STEP_SIZE>
|
|
simdjson_inline size_t buf_block_reader<STEP_SIZE>::block_index() { return idx; }
|
|
|
|
template<size_t STEP_SIZE>
|
|
simdjson_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const {
|
|
return idx < lenminusstep;
|
|
}
|
|
|
|
template<size_t STEP_SIZE>
|
|
simdjson_inline const uint8_t *buf_block_reader<STEP_SIZE>::full_block() const {
|
|
return &buf[idx];
|
|
}
|
|
|
|
template<size_t STEP_SIZE>
|
|
simdjson_inline size_t buf_block_reader<STEP_SIZE>::get_remainder(uint8_t *dst) const {
|
|
if(len == idx) { return 0; } // memcpy(dst, null, 0) will trigger an error with some sanitizers
|
|
std::memset(dst, 0x20, STEP_SIZE); // std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once.
|
|
std::memcpy(dst, buf + idx, len - idx);
|
|
return len - idx;
|
|
}
|
|
|
|
template<size_t STEP_SIZE>
|
|
simdjson_inline void buf_block_reader<STEP_SIZE>::advance() {
|
|
idx += STEP_SIZE;
|
|
}
|
|
|
|
} // namespace stage1
|
|
} // unnamed namespace
|
|
} // namespace arm64
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE1_BUF_BLOCK_READER_H
|
|
/* end file generic/stage1/buf_block_reader.h for arm64 */
|
|
/* including generic/stage1/json_escape_scanner.h for arm64: #include <generic/stage1/json_escape_scanner.h> */
|
|
/* begin file generic/stage1/json_escape_scanner.h for arm64 */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_JSON_ESCAPE_SCANNER_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_JSON_ESCAPE_SCANNER_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/buf_block_reader.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace arm64 {
|
|
namespace {
|
|
namespace stage1 {
|
|
|
|
/**
|
|
* Scans for escape characters in JSON, taking care with multiple backslashes (\\n vs. \n).
|
|
*/
|
|
struct json_escape_scanner {
|
|
/** The actual escape characters (the backslashes themselves). */
|
|
uint64_t next_is_escaped = 0ULL;
|
|
|
|
struct escaped_and_escape {
|
|
/**
|
|
* Mask of escaped characters.
|
|
*
|
|
* ```
|
|
* \n \\n \\\n \\\\n \
|
|
* 0100100010100101000
|
|
* n \ \ n \ \
|
|
* ```
|
|
*/
|
|
uint64_t escaped;
|
|
/**
|
|
* Mask of escape characters.
|
|
*
|
|
* ```
|
|
* \n \\n \\\n \\\\n \
|
|
* 1001000101001010001
|
|
* \ \ \ \ \ \ \
|
|
* ```
|
|
*/
|
|
uint64_t escape;
|
|
};
|
|
|
|
/**
|
|
* Get a mask of both escape and escaped characters (the characters following a backslash).
|
|
*
|
|
* @param potential_escape A mask of the character that can escape others (but could be
|
|
* escaped itself). e.g. block.eq('\\')
|
|
*/
|
|
simdjson_really_inline escaped_and_escape next(uint64_t backslash) noexcept {
|
|
|
|
#if !SIMDJSON_SKIP_BACKSLASH_SHORT_CIRCUIT
|
|
if (!backslash) { return {next_escaped_without_backslashes(), 0}; }
|
|
#endif
|
|
|
|
// | | Mask (shows characters instead of 1's) | Depth | Instructions |
|
|
// |--------------------------------|----------------------------------------|-------|---------------------|
|
|
// | string | `\\n_\\\n___\\\n___\\\\___\\\\__\\\` | | |
|
|
// | | ` even odd even odd odd` | | |
|
|
// | potential_escape | ` \ \\\ \\\ \\\\ \\\\ \\\` | 1 | 1 (backslash & ~first_is_escaped)
|
|
// | escape_and_terminal_code | ` \n \ \n \ \n \ \ \ \ \ \` | 5 | 5 (next_escape_and_terminal_code())
|
|
// | escaped | `\ \ n \ n \ \ \ \ \ ` X | 6 | 7 (escape_and_terminal_code ^ (potential_escape | first_is_escaped))
|
|
// | escape | ` \ \ \ \ \ \ \ \ \ \` | 6 | 8 (escape_and_terminal_code & backslash)
|
|
// | first_is_escaped | `\ ` | 7 (*) | 9 (escape >> 63) ()
|
|
// (*) this is not needed until the next iteration
|
|
uint64_t escape_and_terminal_code = next_escape_and_terminal_code(backslash & ~this->next_is_escaped);
|
|
uint64_t escaped = escape_and_terminal_code ^ (backslash | this->next_is_escaped);
|
|
uint64_t escape = escape_and_terminal_code & backslash;
|
|
this->next_is_escaped = escape >> 63;
|
|
return {escaped, escape};
|
|
}
|
|
|
|
private:
|
|
static constexpr const uint64_t ODD_BITS = 0xAAAAAAAAAAAAAAAAULL;
|
|
|
|
simdjson_really_inline uint64_t next_escaped_without_backslashes() noexcept {
|
|
uint64_t escaped = this->next_is_escaped;
|
|
this->next_is_escaped = 0;
|
|
return escaped;
|
|
}
|
|
|
|
/**
|
|
* Returns a mask of the next escape characters (masking out escaped backslashes), along with
|
|
* any non-backslash escape codes.
|
|
*
|
|
* \n \\n \\\n \\\\n returns:
|
|
* \n \ \ \n \ \
|
|
* 11 100 1011 10100
|
|
*
|
|
* You are expected to mask out the first bit yourself if the previous block had a trailing
|
|
* escape.
|
|
*
|
|
* & the result with potential_escape to get just the escape characters.
|
|
* ^ the result with (potential_escape | first_is_escaped) to get escaped characters.
|
|
*/
|
|
static simdjson_really_inline uint64_t next_escape_and_terminal_code(uint64_t potential_escape) noexcept {
|
|
// If we were to just shift and mask out any odd bits, we'd actually get a *half* right answer:
|
|
// any even-aligned backslash runs would be correct! Odd-aligned backslash runs would be
|
|
// inverted (\\\ would be 010 instead of 101).
|
|
//
|
|
// ```
|
|
// string: | ____\\\\_\\\\_____ |
|
|
// maybe_escaped | ODD | \ \ \ \ |
|
|
// even-aligned ^^^ ^^^^ odd-aligned
|
|
// ```
|
|
//
|
|
// Taking that into account, our basic strategy is:
|
|
//
|
|
// 1. Use subtraction to produce a mask with 1's for even-aligned runs and 0's for
|
|
// odd-aligned runs.
|
|
// 2. XOR all odd bits, which masks out the odd bits in even-aligned runs, and brings IN the
|
|
// odd bits in odd-aligned runs.
|
|
// 3. & with backslash to clean up any stray bits.
|
|
// runs are set to 0, and then XORing with "odd":
|
|
//
|
|
// | | Mask (shows characters instead of 1's) | Instructions |
|
|
// |--------------------------------|----------------------------------------|---------------------|
|
|
// | string | `\\n_\\\n___\\\n___\\\\___\\\\__\\\` |
|
|
// | | ` even odd even odd odd` |
|
|
// | maybe_escaped | ` n \\n \\n \\\_ \\\_ \\` X | 1 (potential_escape << 1)
|
|
// | maybe_escaped_and_odd | ` \n_ \\n _ \\\n_ _ \\\__ _\\\_ \\\` | 1 (maybe_escaped | odd)
|
|
// | even_series_codes_and_odd | ` n_\\\ _ n_ _\\\\ _ _ ` | 1 (maybe_escaped_and_odd - potential_escape)
|
|
// | escape_and_terminal_code | ` \n \ \n \ \n \ \ \ \ \ \` | 1 (^ odd)
|
|
//
|
|
|
|
// Escaped characters are characters following an escape.
|
|
uint64_t maybe_escaped = potential_escape << 1;
|
|
|
|
// To distinguish odd from even escape sequences, therefore, we turn on any *starting*
|
|
// escapes that are on an odd byte. (We actually bring in all odd bits, for speed.)
|
|
// - Odd runs of backslashes are 0000, and the code at the end ("n" in \n or \\n) is 1.
|
|
// - Odd runs of backslashes are 1111, and the code at the end ("n" in \n or \\n) is 0.
|
|
// - All other odd bytes are 1, and even bytes are 0.
|
|
uint64_t maybe_escaped_and_odd_bits = maybe_escaped | ODD_BITS;
|
|
uint64_t even_series_codes_and_odd_bits = maybe_escaped_and_odd_bits - potential_escape;
|
|
|
|
// Now we flip all odd bytes back with xor. This:
|
|
// - Makes odd runs of backslashes go from 0000 to 1010
|
|
// - Makes even runs of backslashes go from 1111 to 1010
|
|
// - Sets actually-escaped codes to 1 (the n in \n and \\n: \n = 11, \\n = 100)
|
|
// - Resets all other bytes to 0
|
|
return even_series_codes_and_odd_bits ^ ODD_BITS;
|
|
}
|
|
};
|
|
|
|
} // namespace stage1
|
|
} // unnamed namespace
|
|
} // namespace arm64
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE1_JSON_ESCAPE_SCANNER_H
|
|
/* end file generic/stage1/json_escape_scanner.h for arm64 */
|
|
/* including generic/stage1/json_string_scanner.h for arm64: #include <generic/stage1/json_string_scanner.h> */
|
|
/* begin file generic/stage1/json_string_scanner.h for arm64 */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_JSON_STRING_SCANNER_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_JSON_STRING_SCANNER_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/json_escape_scanner.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace arm64 {
|
|
namespace {
|
|
namespace stage1 {
|
|
|
|
struct json_string_block {
|
|
// We spell out the constructors in the hope of resolving inlining issues with Visual Studio 2017
|
|
simdjson_really_inline json_string_block(uint64_t escaped, uint64_t quote, uint64_t in_string) :
|
|
_escaped(escaped), _quote(quote), _in_string(in_string) {}
|
|
|
|
// Escaped characters (characters following an escape() character)
|
|
simdjson_really_inline uint64_t escaped() const { return _escaped; }
|
|
// Real (non-backslashed) quotes
|
|
simdjson_really_inline uint64_t quote() const { return _quote; }
|
|
// Only characters inside the string (not including the quotes)
|
|
simdjson_really_inline uint64_t string_content() const { return _in_string & ~_quote; }
|
|
// Return a mask of whether the given characters are inside a string (only works on non-quotes)
|
|
simdjson_really_inline uint64_t non_quote_inside_string(uint64_t mask) const { return mask & _in_string; }
|
|
// Return a mask of whether the given characters are inside a string (only works on non-quotes)
|
|
simdjson_really_inline uint64_t non_quote_outside_string(uint64_t mask) const { return mask & ~_in_string; }
|
|
// Tail of string (everything except the start quote)
|
|
simdjson_really_inline uint64_t string_tail() const { return _in_string ^ _quote; }
|
|
|
|
// escaped characters (backslashed--does not include the hex characters after \u)
|
|
uint64_t _escaped;
|
|
// real quotes (non-escaped ones)
|
|
uint64_t _quote;
|
|
// string characters (includes start quote but not end quote)
|
|
uint64_t _in_string;
|
|
};
|
|
|
|
// Scans blocks for string characters, storing the state necessary to do so
|
|
class json_string_scanner {
|
|
public:
|
|
simdjson_really_inline json_string_block next(const simd::simd8x64<uint8_t>& in);
|
|
// Returns either UNCLOSED_STRING or SUCCESS
|
|
simdjson_really_inline error_code finish();
|
|
|
|
private:
|
|
// Scans for escape characters
|
|
json_escape_scanner escape_scanner{};
|
|
// Whether the last iteration was still inside a string (all 1's = true, all 0's = false).
|
|
uint64_t prev_in_string = 0ULL;
|
|
};
|
|
|
|
//
|
|
// Return a mask of all string characters plus end quotes.
|
|
//
|
|
// prev_escaped is overflow saying whether the next character is escaped.
|
|
// prev_in_string is overflow saying whether we're still in a string.
|
|
//
|
|
// Backslash sequences outside of quotes will be detected in stage 2.
|
|
//
|
|
simdjson_really_inline json_string_block json_string_scanner::next(const simd::simd8x64<uint8_t>& in) {
|
|
const uint64_t backslash = in.eq('\\');
|
|
const uint64_t escaped = escape_scanner.next(backslash).escaped;
|
|
const uint64_t quote = in.eq('"') & ~escaped;
|
|
|
|
//
|
|
// prefix_xor flips on bits inside the string (and flips off the end quote).
|
|
//
|
|
// Then we xor with prev_in_string: if we were in a string already, its effect is flipped
|
|
// (characters inside strings are outside, and characters outside strings are inside).
|
|
//
|
|
const uint64_t in_string = prefix_xor(quote) ^ prev_in_string;
|
|
|
|
//
|
|
// Check if we're still in a string at the end of the box so the next block will know
|
|
//
|
|
prev_in_string = uint64_t(static_cast<int64_t>(in_string) >> 63);
|
|
|
|
// Use ^ to turn the beginning quote off, and the end quote on.
|
|
|
|
// We are returning a function-local object so either we get a move constructor
|
|
// or we get copy elision.
|
|
return json_string_block(escaped, quote, in_string);
|
|
}
|
|
|
|
// Reports UNCLOSED_STRING when the last scanned block ended inside a string, SUCCESS otherwise.
simdjson_really_inline error_code json_string_scanner::finish() {
  return prev_in_string ? UNCLOSED_STRING : SUCCESS;
}
|
|
|
|
} // namespace stage1
|
|
} // unnamed namespace
|
|
} // namespace arm64
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE1_JSON_STRING_SCANNER_H
|
|
/* end file generic/stage1/json_string_scanner.h for arm64 */
|
|
/* including generic/stage1/utf8_lookup4_algorithm.h for arm64: #include <generic/stage1/utf8_lookup4_algorithm.h> */
|
|
/* begin file generic/stage1/utf8_lookup4_algorithm.h for arm64 */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_UTF8_LOOKUP4_ALGORITHM_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_UTF8_LOOKUP4_ALGORITHM_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/dom_parser_implementation.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace arm64 {
|
|
namespace {
|
|
namespace utf8_validation {
|
|
|
|
using namespace simd;
|
|
|
|
// Classifies every pair (previous byte, current byte) through three 16-entry nibble lookups and
// returns, per byte, the error classes on which all three lookups agree. A zero byte in the
// result means that no two-byte special case was detected at that position.
simdjson_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
  // One bit per error class; the patterns show (byte 1, byte 2).
  constexpr const uint8_t TOO_SHORT      = 1<<0; // 11______ 0_______
                                                 // 11______ 11______
                                                 // (lead byte followed by ASCII or another lead byte)
  constexpr const uint8_t TOO_LONG       = 1<<1; // 0_______ 10______ (ASCII followed by continuation)
  constexpr const uint8_t OVERLONG_3     = 1<<2; // 11100000 100_____
  constexpr const uint8_t TOO_LARGE      = 1<<3; // 11110100 1001____
                                                 // 11110100 101_____
                                                 // 11110101 1001____
                                                 // 11110101 101_____
                                                 // 1111011_ 1001____
                                                 // 1111011_ 101_____
                                                 // 11111___ 1001____
                                                 // 11111___ 101_____
  constexpr const uint8_t SURROGATE      = 1<<4; // 11101101 101_____
  constexpr const uint8_t OVERLONG_2     = 1<<5; // 1100000_ 10______
  constexpr const uint8_t TOO_LARGE_1000 = 1<<6; // 11110101 1000____
                                                 // 1111011_ 1000____
                                                 // 11111___ 1000____
  constexpr const uint8_t OVERLONG_4     = 1<<6; // 11110000 1000____ (shares its bit with TOO_LARGE_1000)
  constexpr const uint8_t TWO_CONTS      = 1<<7; // 10______ 10______

  // Classes that do not depend on the low nibble of byte 1 (they all have ____ there).
  constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS;
  // Low nibbles of byte 1 that are too large for a 4-byte lead, whatever byte 2 is.
  constexpr const uint8_t CARRY_TOO_LARGE = CARRY | TOO_LARGE | TOO_LARGE_1000;
  // Classes shared by every continuation value of byte 2.
  constexpr const uint8_t ANY_CONTINUATION = TOO_LONG | OVERLONG_2 | TWO_CONTS;

  // Lookup on the high nibble of byte 1.
  const simd8<uint8_t> from_prev_high = prev1.shr<4>().lookup_16<uint8_t>(
    // 0_______ ________ <ASCII in byte 1>
    TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
    TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
    // 10______ ________ <continuation in byte 1>
    TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
    // 1100____ ________ <two byte lead in byte 1>
    TOO_SHORT | OVERLONG_2,
    // 1101____ ________ <two byte lead in byte 1>
    TOO_SHORT,
    // 1110____ ________ <three byte lead in byte 1>
    TOO_SHORT | OVERLONG_3 | SURROGATE,
    // 1111____ ________ <four+ byte lead in byte 1>
    TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
  );

  // Lookup on the low nibble of byte 1.
  const simd8<uint8_t> from_prev_low = (prev1 & 0x0F).lookup_16<uint8_t>(
    // ____0000 ________
    CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
    // ____0001 ________
    CARRY | OVERLONG_2,
    // ____001_ ________
    CARRY,
    CARRY,

    // ____0100 ________
    CARRY | TOO_LARGE,
    // ____0101 ________
    CARRY_TOO_LARGE,
    // ____011_ ________
    CARRY_TOO_LARGE,
    CARRY_TOO_LARGE,

    // ____1000 .. ____1100 ________
    CARRY_TOO_LARGE,
    CARRY_TOO_LARGE,
    CARRY_TOO_LARGE,
    CARRY_TOO_LARGE,
    CARRY_TOO_LARGE,
    // ____1101 ________
    CARRY_TOO_LARGE | SURROGATE,
    // ____111_ ________
    CARRY_TOO_LARGE,
    CARRY_TOO_LARGE
  );

  // Lookup on the high nibble of byte 2.
  const simd8<uint8_t> from_input_high = input.shr<4>().lookup_16<uint8_t>(
    // ________ 0_______ <ASCII in byte 2>
    TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
    TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,

    // ________ 1000____
    ANY_CONTINUATION | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
    // ________ 1001____
    ANY_CONTINUATION | OVERLONG_3 | TOO_LARGE,
    // ________ 101_____
    ANY_CONTINUATION | SURROGATE | TOO_LARGE,
    ANY_CONTINUATION | SURROGATE | TOO_LARGE,

    // ________ 11______
    TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
  );

  // An error class is reported only when all three lookups flag it.
  return (from_prev_high & from_prev_low & from_input_high);
}
|
|
// Combines the special-case result sc with the positions where must_be_2_3_continuation()
// reports a required continuation, looking two and three bytes back. Only the top bit (0x80)
// of that report is kept before it is XORed into sc.
simdjson_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
    const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
  simd8<uint8_t> two_back = input.prev<2>(prev_input);
  simd8<uint8_t> three_back = input.prev<3>(prev_input);
  simd8<uint8_t> required_continuation = must_be_2_3_continuation(two_back, three_back);
  simd8<uint8_t> required_top_bit = required_continuation & uint8_t(0x80);
  return required_top_bit ^ sc;
}
|
|
|
|
//
|
|
// Return nonzero if there are incomplete multibyte characters at the end of the block:
|
|
// e.g. if there is a 4-byte character, but it's 3 bytes from the end.
|
|
//
|
|
simdjson_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input) {
|
|
// If the previous input's last 3 bytes match this, they're too short (they ended at EOF):
|
|
// ... 1111____ 111_____ 11______
|
|
#if SIMDJSON_IMPLEMENTATION_ICELAKE
|
|
static const uint8_t max_array[64] = {
|
|
255, 255, 255, 255, 255, 255, 255, 255,
|
|
255, 255, 255, 255, 255, 255, 255, 255,
|
|
255, 255, 255, 255, 255, 255, 255, 255,
|
|
255, 255, 255, 255, 255, 255, 255, 255,
|
|
255, 255, 255, 255, 255, 255, 255, 255,
|
|
255, 255, 255, 255, 255, 255, 255, 255,
|
|
255, 255, 255, 255, 255, 255, 255, 255,
|
|
255, 255, 255, 255, 255, 0xf0u-1, 0xe0u-1, 0xc0u-1
|
|
};
|
|
#else
|
|
static const uint8_t max_array[32] = {
|
|
255, 255, 255, 255, 255, 255, 255, 255,
|
|
255, 255, 255, 255, 255, 255, 255, 255,
|
|
255, 255, 255, 255, 255, 255, 255, 255,
|
|
255, 255, 255, 255, 255, 0xf0u-1, 0xe0u-1, 0xc0u-1
|
|
};
|
|
#endif
|
|
const simd8<uint8_t> max_value(&max_array[sizeof(max_array)-sizeof(simd8<uint8_t>)]);
|
|
return input.gt_bits(max_value);
|
|
}
|
|
|
|
struct utf8_checker {
|
|
// If this is nonzero, there has been a UTF-8 error.
|
|
simd8<uint8_t> error;
|
|
// The last input we received
|
|
simd8<uint8_t> prev_input_block;
|
|
// Whether the last input we received was incomplete (used for ASCII fast path)
|
|
simd8<uint8_t> prev_incomplete;
|
|
|
|
//
|
|
// Check whether the current bytes are valid UTF-8.
|
|
//
|
|
simdjson_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
|
|
// Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
|
|
// (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
|
|
simd8<uint8_t> prev1 = input.prev<1>(prev_input);
|
|
simd8<uint8_t> sc = check_special_cases(input, prev1);
|
|
this->error |= check_multibyte_lengths(input, prev_input, sc);
|
|
}
|
|
|
|
// The only problem that can happen at EOF is that a multibyte character is too short
|
|
// or a byte value too large in the last bytes: check_special_cases only checks for bytes
|
|
// too large in the first of two bytes.
|
|
simdjson_inline void check_eof() {
|
|
// If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
|
|
// possibly finish them.
|
|
this->error |= this->prev_incomplete;
|
|
}
|
|
|
|
simdjson_inline void check_next_input(const simd8x64<uint8_t>& input) {
|
|
if(simdjson_likely(is_ascii(input))) {
|
|
this->error |= this->prev_incomplete;
|
|
} else {
|
|
// you might think that a for-loop would work, but under Visual Studio, it is not good enough.
|
|
static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 1)
|
|
||(simd8x64<uint8_t>::NUM_CHUNKS == 2)
|
|
|| (simd8x64<uint8_t>::NUM_CHUNKS == 4),
|
|
"We support one, two or four chunks per 64-byte block.");
|
|
SIMDJSON_IF_CONSTEXPR (simd8x64<uint8_t>::NUM_CHUNKS == 1) {
|
|
this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
|
|
} else SIMDJSON_IF_CONSTEXPR (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
|
|
this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
|
|
this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
|
|
} else SIMDJSON_IF_CONSTEXPR (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
|
|
this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
|
|
this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
|
|
this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
|
|
this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
|
|
}
|
|
this->prev_incomplete = is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1]);
|
|
this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1];
|
|
}
|
|
}
|
|
// do not forget to call check_eof!
|
|
simdjson_inline error_code errors() {
|
|
return this->error.any_bits_set_anywhere() ? error_code::UTF8_ERROR : error_code::SUCCESS;
|
|
}
|
|
|
|
}; // struct utf8_checker
|
|
} // namespace utf8_validation
|
|
|
|
} // unnamed namespace
|
|
} // namespace arm64
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE1_UTF8_LOOKUP4_ALGORITHM_H
|
|
/* end file generic/stage1/utf8_lookup4_algorithm.h for arm64 */
|
|
/* including generic/stage1/json_scanner.h for arm64: #include <generic/stage1/json_scanner.h> */
|
|
/* begin file generic/stage1/json_scanner.h for arm64 */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_JSON_SCANNER_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_JSON_SCANNER_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/json_character_block.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/json_string_scanner.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace arm64 {
|
|
namespace {
|
|
namespace stage1 {
|
|
|
|
/**
|
|
* A block of scanned json, with information on operators and scalars.
|
|
*
|
|
* We seek to identify pseudo-structural characters. Anything that is inside
|
|
* a string must be omitted (hence & ~_string.string_tail()).
|
|
* Otherwise, pseudo-structural characters come in two forms.
|
|
* 1. We have the structural characters ([,],{,},:, comma). The
|
|
* term 'structural character' is from the JSON RFC.
|
|
* 2. We have the 'scalar pseudo-structural characters'.
|
|
* Scalars are quotes, and any character except structural characters and white space.
|
|
*
|
|
* To identify the scalar pseudo-structural characters, we must look at what comes
|
|
* before them: it must be a space, a quote or a structural characters.
|
|
* Starting with simdjson v0.3, we identify them by
|
|
* negation: we identify everything that is followed by a non-quote scalar,
|
|
* and we negate that. Whatever remains must be a 'scalar pseudo-structural character'.
|
|
*/
|
|
struct json_block {
|
|
public:
|
|
// We spell out the constructors in the hope of resolving inlining issues with Visual Studio 2017
|
|
simdjson_inline json_block(json_string_block&& string, json_character_block characters, uint64_t follows_potential_nonquote_scalar) :
|
|
_string(std::move(string)), _characters(characters), _follows_potential_nonquote_scalar(follows_potential_nonquote_scalar) {}
|
|
simdjson_inline json_block(json_string_block string, json_character_block characters, uint64_t follows_potential_nonquote_scalar) :
|
|
_string(string), _characters(characters), _follows_potential_nonquote_scalar(follows_potential_nonquote_scalar) {}
|
|
|
|
/**
|
|
* The start of structurals.
|
|
* In simdjson prior to v0.3, these were called the pseudo-structural characters.
|
|
**/
|
|
simdjson_inline uint64_t structural_start() const noexcept { return potential_structural_start() & ~_string.string_tail(); }
|
|
/** All JSON whitespace (i.e. not in a string) */
|
|
simdjson_inline uint64_t whitespace() const noexcept { return non_quote_outside_string(_characters.whitespace()); }
|
|
|
|
// Helpers
|
|
|
|
/** Whether the given characters are inside a string (only works on non-quotes) */
|
|
simdjson_inline uint64_t non_quote_inside_string(uint64_t mask) const noexcept { return _string.non_quote_inside_string(mask); }
|
|
/** Whether the given characters are outside a string (only works on non-quotes) */
|
|
simdjson_inline uint64_t non_quote_outside_string(uint64_t mask) const noexcept { return _string.non_quote_outside_string(mask); }
|
|
|
|
// string and escape characters
|
|
json_string_block _string;
|
|
// whitespace, structural characters ('operators'), scalars
|
|
json_character_block _characters;
|
|
// whether the previous character was a scalar
|
|
uint64_t _follows_potential_nonquote_scalar;
|
|
private:
|
|
// Potential structurals (i.e. disregarding strings)
|
|
|
|
/**
|
|
* structural elements ([,],{,},:, comma) plus scalar starts like 123, true and "abc".
|
|
* They may reside inside a string.
|
|
**/
|
|
simdjson_inline uint64_t potential_structural_start() const noexcept { return _characters.op() | potential_scalar_start(); }
|
|
/**
|
|
* The start of non-operator runs, like 123, true and "abc".
|
|
* It main reside inside a string.
|
|
**/
|
|
simdjson_inline uint64_t potential_scalar_start() const noexcept {
|
|
// The term "scalar" refers to anything except structural characters and white space
|
|
// (so letters, numbers, quotes).
|
|
// Whenever it is preceded by something that is not a structural element ({,},[,],:, ") nor a white-space
|
|
// then we know that it is irrelevant structurally.
|
|
return _characters.scalar() & ~follows_potential_scalar();
|
|
}
|
|
/**
|
|
* Whether the given character is immediately after a non-operator like 123, true.
|
|
* The characters following a quote are not included.
|
|
*/
|
|
simdjson_inline uint64_t follows_potential_scalar() const noexcept {
|
|
// _follows_potential_nonquote_scalar: is defined as marking any character that follows a character
|
|
// that is not a structural element ({,},[,],:, comma) nor a quote (") and that is not a
|
|
// white space.
|
|
// It is understood that within quoted region, anything at all could be marked (irrelevant).
|
|
return _follows_potential_nonquote_scalar;
|
|
}
|
|
};
|
|
|
|
/**
|
|
* Scans JSON for important bits: structural characters or 'operators', strings, and scalars.
|
|
*
|
|
* The scanner starts by calculating two distinct things:
|
|
* - string characters (taking \" into account)
|
|
* - structural characters or 'operators' ([]{},:, comma)
|
|
* and scalars (runs of non-operators like 123, true and "abc")
|
|
*
|
|
* To minimize data dependency (a key component of the scanner's speed), it finds these in parallel:
|
|
* in particular, the operator/scalar bit will find plenty of things that are actually part of
|
|
* strings. When we're done, json_block will fuse the two together by masking out tokens that are
|
|
* part of a string.
|
|
*/
|
|
class json_scanner {
|
|
public:
|
|
json_scanner() = default;
|
|
simdjson_inline json_block next(const simd::simd8x64<uint8_t>& in);
|
|
// Returns either UNCLOSED_STRING or SUCCESS
|
|
simdjson_inline error_code finish();
|
|
|
|
private:
|
|
// Whether the last character of the previous iteration is part of a scalar token
|
|
// (anything except whitespace or a structural character/'operator').
|
|
uint64_t prev_scalar = 0ULL;
|
|
json_string_scanner string_scanner{};
|
|
};
|
|
|
|
|
|
//
|
|
// Check if the current character immediately follows a matching character.
|
|
//
|
|
// For example, this checks for quotes with backslashes in front of them:
|
|
//
|
|
// const uint64_t backslashed_quote = in.eq('"') & immediately_follows(in.eq('\'), prev_backslash);
|
|
//
|
|
simdjson_inline uint64_t follows(const uint64_t match, uint64_t &overflow) {
|
|
const uint64_t result = match << 1 | overflow;
|
|
overflow = match >> 63;
|
|
return result;
|
|
}
|
|
|
|
simdjson_inline json_block json_scanner::next(const simd::simd8x64<uint8_t>& in) {
  // String information for this input (the scanner carries state between calls).
  json_string_block string_info = string_scanner.next(in);
  // Identifies the white-space and the structural characters.
  json_character_block char_classes = json_character_block::classify(in);

  // The term "scalar" refers to anything except structural characters and white space
  // (so letters, numbers, quotes). We want to mark anything that follows a
  // non-quote scalar (so letters and numbers).
  //
  // A terminal quote should either be followed by a structural character
  // (comma, brace, bracket, colon) or nothing. However, we still want
  // ' "a string"true ' to mark the 't' of 'true' as a potential
  // pseudo-structural character just like we would if we had
  // ' "a string" true '; otherwise we may need to add an extra check when
  // parsing strings.
  //
  // Performance: there are many ways to skin this cat.
  const uint64_t scalar_without_quotes = char_classes.scalar() & ~string_info.quote();
  const uint64_t after_nonquote_scalar = follows(scalar_without_quotes, prev_scalar);

  // The arguments are function-local objects, so either they are moved or the
  // copy is elided.
  return json_block(string_info, char_classes, after_nonquote_scalar);
}
|
|
|
|
/** End-of-input check: forwards the result of the string scanner's finish(). */
simdjson_inline error_code json_scanner::finish() {
  const error_code string_status = string_scanner.finish();
  return string_status;
}
|
|
|
|
} // namespace stage1
|
|
} // unnamed namespace
|
|
} // namespace arm64
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE1_JSON_SCANNER_H
|
|
/* end file generic/stage1/json_scanner.h for arm64 */
|
|
|
|
// All other declarations
|
|
/* including generic/stage1/find_next_document_index.h for arm64: #include <generic/stage1/find_next_document_index.h> */
|
|
/* begin file generic/stage1/find_next_document_index.h for arm64 */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_FIND_NEXT_DOCUMENT_INDEX_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_FIND_NEXT_DOCUMENT_INDEX_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/generic/dom_parser_implementation.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace arm64 {
|
|
namespace {
|
|
namespace stage1 {
|
|
|
|
/**
|
|
* This algorithm is used to quickly identify the last structural position that
|
|
* makes up a complete document.
|
|
*
|
|
* It does this by going backwards and finding the last *document boundary* (a
|
|
* place where one value follows another without a comma between them). If the
|
|
* last document (the characters after the boundary) has an equal number of
|
|
* start and end brackets, it is considered complete.
|
|
*
|
|
* Simply put, we iterate over the structural characters, starting from
|
|
* the end. We consider that we found the end of a JSON document when the
|
|
* first element of the pair is NOT one of these characters: '{' '[' ':' ','
|
|
* and when the second element is NOT one of these characters: '}' ']' ':' ','.
|
|
*
|
|
* This simple comparison works most of the time, but it does not cover cases
|
|
* where the batch's structural indexes contain a perfect amount of documents.
|
|
* In such a case, we do not have access to the structural index which follows
|
|
* the last document, therefore, we do not have access to the second element in
|
|
* the pair, and that means we cannot identify the last document. To fix this
|
|
* issue, we keep a count of the open and closed curly/square braces we found
|
|
* while searching for the pair. When we find a pair AND the count of open and
|
|
* closed curly/square braces is the same, we know that we just passed a
|
|
* complete document, therefore the last json buffer location is the end of the
|
|
* batch.
|
|
*/
|
|
simdjson_inline uint32_t find_next_document_index(dom_parser_implementation &parser) {
  // Variant: do not count separately, just figure out depth
  if (parser.n_structural_indexes == 0) { return 0; }

  // Bracket balance seen so far while walking backwards:
  // an opener adds one, a closer removes one.
  int array_balance = 0;
  int object_balance = 0;

  for (auto pos = parser.n_structural_indexes - 1; pos > 0; pos--) {
    const auto current = parser.buf[parser.structural_indexes[pos]];
    // The second element of a boundary pair cannot be ':' ',' '}' or ']'.
    if (current == ':' || current == ',') { continue; }
    if (current == '}') { object_balance--; continue; }
    if (current == ']') { array_balance--; continue; }
    if (current == '{') {
      object_balance++;
    } else if (current == '[') {
      array_balance++;
    }

    // The first element of a boundary pair cannot be '{' '[' ':' or ','.
    const auto previous = parser.buf[parser.structural_indexes[pos - 1]];
    if (previous == '{' || previous == '[' || previous == ':' || previous == ',') {
      continue;
    }

    // We found a document boundary between pos - 1 and pos.
    if (array_balance == 0 && object_balance == 0) {
      // Last document is complete, so the next document will appear after!
      return parser.n_structural_indexes;
    }
    // Last document is incomplete; it starts at structural index pos.
    return pos;
  }

  // If we made it to the end, we want to finish counting to see if we have a full document.
  const auto first = parser.buf[parser.structural_indexes[0]];
  if (first == '}') {
    object_balance--;
  } else if (first == ']') {
    array_balance--;
  } else if (first == '{') {
    object_balance++;
  } else if (first == '[') {
    array_balance++;
  }

  if (array_balance == 0 && object_balance == 0) {
    // We have a complete document.
    return parser.n_structural_indexes;
  }
  return 0;
}
|
|
|
|
} // namespace stage1
|
|
} // unnamed namespace
|
|
} // namespace arm64
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE1_FIND_NEXT_DOCUMENT_INDEX_H
|
|
/* end file generic/stage1/find_next_document_index.h for arm64 */
|
|
/* including generic/stage1/json_minifier.h for arm64: #include <generic/stage1/json_minifier.h> */
|
|
/* begin file generic/stage1/json_minifier.h for arm64 */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_JSON_MINIFIER_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_JSON_MINIFIER_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/json_scanner.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/buf_block_reader.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
// This file contains the common code every implementation uses in stage1
|
|
// It is intended to be included multiple times and compiled multiple times
|
|
// We assume the file in which it is included already includes
// "simdjson/stage1.h" (this simplifies amalgamation)
|
|
|
|
namespace simdjson {
|
|
namespace arm64 {
|
|
namespace {
|
|
namespace stage1 {
|
|
|
|
class json_minifier {
|
|
public:
|
|
template<size_t STEP_SIZE>
|
|
static error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) noexcept;
|
|
|
|
private:
|
|
simdjson_inline json_minifier(uint8_t *_dst)
|
|
: dst{_dst}
|
|
{}
|
|
template<size_t STEP_SIZE>
|
|
simdjson_inline void step(const uint8_t *block_buf, buf_block_reader<STEP_SIZE> &reader) noexcept;
|
|
simdjson_inline void next(const simd::simd8x64<uint8_t>& in, const json_block& block);
|
|
simdjson_inline error_code finish(uint8_t *dst_start, size_t &dst_len);
|
|
json_scanner scanner{};
|
|
uint8_t *dst;
|
|
};
|
|
|
|
/**
 * Compress `in` using the block's JSON whitespace mask (whitespace outside of
 * strings), writing at `dst`, and advance `dst` by the value compress() returns.
 */
simdjson_inline void json_minifier::next(const simd::simd8x64<uint8_t>& in, const json_block& block) {
  const uint64_t whitespace_mask = block.whitespace();
  const auto written = in.compress(whitespace_mask, dst);
  dst += written;
}
|
|
|
|
/**
 * Run the scanner's end-of-input check and report the output size.
 *
 * @param dst_start where the output began
 * @param dst_len   set to the number of bytes written on success, 0 on error
 * @return the scanner's error, or SUCCESS
 */
simdjson_inline error_code json_minifier::finish(uint8_t *dst_start, size_t &dst_len) {
  const error_code status = scanner.finish();
  if (!status) {
    dst_len = dst - dst_start;
    return SUCCESS;
  }
  dst_len = 0;
  return status;
}
|
|
|
|
template<>
|
|
simdjson_inline void json_minifier::step<128>(const uint8_t *block_buf, buf_block_reader<128> &reader) noexcept {
|
|
simd::simd8x64<uint8_t> in_1(block_buf);
|
|
simd::simd8x64<uint8_t> in_2(block_buf+64);
|
|
json_block block_1 = scanner.next(in_1);
|
|
json_block block_2 = scanner.next(in_2);
|
|
this->next(in_1, block_1);
|
|
this->next(in_2, block_2);
|
|
reader.advance();
|
|
}
|
|
|
|
template<>
|
|
simdjson_inline void json_minifier::step<64>(const uint8_t *block_buf, buf_block_reader<64> &reader) noexcept {
|
|
simd::simd8x64<uint8_t> in_1(block_buf);
|
|
json_block block_1 = scanner.next(in_1);
|
|
this->next(block_buf, block_1);
|
|
reader.advance();
|
|
}
|
|
|
|
template<size_t STEP_SIZE>
|
|
error_code json_minifier::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) noexcept {
|
|
buf_block_reader<STEP_SIZE> reader(buf, len);
|
|
json_minifier minifier(dst);
|
|
|
|
// Index the first n-1 blocks
|
|
while (reader.has_full_block()) {
|
|
minifier.step<STEP_SIZE>(reader.full_block(), reader);
|
|
}
|
|
|
|
// Index the last (remainder) block, padded with spaces
|
|
uint8_t block[STEP_SIZE];
|
|
size_t remaining_bytes = reader.get_remainder(block);
|
|
if (remaining_bytes > 0) {
|
|
// We do not want to write directly to the output stream. Rather, we write
|
|
// to a local buffer (for safety).
|
|
uint8_t out_block[STEP_SIZE];
|
|
uint8_t * const guarded_dst{minifier.dst};
|
|
minifier.dst = out_block;
|
|
minifier.step<STEP_SIZE>(block, reader);
|
|
size_t to_write = minifier.dst - out_block;
|
|
// In some cases, we could be enticed to consider the padded spaces
|
|
// as part of the string. This is fine as long as we do not write more
|
|
// than we consumed.
|
|
if(to_write > remaining_bytes) { to_write = remaining_bytes; }
|
|
memcpy(guarded_dst, out_block, to_write);
|
|
minifier.dst = guarded_dst + to_write;
|
|
}
|
|
return minifier.finish(dst, dst_len);
|
|
}
|
|
|
|
} // namespace stage1
|
|
} // unnamed namespace
|
|
} // namespace arm64
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE1_JSON_MINIFIER_H
|
|
/* end file generic/stage1/json_minifier.h for arm64 */
|
|
/* including generic/stage1/json_structural_indexer.h for arm64: #include <generic/stage1/json_structural_indexer.h> */
|
|
/* begin file generic/stage1/json_structural_indexer.h for arm64 */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_JSON_STRUCTURAL_INDEXER_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_JSON_STRUCTURAL_INDEXER_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/utf8_lookup4_algorithm.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/buf_block_reader.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/json_string_scanner.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/json_scanner.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/json_minifier.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/find_next_document_index.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
// This file contains the common code every implementation uses in stage1
|
|
// It is intended to be included multiple times and compiled multiple times
|
|
// We assume the file in which it is included already includes
// "simdjson/stage1.h" (this simplifies amalgamation)
|
|
|
|
namespace simdjson {
|
|
namespace arm64 {
|
|
namespace {
|
|
namespace stage1 {
|
|
|
|
class bit_indexer {
|
|
public:
|
|
uint32_t *tail;
|
|
|
|
simdjson_inline bit_indexer(uint32_t *index_buf) : tail(index_buf) {}
|
|
|
|
#if SIMDJSON_PREFER_REVERSE_BITS
|
|
/**
|
|
* ARM lacks a fast trailing zero instruction, but it has a fast
|
|
* bit reversal instruction and a fast leading zero instruction.
|
|
* Thus it may be profitable to reverse the bits (once) and then
|
|
* to rely on a sequence of instructions that call the leading
|
|
* zero instruction.
|
|
*
|
|
* Performance notes:
|
|
* The chosen routine is not optimal in terms of data dependency
|
|
* since zero_leading_bit might require two instructions. However,
|
|
* it tends to minimize the total number of instructions which is
|
|
* beneficial.
|
|
*/
|
|
simdjson_inline void write_index(uint32_t idx, uint64_t& rev_bits, int i) {
|
|
int lz = leading_zeroes(rev_bits);
|
|
this->tail[i] = static_cast<uint32_t>(idx) + lz;
|
|
rev_bits = zero_leading_bit(rev_bits, lz);
|
|
}
|
|
#else
|
|
/**
|
|
* Under recent x64 systems, we often have both a fast trailing zero
|
|
* instruction and a fast 'clear-lower-bit' instruction so the following
|
|
* algorithm can be competitive.
|
|
*/
|
|
|
|
simdjson_inline void write_index(uint32_t idx, uint64_t& bits, int i) {
|
|
this->tail[i] = idx + trailing_zeroes(bits);
|
|
bits = clear_lowest_bit(bits);
|
|
}
|
|
#endif // SIMDJSON_PREFER_REVERSE_BITS
|
|
|
|
template <int START, int N>
|
|
simdjson_inline int write_indexes(uint32_t idx, uint64_t& bits) {
|
|
write_index(idx, bits, START);
|
|
SIMDJSON_IF_CONSTEXPR (N > 1) {
|
|
write_indexes<(N-1>0?START+1:START), (N-1>=0?N-1:1)>(idx, bits);
|
|
}
|
|
return START+N;
|
|
}
|
|
|
|
template <int START, int END, int STEP>
|
|
simdjson_inline int write_indexes_stepped(uint32_t idx, uint64_t& bits, int cnt) {
|
|
write_indexes<START, STEP>(idx, bits);
|
|
SIMDJSON_IF_CONSTEXPR ((START+STEP) < END) {
|
|
if (simdjson_unlikely((START+STEP) < cnt)) {
|
|
write_indexes_stepped<(START+STEP<END?START+STEP:END), END, STEP>(idx, bits, cnt);
|
|
}
|
|
}
|
|
return ((END-START) % STEP) == 0 ? END : (END-START) - ((END-START) % STEP) + STEP;
|
|
}
|
|
|
|
// flatten out values in 'bits' assuming that they are are to have values of idx
|
|
// plus their position in the bitvector, and store these indexes at
|
|
// base_ptr[base] incrementing base as we go
|
|
// will potentially store extra values beyond end of valid bits, so base_ptr
|
|
// needs to be large enough to handle this
|
|
//
|
|
// If the kernel sets SIMDJSON_GENERIC_JSON_STRUCTURAL_INDEXER_CUSTOM_BIT_INDEXER, then it
|
|
// will provide its own version of the code.
|
|
#ifdef SIMDJSON_GENERIC_JSON_STRUCTURAL_INDEXER_CUSTOM_BIT_INDEXER
|
|
simdjson_inline void write(uint32_t idx, uint64_t bits);
|
|
#else
|
|
simdjson_inline void write(uint32_t idx, uint64_t bits) {
|
|
// In some instances, the next branch is expensive because it is mispredicted.
|
|
// Unfortunately, in other cases,
|
|
// it helps tremendously.
|
|
if (bits == 0)
|
|
return;
|
|
|
|
int cnt = static_cast<int>(count_ones(bits));
|
|
|
|
#if SIMDJSON_PREFER_REVERSE_BITS
|
|
bits = reverse_bits(bits);
|
|
#endif
|
|
#ifdef SIMDJSON_STRUCTURAL_INDEXER_STEP
|
|
static constexpr const int STEP = SIMDJSON_STRUCTURAL_INDEXER_STEP;
|
|
#else
|
|
static constexpr const int STEP = 4;
|
|
#endif
|
|
static constexpr const int STEP_UNTIL = 24;
|
|
|
|
write_indexes_stepped<0, STEP_UNTIL, STEP>(idx, bits, cnt);
|
|
SIMDJSON_IF_CONSTEXPR (STEP_UNTIL < 64) {
|
|
if (simdjson_unlikely(STEP_UNTIL < cnt)) {
|
|
for (int i=STEP_UNTIL; i<cnt; i++) {
|
|
write_index(idx, bits, i);
|
|
}
|
|
}
|
|
}
|
|
|
|
this->tail += cnt;
|
|
}
|
|
#endif // SIMDJSON_GENERIC_JSON_STRUCTURAL_INDEXER_CUSTOM_BIT_INDEXER
|
|
|
|
};
|
|
|
|
class json_structural_indexer {
|
|
public:
|
|
/**
|
|
* Find the important bits of JSON in a 128-byte chunk, and add them to structural_indexes.
|
|
*
|
|
* @param partial Setting the partial parameter to true allows the find_structural_bits to
|
|
* tolerate unclosed strings. The caller should still ensure that the input is valid UTF-8. If
|
|
* you are processing substrings, you may want to call on a function like trimmed_length_safe_utf8.
|
|
*/
|
|
template<size_t STEP_SIZE>
|
|
static error_code index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, stage1_mode partial) noexcept;
|
|
|
|
private:
|
|
simdjson_inline json_structural_indexer(uint32_t *structural_indexes);
|
|
template<size_t STEP_SIZE>
|
|
simdjson_inline void step(const uint8_t *block, buf_block_reader<STEP_SIZE> &reader) noexcept;
|
|
simdjson_inline void next(const simd::simd8x64<uint8_t>& in, const json_block& block, size_t idx);
|
|
simdjson_inline error_code finish(dom_parser_implementation &parser, size_t idx, size_t len, stage1_mode partial);
|
|
|
|
json_scanner scanner{};
|
|
utf8_checker checker{};
|
|
bit_indexer indexer;
|
|
uint64_t prev_structurals = 0;
|
|
uint64_t unescaped_chars_error = 0;
|
|
};
|
|
|
|
/** Point the bit indexer at the start of the structural index buffer. */
simdjson_inline json_structural_indexer::json_structural_indexer(uint32_t *structural_indexes)
  : indexer{structural_indexes}
{}
|
|
|
|
// Skip the last character if it is partial
|
|
simdjson_inline size_t trim_partial_utf8(const uint8_t *buf, size_t len) {
|
|
if (simdjson_unlikely(len < 3)) {
|
|
switch (len) {
|
|
case 2:
|
|
if (buf[len-1] >= 0xc0) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left
|
|
if (buf[len-2] >= 0xe0) { return len-2; } // 3- and 4-byte characters with only 2 bytes left
|
|
return len;
|
|
case 1:
|
|
if (buf[len-1] >= 0xc0) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left
|
|
return len;
|
|
case 0:
|
|
return len;
|
|
}
|
|
}
|
|
if (buf[len-1] >= 0xc0) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left
|
|
if (buf[len-2] >= 0xe0) { return len-2; } // 3- and 4-byte characters with only 1 byte left
|
|
if (buf[len-3] >= 0xf0) { return len-3; } // 4-byte characters with only 3 bytes left
|
|
return len;
|
|
}
|
|
|
|
//
|
|
// PERF NOTES:
|
|
// We pipe 2 inputs through these stages:
|
|
// 1. Load JSON into registers. This takes a long time and is highly parallelizable, so we load
|
|
// 2 inputs' worth at once so that by the time step 2 is looking for the input, it's available.
|
|
// 2. Scan the JSON for critical data: strings, scalars and operators. This is the critical path.
|
|
// The output of step 1 depends entirely on this information. These functions don't quite use
|
|
// up enough CPU: the second half of the functions is highly serial, only using 1 execution core
|
|
// at a time. The second input's scans has some dependency on the first ones finishing it, but
|
|
// they can make a lot of progress before they need that information.
|
|
// 3. Step 1 does not use enough capacity, so we run some extra stuff while we're waiting for that
|
|
// to finish: utf-8 checks and generating the output from the last iteration.
|
|
//
|
|
// The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all
|
|
// available capacity with just one input. Running 2 at a time seems to give the CPU a good enough
|
|
// workout.
|
|
//
|
|
template<size_t STEP_SIZE>
|
|
error_code json_structural_indexer::index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, stage1_mode partial) noexcept {
|
|
if (simdjson_unlikely(len > parser.capacity())) { return CAPACITY; }
|
|
// We guard the rest of the code so that we can assume that len > 0 throughout.
|
|
if (len == 0) { return EMPTY; }
|
|
if (is_streaming(partial)) {
|
|
len = trim_partial_utf8(buf, len);
|
|
// If you end up with an empty window after trimming
|
|
// the partial UTF-8 bytes, then chances are good that you
|
|
// have an UTF-8 formatting error.
|
|
if(len == 0) { return UTF8_ERROR; }
|
|
}
|
|
buf_block_reader<STEP_SIZE> reader(buf, len);
|
|
json_structural_indexer indexer(parser.structural_indexes.get());
|
|
|
|
// Read all but the last block
|
|
while (reader.has_full_block()) {
|
|
indexer.step<STEP_SIZE>(reader.full_block(), reader);
|
|
}
|
|
// Take care of the last block (will always be there unless file is empty which is
|
|
// not supposed to happen.)
|
|
uint8_t block[STEP_SIZE];
|
|
if (simdjson_unlikely(reader.get_remainder(block) == 0)) { return UNEXPECTED_ERROR; }
|
|
indexer.step<STEP_SIZE>(block, reader);
|
|
return indexer.finish(parser, reader.block_index(), len, partial);
|
|
}
|
|
|
|
template<>
|
|
simdjson_inline void json_structural_indexer::step<128>(const uint8_t *block, buf_block_reader<128> &reader) noexcept {
|
|
simd::simd8x64<uint8_t> in_1(block);
|
|
simd::simd8x64<uint8_t> in_2(block+64);
|
|
json_block block_1 = scanner.next(in_1);
|
|
json_block block_2 = scanner.next(in_2);
|
|
this->next(in_1, block_1, reader.block_index());
|
|
this->next(in_2, block_2, reader.block_index()+64);
|
|
reader.advance();
|
|
}
|
|
|
|
template<>
|
|
simdjson_inline void json_structural_indexer::step<64>(const uint8_t *block, buf_block_reader<64> &reader) noexcept {
|
|
simd::simd8x64<uint8_t> in_1(block);
|
|
json_block block_1 = scanner.next(in_1);
|
|
this->next(in_1, block_1, reader.block_index());
|
|
reader.advance();
|
|
}
|
|
|
|
/**
 * Process one 64-byte input located at offset `idx`: feed the UTF-8 checker
 * (when enabled), write out the previous input's structurals, and remember
 * this input's structurals and unescaped-character bits.
 */
simdjson_inline void json_structural_indexer::next(const simd::simd8x64<uint8_t>& in, const json_block& block, size_t idx) {
  // Bytes <= 0x1F
  const uint64_t low_bytes = in.lteq(0x1F);
#if SIMDJSON_UTF8VALIDATION
  checker.check_next_input(in);
#endif
  // Output *last* iteration's structurals to the parser; they belong to the
  // input that started 64 bytes earlier.
  indexer.write(static_cast<uint32_t>(idx - 64), prev_structurals);
  prev_structurals = block.structural_start();
  unescaped_chars_error |= block.non_quote_inside_string(low_bytes);
}
|
|
|
|
/**
 * Flush the pending structurals, validate the scan, pad the structural index
 * array and, in the streaming modes, truncate it to the last complete document.
 *
 * @param parser  receives n_structural_indexes, the padding entries and next_structural_index
 * @param idx     block index reported by the reader after the last step
 * @param len     length of the (possibly trimmed) input
 * @param partial stage 1 mode
 * @return the first error encountered, or the UTF-8 checker's verdict
 */
simdjson_inline error_code json_structural_indexer::finish(dom_parser_implementation &parser, size_t idx, size_t len, stage1_mode partial) {
  // Write out the final iteration's structurals
  indexer.write(static_cast<uint32_t>(idx - 64), prev_structurals);

  const error_code scan_status = scanner.finish();
  const bool streaming = is_streaming(partial);
  const bool have_unclosed_string = (scan_status == UNCLOSED_STRING);
  // When streaming we tolerate UNCLOSED_STRING; otherwise we must have SUCCESS.
  const bool acceptable = (scan_status == SUCCESS) || (streaming && have_unclosed_string);
  if (simdjson_unlikely(!acceptable)) { return scan_status; }

  if (unescaped_chars_error) {
    return UNESCAPED_CHARS;
  }

  parser.n_structural_indexes = uint32_t(indexer.tail - parser.structural_indexes.get());
  /***
   * The On Demand API requires special padding.
   *
   * This is related to https://github.com/simdjson/simdjson/issues/906
   * Basically, we want to make sure that if the parsing continues beyond the last (valid)
   * structural character, it quickly stops.
   * Only three structural characters can be repeated without triggering an error in JSON: [,] and }.
   * We repeat the padding character (at 'len'). We don't know what it is, but if the parsing
   * continues, then it must be [,] or }.
   * Suppose it is ] or }. We backtrack to the first character, what could it be that would
   * not trigger an error? It could be ] or } but no, because you can't start a document that way.
   * It can't be a comma, a colon or any simple value. So the only way we could continue is
   * if the repeated character is [. But if so, the document must start with [. But if the document
   * starts with [, it should end with ]. If we enforce that rule, then we would get
   * ][[ which is invalid.
   *
   * This is illustrated with the test array_iterate_unclosed_error() on the following input:
   * R"({ "a": [,,)"
   **/
  parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len); // used later in partial == stage1_mode::streaming_final
  parser.structural_indexes[parser.n_structural_indexes + 1] = uint32_t(len);
  parser.structural_indexes[parser.n_structural_indexes + 2] = 0;
  parser.next_structural_index = 0;

  // a valid JSON file cannot have zero structural indexes - we should have found something
  if (simdjson_unlikely(parser.n_structural_indexes == 0u)) {
    return EMPTY;
  }
  if (simdjson_unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) {
    return UNEXPECTED_ERROR;
  }

  switch (partial) {
    case stage1_mode::streaming_partial: {
      // If we have an unclosed string, then the last structural
      // will be the quote and we want to make sure to omit it.
      if (have_unclosed_string) {
        parser.n_structural_indexes--;
        // a valid JSON file cannot have zero structural indexes - we should have found something
        if (simdjson_unlikely(parser.n_structural_indexes == 0u)) { return CAPACITY; }
      }
      // We truncate the input to the end of the last complete document (or zero).
      const auto truncated_count = find_next_document_index(parser);
      if (truncated_count == 0 && parser.n_structural_indexes > 0) {
        if (parser.structural_indexes[0] == 0) {
          // If the buffer is partial and we started at index 0 but the document is
          // incomplete, it's too big to parse.
          return CAPACITY;
        }
        // It is possible that the document could be parsed, we just had a lot
        // of white space.
        parser.n_structural_indexes = 0;
        return EMPTY;
      }
      parser.n_structural_indexes = truncated_count;
      break;
    }
    case stage1_mode::streaming_final: {
      if (have_unclosed_string) { parser.n_structural_indexes--; }
      // We truncate the input to the end of the last complete document (or zero).
      // Because partial == stage1_mode::streaming_final, it means that we may
      // silently ignore trailing garbage. Though it sounds bad, we do it
      // deliberately because many people who have streams of JSON documents
      // will truncate them for processing. E.g., imagine that you are uncompressing
      // the data from a size file or receiving it in chunks from the network. You
      // may not know where exactly the last document will be. Meanwhile the
      // document_stream instances allow people to know the JSON documents they are
      // parsing (see the iterator.source() method).
      parser.n_structural_indexes = find_next_document_index(parser);
      // We store the initial n_structural_indexes so that the client can see
      // whether we used truncation. If initial_n_structural_indexes == parser.n_structural_indexes,
      // then this will query parser.structural_indexes[parser.n_structural_indexes] which is len,
      // otherwise, it will copy some prior index.
      parser.structural_indexes[parser.n_structural_indexes + 1] = parser.structural_indexes[parser.n_structural_indexes];
      // This next line is critical, do not change it unless you understand what you are
      // doing.
      parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len);
      if (simdjson_unlikely(parser.n_structural_indexes == 0u)) {
        // We tolerate an unclosed string at the very end of the stream. Indeed, users
        // often load their data in bulk without being careful and they want us to ignore
        // the trailing garbage.
        return EMPTY;
      }
      break;
    }
    default:
      break;
  }

  checker.check_eof();
  return checker.errors();
}
|
|
|
|
} // namespace stage1
|
|
} // unnamed namespace
|
|
} // namespace arm64
|
|
} // namespace simdjson
|
|
|
|
// Clear CUSTOM_BIT_INDEXER so other implementations can set it if they need to.
|
|
#undef SIMDJSON_GENERIC_JSON_STRUCTURAL_INDEXER_CUSTOM_BIT_INDEXER
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE1_JSON_STRUCTURAL_INDEXER_H
|
|
/* end file generic/stage1/json_structural_indexer.h for arm64 */
|
|
/* including generic/stage1/utf8_validator.h for arm64: #include <generic/stage1/utf8_validator.h> */
|
|
/* begin file generic/stage1/utf8_validator.h for arm64 */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_UTF8_VALIDATOR_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_UTF8_VALIDATOR_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/buf_block_reader.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/utf8_lookup4_algorithm.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace arm64 {
|
|
namespace {
|
|
namespace stage1 {
|
|
|
|
/**
|
|
* Validates that the string is actual UTF-8.
|
|
*/
|
|
template<class checker>
|
|
bool generic_validate_utf8(const uint8_t * input, size_t length) {
|
|
checker c{};
|
|
buf_block_reader<64> reader(input, length);
|
|
while (reader.has_full_block()) {
|
|
simd::simd8x64<uint8_t> in(reader.full_block());
|
|
c.check_next_input(in);
|
|
reader.advance();
|
|
}
|
|
uint8_t block[64]{};
|
|
reader.get_remainder(block);
|
|
simd::simd8x64<uint8_t> in(block);
|
|
c.check_next_input(in);
|
|
reader.advance();
|
|
c.check_eof();
|
|
return c.errors() == error_code::SUCCESS;
|
|
}
|
|
|
|
bool generic_validate_utf8(const char * input, size_t length) {
|
|
return generic_validate_utf8<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
|
|
}
|
|
|
|
} // namespace stage1
|
|
} // unnamed namespace
|
|
} // namespace arm64
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE1_UTF8_VALIDATOR_H
|
|
/* end file generic/stage1/utf8_validator.h for arm64 */
|
|
/* end file generic/stage1/amalgamated.h for arm64 */
|
|
/* including generic/stage2/amalgamated.h for arm64: #include <generic/stage2/amalgamated.h> */
|
|
/* begin file generic/stage2/amalgamated.h for arm64 */
|
|
// Stuff other things depend on
|
|
/* including generic/stage2/base.h for arm64: #include <generic/stage2/base.h> */
|
|
/* begin file generic/stage2/base.h for arm64 */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE2_BASE_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE2_BASE_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/base.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace arm64 {
|
|
namespace {
|
|
namespace stage2 {
|
|
|
|
class json_iterator;
|
|
class structural_iterator;
|
|
struct tape_builder;
|
|
struct tape_writer;
|
|
|
|
} // namespace stage2
|
|
} // unnamed namespace
|
|
} // namespace arm64
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE2_BASE_H
|
|
/* end file generic/stage2/base.h for arm64 */
|
|
/* including generic/stage2/tape_writer.h for arm64: #include <generic/stage2/tape_writer.h> */
|
|
/* begin file generic/stage2/tape_writer.h for arm64 */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE2_TAPE_WRITER_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE2_TAPE_WRITER_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/internal/tape_type.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
#include <cstring>
|
|
|
|
namespace simdjson {
|
|
namespace arm64 {
|
|
namespace {
|
|
namespace stage2 {
|
|
|
|
struct tape_writer {
|
|
/** The next place to write to tape */
|
|
uint64_t *next_tape_loc;
|
|
|
|
/** Write a signed 64-bit value to tape. */
|
|
simdjson_inline void append_s64(int64_t value) noexcept;
|
|
|
|
/** Write an unsigned 64-bit value to tape. */
|
|
simdjson_inline void append_u64(uint64_t value) noexcept;
|
|
|
|
/** Write a double value to tape. */
|
|
simdjson_inline void append_double(double value) noexcept;
|
|
|
|
/**
|
|
* Append a tape entry (an 8-bit type,and 56 bits worth of value).
|
|
*/
|
|
simdjson_inline void append(uint64_t val, internal::tape_type t) noexcept;
|
|
|
|
/**
|
|
* Skip the current tape entry without writing.
|
|
*
|
|
* Used to skip the start of the container, since we'll come back later to fill it in when the
|
|
* container ends.
|
|
*/
|
|
simdjson_inline void skip() noexcept;
|
|
|
|
/**
|
|
* Skip the number of tape entries necessary to write a large u64 or i64.
|
|
*/
|
|
simdjson_inline void skip_large_integer() noexcept;
|
|
|
|
/**
|
|
* Skip the number of tape entries necessary to write a double.
|
|
*/
|
|
simdjson_inline void skip_double() noexcept;
|
|
|
|
/**
|
|
* Write a value to a known location on tape.
|
|
*
|
|
* Used to go back and write out the start of a container after the container ends.
|
|
*/
|
|
simdjson_inline static void write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept;
|
|
|
|
private:
|
|
/**
|
|
* Append both the tape entry, and a supplementary value following it. Used for types that need
|
|
* all 64 bits, such as double and uint64_t.
|
|
*/
|
|
template<typename T>
|
|
simdjson_inline void append2(uint64_t val, T val2, internal::tape_type t) noexcept;
|
|
}; // struct tape_writer
|
|
|
|
/** Emits an INT64 tag entry followed by the 64-bit signed value. */
simdjson_inline void tape_writer::append_s64(int64_t value) noexcept {
  constexpr uint64_t no_payload = 0; // the tag entry itself carries no value bits
  append2(no_payload, value, internal::tape_type::INT64);
}
|
|
|
|
/** Emits a UINT64 tag entry followed by the 64-bit unsigned value. */
simdjson_inline void tape_writer::append_u64(uint64_t value) noexcept {
  // append2 writes the tag entry, then copies the 8-byte value into the next
  // slot and advances past it.
  append2(0, value, internal::tape_type::UINT64);
}
|
|
|
|
/** Write a double value to tape. */
|
|
simdjson_inline void tape_writer::append_double(double value) noexcept {
|
|
append2(0, value, internal::tape_type::DOUBLE);
|
|
}
|
|
|
|
/** Advances the cursor by one entry without writing anything. */
simdjson_inline void tape_writer::skip() noexcept {
  ++next_tape_loc;
}
|
|
|
|
/** Advances the cursor past a large integer (two tape entries). */
simdjson_inline void tape_writer::skip_large_integer() noexcept {
  constexpr int entries_per_large_integer = 2;
  next_tape_loc += entries_per_large_integer;
}
|
|
|
|
/** Advances the cursor past a double (two tape entries). */
simdjson_inline void tape_writer::skip_double() noexcept {
  constexpr int entries_per_double = 2;
  next_tape_loc += entries_per_double;
}
|
|
|
|
/**
 * Writes one entry at the cursor (type tag in the top byte, val OR-ed into
 * the word) and advances the cursor by one.
 */
simdjson_inline void tape_writer::append(uint64_t val, internal::tape_type t) noexcept {
  write(*next_tape_loc, val, t);
  ++next_tape_loc;
}
|
|
|
|
template<typename T>
|
|
simdjson_inline void tape_writer::append2(uint64_t val, T val2, internal::tape_type t) noexcept {
|
|
append(val, t);
|
|
static_assert(sizeof(val2) == sizeof(*next_tape_loc), "Type is not 64 bits!");
|
|
memcpy(next_tape_loc, &val2, sizeof(val2));
|
|
next_tape_loc++;
|
|
}
|
|
|
|
/**
 * Stores a tape entry into tape_loc: the type tag occupies the top 8 bits
 * and val is OR-ed into the word.
 */
simdjson_inline void tape_writer::write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept {
  const uint64_t tag_bits = uint64_t(char(t)) << 56;
  tape_loc = val | tag_bits;
}
|
|
|
|
} // namespace stage2
|
|
} // unnamed namespace
|
|
} // namespace arm64
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE2_TAPE_WRITER_H
|
|
/* end file generic/stage2/tape_writer.h for arm64 */
|
|
/* including generic/stage2/logger.h for arm64: #include <generic/stage2/logger.h> */
|
|
/* begin file generic/stage2/logger.h for arm64 */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE2_LOGGER_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE2_LOGGER_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/base.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
#include <cstring>
|
|
|
|
|
|
// This is for an internal-only stage 2 specific logger.
|
|
// Set LOG_ENABLED = true to log what stage 2 is doing!
|
|
namespace simdjson {
|
|
namespace arm64 {
|
|
namespace {
|
|
namespace logger {
|
|
|
|
static constexpr const char * DASHES = "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------";
|
|
|
|
// Logging is compiled in only when SIMDJSON_VERBOSE_LOGGING is set.
#if SIMDJSON_VERBOSE_LOGGING
static constexpr bool LOG_ENABLED = true;
#else
static constexpr bool LOG_ENABLED = false;
#endif

// Column widths (in characters) of the log table.
static constexpr int LOG_EVENT_LEN = 20;
static constexpr int LOG_BUFFER_LEN = 30;
static constexpr int LOG_SMALL_BUFFER_LEN = 10;
static constexpr int LOG_INDEX_LEN = 5;

// Current nesting depth used to indent log lines. Not threadsafe. Log only.
static int log_depth;
|
|
|
|
// Helper to turn unprintable or newline characters into spaces
|
|
static simdjson_inline char printable_char(char c) {
|
|
if (c >= 0x20) {
|
|
return c;
|
|
} else {
|
|
return ' ';
|
|
}
|
|
}
|
|
|
|
// Print the header and set up log_start
|
|
static simdjson_inline void log_start() {
|
|
if (LOG_ENABLED) {
|
|
log_depth = 0;
|
|
printf("\n");
|
|
printf("| %-*s | %-*s | %-*s | %-*s | Detail |\n", LOG_EVENT_LEN, "Event", LOG_BUFFER_LEN, "Buffer", LOG_SMALL_BUFFER_LEN, "Next", 5, "Next#");
|
|
printf("|%.*s|%.*s|%.*s|%.*s|--------|\n", LOG_EVENT_LEN+2, DASHES, LOG_BUFFER_LEN+2, DASHES, LOG_SMALL_BUFFER_LEN+2, DASHES, 5+2, DASHES);
|
|
}
|
|
}
|
|
|
|
// Prints a free-form message on its own line when logging is enabled.
simdjson_unused static simdjson_inline void log_string(const char *message) {
  if (!LOG_ENABLED) { return; }
  printf("%s\n", message);
}
|
|
|
|
// Logs a single line from the stage 2 DOM parser
|
|
template<typename S>
|
|
static simdjson_inline void log_line(S &structurals, const char *title_prefix, const char *title, const char *detail) {
|
|
if (LOG_ENABLED) {
|
|
printf("| %*s%s%-*s ", log_depth*2, "", title_prefix, LOG_EVENT_LEN - log_depth*2 - int(strlen(title_prefix)), title);
|
|
auto current_index = structurals.at_beginning() ? nullptr : structurals.next_structural-1;
|
|
auto next_index = structurals.next_structural;
|
|
auto current = current_index ? &structurals.buf[*current_index] : reinterpret_cast<const uint8_t*>(" ");
|
|
auto next = &structurals.buf[*next_index];
|
|
{
|
|
// Print the next N characters in the buffer.
|
|
printf("| ");
|
|
// Otherwise, print the characters starting from the buffer position.
|
|
// Print spaces for unprintable or newline characters.
|
|
for (int i=0;i<LOG_BUFFER_LEN;i++) {
|
|
printf("%c", printable_char(current[i]));
|
|
}
|
|
printf(" ");
|
|
// Print the next N characters in the buffer.
|
|
printf("| ");
|
|
// Otherwise, print the characters starting from the buffer position.
|
|
// Print spaces for unprintable or newline characters.
|
|
for (int i=0;i<LOG_SMALL_BUFFER_LEN;i++) {
|
|
printf("%c", printable_char(next[i]));
|
|
}
|
|
printf(" ");
|
|
}
|
|
if (current_index) {
|
|
printf("| %*u ", LOG_INDEX_LEN, *current_index);
|
|
} else {
|
|
printf("| %-*s ", LOG_INDEX_LEN, "");
|
|
}
|
|
// printf("| %*u ", LOG_INDEX_LEN, structurals.next_tape_index());
|
|
printf("| %-s ", detail);
|
|
printf("|\n");
|
|
}
|
|
}
|
|
|
|
} // namespace logger
|
|
} // unnamed namespace
|
|
} // namespace arm64
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE2_LOGGER_H
|
|
/* end file generic/stage2/logger.h for arm64 */
|
|
|
|
// All other declarations
|
|
/* including generic/stage2/json_iterator.h for arm64: #include <generic/stage2/json_iterator.h> */
|
|
/* begin file generic/stage2/json_iterator.h for arm64 */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE2_JSON_ITERATOR_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE2_JSON_ITERATOR_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/logger.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/generic/dom_parser_implementation.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace arm64 {
|
|
namespace {
|
|
namespace stage2 {
|
|
|
|
class json_iterator {
|
|
public:
|
|
const uint8_t* const buf;
|
|
uint32_t *next_structural;
|
|
dom_parser_implementation &dom_parser;
|
|
uint32_t depth{0};
|
|
|
|
/**
|
|
* Walk the JSON document.
|
|
*
|
|
* The visitor receives callbacks when values are encountered. All callbacks pass the iterator as
|
|
* the first parameter; some callbacks have other parameters as well:
|
|
*
|
|
* - visit_document_start() - at the beginning.
|
|
* - visit_document_end() - at the end (if things were successful).
|
|
*
|
|
* - visit_array_start() - at the start `[` of a non-empty array.
|
|
* - visit_array_end() - at the end `]` of a non-empty array.
|
|
* - visit_empty_array() - when an empty array is encountered.
|
|
*
|
|
* - visit_object_end() - at the start `]` of a non-empty object.
|
|
* - visit_object_start() - at the end `]` of a non-empty object.
|
|
* - visit_empty_object() - when an empty object is encountered.
|
|
* - visit_key(const uint8_t *key) - when a key in an object field is encountered. key is
|
|
* guaranteed to point at the first quote of the string (`"key"`).
|
|
* - visit_primitive(const uint8_t *value) - when a value is a string, number, boolean or null.
|
|
* - visit_root_primitive(iter, uint8_t *value) - when the top-level value is a string, number, boolean or null.
|
|
*
|
|
* - increment_count(iter) - each time a value is found in an array or object.
|
|
*/
|
|
template<bool STREAMING, typename V>
|
|
simdjson_warn_unused simdjson_inline error_code walk_document(V &visitor) noexcept;
|
|
|
|
/**
|
|
* Create an iterator capable of walking a JSON document.
|
|
*
|
|
* The document must have already passed through stage 1.
|
|
*/
|
|
simdjson_inline json_iterator(dom_parser_implementation &_dom_parser, size_t start_structural_index);
|
|
|
|
/**
|
|
* Look at the next token.
|
|
*
|
|
* Tokens can be strings, numbers, booleans, null, or operators (`[{]},:`)).
|
|
*
|
|
* They may include invalid JSON as well (such as `1.2.3` or `ture`).
|
|
*/
|
|
simdjson_inline const uint8_t *peek() const noexcept;
|
|
/**
|
|
* Advance to the next token.
|
|
*
|
|
* Tokens can be strings, numbers, booleans, null, or operators (`[{]},:`)).
|
|
*
|
|
* They may include invalid JSON as well (such as `1.2.3` or `ture`).
|
|
*/
|
|
simdjson_inline const uint8_t *advance() noexcept;
|
|
/**
|
|
* Get the remaining length of the document, from the start of the current token.
|
|
*/
|
|
simdjson_inline size_t remaining_len() const noexcept;
|
|
/**
|
|
* Check if we are at the end of the document.
|
|
*
|
|
* If this is true, there are no more tokens.
|
|
*/
|
|
simdjson_inline bool at_eof() const noexcept;
|
|
/**
|
|
* Check if we are at the beginning of the document.
|
|
*/
|
|
simdjson_inline bool at_beginning() const noexcept;
|
|
simdjson_inline uint8_t last_structural() const noexcept;
|
|
|
|
/**
|
|
* Log that a value has been found.
|
|
*
|
|
* Set LOG_ENABLED=true in logger.h to see logging.
|
|
*/
|
|
simdjson_inline void log_value(const char *type) const noexcept;
|
|
/**
|
|
* Log the start of a multipart value.
|
|
*
|
|
* Set LOG_ENABLED=true in logger.h to see logging.
|
|
*/
|
|
simdjson_inline void log_start_value(const char *type) const noexcept;
|
|
/**
|
|
* Log the end of a multipart value.
|
|
*
|
|
* Set LOG_ENABLED=true in logger.h to see logging.
|
|
*/
|
|
simdjson_inline void log_end_value(const char *type) const noexcept;
|
|
/**
|
|
* Log an error.
|
|
*
|
|
* Set LOG_ENABLED=true in logger.h to see logging.
|
|
*/
|
|
simdjson_inline void log_error(const char *error) const noexcept;
|
|
|
|
template<typename V>
|
|
simdjson_warn_unused simdjson_inline error_code visit_root_primitive(V &visitor, const uint8_t *value) noexcept;
|
|
template<typename V>
|
|
simdjson_warn_unused simdjson_inline error_code visit_primitive(V &visitor, const uint8_t *value) noexcept;
|
|
};
|
|
|
|
template<bool STREAMING, typename V>
|
|
simdjson_warn_unused simdjson_inline error_code json_iterator::walk_document(V &visitor) noexcept {
|
|
logger::log_start();
|
|
|
|
//
|
|
// Start the document
|
|
//
|
|
if (at_eof()) { return EMPTY; }
|
|
log_start_value("document");
|
|
SIMDJSON_TRY( visitor.visit_document_start(*this) );
|
|
|
|
//
|
|
// Read first value
|
|
//
|
|
{
|
|
auto value = advance();
|
|
|
|
// Make sure the outer object or array is closed before continuing; otherwise, there are ways we
|
|
// could get into memory corruption. See https://github.com/simdjson/simdjson/issues/906
|
|
if (!STREAMING) {
|
|
switch (*value) {
|
|
case '{': if (last_structural() != '}') { log_value("starting brace unmatched"); return TAPE_ERROR; }; break;
|
|
case '[': if (last_structural() != ']') { log_value("starting bracket unmatched"); return TAPE_ERROR; }; break;
|
|
}
|
|
}
|
|
|
|
switch (*value) {
|
|
case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin;
|
|
case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin;
|
|
default: SIMDJSON_TRY( visitor.visit_root_primitive(*this, value) ); break;
|
|
}
|
|
}
|
|
goto document_end;
|
|
|
|
//
|
|
// Object parser states
|
|
//
|
|
object_begin:
|
|
log_start_value("object");
|
|
depth++;
|
|
if (depth >= dom_parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; }
|
|
dom_parser.is_array[depth] = false;
|
|
SIMDJSON_TRY( visitor.visit_object_start(*this) );
|
|
|
|
{
|
|
auto key = advance();
|
|
if (*key != '"') { log_error("Object does not start with a key"); return TAPE_ERROR; }
|
|
SIMDJSON_TRY( visitor.increment_count(*this) );
|
|
SIMDJSON_TRY( visitor.visit_key(*this, key) );
|
|
}
|
|
|
|
object_field:
|
|
if (simdjson_unlikely( *advance() != ':' )) { log_error("Missing colon after key in object"); return TAPE_ERROR; }
|
|
{
|
|
auto value = advance();
|
|
switch (*value) {
|
|
case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin;
|
|
case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin;
|
|
default: SIMDJSON_TRY( visitor.visit_primitive(*this, value) ); break;
|
|
}
|
|
}
|
|
|
|
object_continue:
|
|
switch (*advance()) {
|
|
case ',':
|
|
SIMDJSON_TRY( visitor.increment_count(*this) );
|
|
{
|
|
auto key = advance();
|
|
if (simdjson_unlikely( *key != '"' )) { log_error("Key string missing at beginning of field in object"); return TAPE_ERROR; }
|
|
SIMDJSON_TRY( visitor.visit_key(*this, key) );
|
|
}
|
|
goto object_field;
|
|
case '}': log_end_value("object"); SIMDJSON_TRY( visitor.visit_object_end(*this) ); goto scope_end;
|
|
default: log_error("No comma between object fields"); return TAPE_ERROR;
|
|
}
|
|
|
|
scope_end:
|
|
depth--;
|
|
if (depth == 0) { goto document_end; }
|
|
if (dom_parser.is_array[depth]) { goto array_continue; }
|
|
goto object_continue;
|
|
|
|
//
|
|
// Array parser states
|
|
//
|
|
array_begin:
|
|
log_start_value("array");
|
|
depth++;
|
|
if (depth >= dom_parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; }
|
|
dom_parser.is_array[depth] = true;
|
|
SIMDJSON_TRY( visitor.visit_array_start(*this) );
|
|
SIMDJSON_TRY( visitor.increment_count(*this) );
|
|
|
|
array_value:
|
|
{
|
|
auto value = advance();
|
|
switch (*value) {
|
|
case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin;
|
|
case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin;
|
|
default: SIMDJSON_TRY( visitor.visit_primitive(*this, value) ); break;
|
|
}
|
|
}
|
|
|
|
array_continue:
|
|
switch (*advance()) {
|
|
case ',': SIMDJSON_TRY( visitor.increment_count(*this) ); goto array_value;
|
|
case ']': log_end_value("array"); SIMDJSON_TRY( visitor.visit_array_end(*this) ); goto scope_end;
|
|
default: log_error("Missing comma between array values"); return TAPE_ERROR;
|
|
}
|
|
|
|
document_end:
|
|
log_end_value("document");
|
|
SIMDJSON_TRY( visitor.visit_document_end(*this) );
|
|
|
|
dom_parser.next_structural_index = uint32_t(next_structural - &dom_parser.structural_indexes[0]);
|
|
|
|
// If we didn't make it to the end, it's an error
|
|
if ( !STREAMING && dom_parser.next_structural_index != dom_parser.n_structural_indexes ) {
|
|
log_error("More than one JSON value at the root of the document, or extra characters at the end of the JSON!");
|
|
return TAPE_ERROR;
|
|
}
|
|
|
|
return SUCCESS;
|
|
|
|
} // walk_document()
|
|
|
|
/**
 * Binds the iterator to a parser's buffer and positions it on the structural
 * index at start_structural_index. depth keeps its in-class default of 0.
 */
simdjson_inline json_iterator::json_iterator(dom_parser_implementation &_dom_parser, size_t start_structural_index)
  : buf(_dom_parser.buf),
    next_structural(&_dom_parser.structural_indexes[start_structural_index]),
    dom_parser(_dom_parser)
{}
|
|
|
|
/** Returns a pointer to the next token in the buffer without consuming it. */
simdjson_inline const uint8_t *json_iterator::peek() const noexcept {
  const uint32_t offset = *next_structural;
  return &buf[offset];
}
|
|
/** Returns a pointer to the next token and moves on to the following one. */
simdjson_inline const uint8_t *json_iterator::advance() noexcept {
  const uint32_t offset = *next_structural;
  ++next_structural;
  return &buf[offset];
}
|
|
/**
 * Number of bytes between the most recently consumed structural index and
 * dom_parser.len. Reads next_structural[-1], so a token must already have
 * been consumed.
 */
simdjson_inline size_t json_iterator::remaining_len() const noexcept {
  const uint32_t current_offset = *(next_structural - 1);
  return dom_parser.len - current_offset;
}
|
|
|
|
/** True when the cursor sits one past the last structural index. */
simdjson_inline bool json_iterator::at_eof() const noexcept {
  const uint32_t *end_of_structurals = &dom_parser.structural_indexes[dom_parser.n_structural_indexes];
  return next_structural == end_of_structurals;
}
|
|
/** True when the cursor sits on the very first structural index. */
simdjson_inline bool json_iterator::at_beginning() const noexcept {
  const uint32_t *first_structural = dom_parser.structural_indexes.get();
  return next_structural == first_structural;
}
|
|
/**
 * Returns the buffer byte found at the last structural index. Indexes with
 * n_structural_indexes - 1, so at least one structural index must exist.
 */
simdjson_inline uint8_t json_iterator::last_structural() const noexcept {
  const uint32_t last_offset = dom_parser.structural_indexes[dom_parser.n_structural_indexes - 1];
  return buf[last_offset];
}
|
|
|
|
/** Logs a row titled with type, without prefix or detail text. */
simdjson_inline void json_iterator::log_value(const char *type) const noexcept {
  const char *no_prefix = "";
  const char *no_detail = "";
  logger::log_line(*this, no_prefix, type, no_detail);
}
|
|
|
|
/**
 * Logs a "+"-prefixed row for the start of a multipart value, then (when
 * logging is enabled) increases the log indentation depth.
 */
simdjson_inline void json_iterator::log_start_value(const char *type) const noexcept {
  logger::log_line(*this, "+", type, "");
  if (logger::LOG_ENABLED) {
    ++logger::log_depth;
  }
}
|
|
|
|
/**
 * Decreases the log indentation depth (when logging is enabled), then logs a
 * "-"-prefixed row for the end of a multipart value.
 */
simdjson_inline void json_iterator::log_end_value(const char *type) const noexcept {
  if (logger::LOG_ENABLED) {
    --logger::log_depth;
  }
  logger::log_line(*this, "-", type, "");
}
|
|
|
|
/** Logs a row titled "ERROR" carrying the error text in the detail column. */
simdjson_inline void json_iterator::log_error(const char *error) const noexcept {
  const char *no_prefix = "";
  logger::log_line(*this, no_prefix, "ERROR", error);
}
|
|
|
|
template<typename V>
|
|
simdjson_warn_unused simdjson_inline error_code json_iterator::visit_root_primitive(V &visitor, const uint8_t *value) noexcept {
|
|
switch (*value) {
|
|
case '"': return visitor.visit_root_string(*this, value);
|
|
case 't': return visitor.visit_root_true_atom(*this, value);
|
|
case 'f': return visitor.visit_root_false_atom(*this, value);
|
|
case 'n': return visitor.visit_root_null_atom(*this, value);
|
|
case '-':
|
|
case '0': case '1': case '2': case '3': case '4':
|
|
case '5': case '6': case '7': case '8': case '9':
|
|
return visitor.visit_root_number(*this, value);
|
|
default:
|
|
log_error("Document starts with a non-value character");
|
|
return TAPE_ERROR;
|
|
}
|
|
}
|
|
template<typename V>
|
|
simdjson_warn_unused simdjson_inline error_code json_iterator::visit_primitive(V &visitor, const uint8_t *value) noexcept {
|
|
// Use the fact that most scalars are going to be either strings or numbers.
|
|
if(*value == '"') {
|
|
return visitor.visit_string(*this, value);
|
|
} else if (((*value - '0') < 10) || (*value == '-')) {
|
|
return visitor.visit_number(*this, value);
|
|
}
|
|
// true, false, null are uncommon.
|
|
switch (*value) {
|
|
case 't': return visitor.visit_true_atom(*this, value);
|
|
case 'f': return visitor.visit_false_atom(*this, value);
|
|
case 'n': return visitor.visit_null_atom(*this, value);
|
|
default:
|
|
log_error("Non-value found when value was expected!");
|
|
return TAPE_ERROR;
|
|
}
|
|
}
|
|
|
|
} // namespace stage2
|
|
} // unnamed namespace
|
|
} // namespace arm64
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE2_JSON_ITERATOR_H
|
|
/* end file generic/stage2/json_iterator.h for arm64 */
|
|
/* including generic/stage2/stringparsing.h for arm64: #include <generic/stage2/stringparsing.h> */
|
|
/* begin file generic/stage2/stringparsing.h for arm64 */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE2_STRINGPARSING_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE2_STRINGPARSING_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/generic/jsoncharutils.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
// This file contains the common code every implementation uses
|
|
// It is intended to be included multiple times and compiled multiple times
|
|
|
|
namespace simdjson {
|
|
namespace arm64 {
|
|
namespace {
|
|
/// @private
|
|
namespace stringparsing {
|
|
|
|
// begin copypasta
// Lookup table indexed by the character that follows a backslash. The entry
// is the byte that the escape sequence stands for, or 0 when the character
// is not a simple escape:
//   "  \  /  map to themselves,
//   b -> backspace, f -> formfeed, n -> newline, r -> cr, t -> horizontal tab.
// 'u' is not in this table; \uXXXX sequences are handled separately.
static const uint8_t escape_map[256] = {
    // 0x0.
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    // 0x1.
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    // 0x2.  (0x22 is '"', 0x2f is '/')
    0, 0, '"', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, '/',
    // 0x3.
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    // 0x4.
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    // 0x5.  (0x5c is '\')
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, '\\', 0, 0, 0,
    // 0x6.  ('b', 'f', 'n')
    0, 0, '\b', 0, 0, 0, '\f', 0, 0, 0, 0, 0, 0, 0, '\n', 0,
    // 0x7.  ('r', 't')
    0, 0, '\r', 0, '\t', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    // 0x8. - 0xb.
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    // 0xc. - 0xf.
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
|
|
|
|
// handle a unicode codepoint
|
|
// write appropriate values into dest
|
|
// src will advance 6 bytes or 12 bytes
|
|
// dest will advance a variable amount (return via pointer)
|
|
// return true if the unicode codepoint was valid
|
|
// We work in little-endian then swap at write time
|
|
simdjson_warn_unused
|
|
simdjson_inline bool handle_unicode_codepoint(const uint8_t **src_ptr,
|
|
uint8_t **dst_ptr, bool allow_replacement) {
|
|
// Use the default Unicode Character 'REPLACEMENT CHARACTER' (U+FFFD)
|
|
constexpr uint32_t substitution_code_point = 0xfffd;
|
|
// jsoncharutils::hex_to_u32_nocheck fills high 16 bits of the return value with 1s if the
|
|
// conversion is not valid; we defer the check for this to inside the
|
|
// multilingual plane check.
|
|
uint32_t code_point = jsoncharutils::hex_to_u32_nocheck(*src_ptr + 2);
|
|
*src_ptr += 6;
|
|
|
|
// If we found a high surrogate, we must
|
|
// check for low surrogate for characters
|
|
// outside the Basic
|
|
// Multilingual Plane.
|
|
if (code_point >= 0xd800 && code_point < 0xdc00) {
|
|
const uint8_t *src_data = *src_ptr;
|
|
/* Compiler optimizations convert this to a single 16-bit load and compare on most platforms */
|
|
if (((src_data[0] << 8) | src_data[1]) != ((static_cast<uint8_t> ('\\') << 8) | static_cast<uint8_t> ('u'))) {
|
|
if(!allow_replacement) { return false; }
|
|
code_point = substitution_code_point;
|
|
} else {
|
|
uint32_t code_point_2 = jsoncharutils::hex_to_u32_nocheck(src_data + 2);
|
|
|
|
// We have already checked that the high surrogate is valid and
|
|
// (code_point - 0xd800) < 1024.
|
|
//
|
|
// Check that code_point_2 is in the range 0xdc00..0xdfff
|
|
// and that code_point_2 was parsed from valid hex.
|
|
uint32_t low_bit = code_point_2 - 0xdc00;
|
|
if (low_bit >> 10) {
|
|
if(!allow_replacement) { return false; }
|
|
code_point = substitution_code_point;
|
|
} else {
|
|
code_point = (((code_point - 0xd800) << 10) | low_bit) + 0x10000;
|
|
*src_ptr += 6;
|
|
}
|
|
|
|
}
|
|
} else if (code_point >= 0xdc00 && code_point <= 0xdfff) {
|
|
// If we encounter a low surrogate (not preceded by a high surrogate)
|
|
// then we have an error.
|
|
if(!allow_replacement) { return false; }
|
|
code_point = substitution_code_point;
|
|
}
|
|
size_t offset = jsoncharutils::codepoint_to_utf8(code_point, *dst_ptr);
|
|
*dst_ptr += offset;
|
|
return offset > 0;
|
|
}
|
|
|
|
|
|
// handle a unicode codepoint using the wobbly convention
|
|
// https://simonsapin.github.io/wtf-8/
|
|
// write appropriate values into dest
|
|
// src will advance 6 bytes or 12 bytes
|
|
// dest will advance a variable amount (return via pointer)
|
|
// return true if the unicode codepoint was valid
|
|
// We work in little-endian then swap at write time
|
|
simdjson_warn_unused
|
|
simdjson_inline bool handle_unicode_codepoint_wobbly(const uint8_t **src_ptr,
|
|
uint8_t **dst_ptr) {
|
|
// It is not ideal that this function is nearly identical to handle_unicode_codepoint.
|
|
//
|
|
// jsoncharutils::hex_to_u32_nocheck fills high 16 bits of the return value with 1s if the
|
|
// conversion is not valid; we defer the check for this to inside the
|
|
// multilingual plane check.
|
|
uint32_t code_point = jsoncharutils::hex_to_u32_nocheck(*src_ptr + 2);
|
|
*src_ptr += 6;
|
|
// If we found a high surrogate, we must
|
|
// check for low surrogate for characters
|
|
// outside the Basic
|
|
// Multilingual Plane.
|
|
if (code_point >= 0xd800 && code_point < 0xdc00) {
|
|
const uint8_t *src_data = *src_ptr;
|
|
/* Compiler optimizations convert this to a single 16-bit load and compare on most platforms */
|
|
if (((src_data[0] << 8) | src_data[1]) == ((static_cast<uint8_t> ('\\') << 8) | static_cast<uint8_t> ('u'))) {
|
|
uint32_t code_point_2 = jsoncharutils::hex_to_u32_nocheck(src_data + 2);
|
|
uint32_t low_bit = code_point_2 - 0xdc00;
|
|
if ((low_bit >> 10) == 0) {
|
|
code_point =
|
|
(((code_point - 0xd800) << 10) | low_bit) + 0x10000;
|
|
*src_ptr += 6;
|
|
}
|
|
}
|
|
}
|
|
|
|
size_t offset = jsoncharutils::codepoint_to_utf8(code_point, *dst_ptr);
|
|
*dst_ptr += offset;
|
|
return offset > 0;
|
|
}
|
|
|
|
|
|
/**
|
|
* Unescape a valid UTF-8 string from src to dst, stopping at a final unescaped quote. There
|
|
* must be an unescaped quote terminating the string. It returns the final output
|
|
* position as pointer. In case of error (e.g., the string has bad escaped codes),
|
|
* then null_nullptrptr is returned. It is assumed that the output buffer is large
|
|
* enough. E.g., if src points at 'joe"', then dst needs to have four free bytes +
|
|
* SIMDJSON_PADDING bytes.
|
|
*/
|
|
simdjson_warn_unused simdjson_inline uint8_t *parse_string(const uint8_t *src, uint8_t *dst, bool allow_replacement) {
|
|
while (1) {
|
|
// Copy the next n bytes, and find the backslash and quote in them.
|
|
auto bs_quote = backslash_and_quote::copy_and_find(src, dst);
|
|
// If the next thing is the end quote, copy and return
|
|
if (bs_quote.has_quote_first()) {
|
|
// we encountered quotes first. Move dst to point to quotes and exit
|
|
return dst + bs_quote.quote_index();
|
|
}
|
|
if (bs_quote.has_backslash()) {
|
|
/* find out where the backspace is */
|
|
auto bs_dist = bs_quote.backslash_index();
|
|
uint8_t escape_char = src[bs_dist + 1];
|
|
/* we encountered backslash first. Handle backslash */
|
|
if (escape_char == 'u') {
|
|
/* move src/dst up to the start; they will be further adjusted
|
|
within the unicode codepoint handling code. */
|
|
src += bs_dist;
|
|
dst += bs_dist;
|
|
if (!handle_unicode_codepoint(&src, &dst, allow_replacement)) {
|
|
return nullptr;
|
|
}
|
|
} else {
|
|
/* simple 1:1 conversion. Will eat bs_dist+2 characters in input and
|
|
* write bs_dist+1 characters to output
|
|
* note this may reach beyond the part of the buffer we've actually
|
|
* seen. I think this is ok */
|
|
uint8_t escape_result = escape_map[escape_char];
|
|
if (escape_result == 0u) {
|
|
return nullptr; /* bogus escape value is an error */
|
|
}
|
|
dst[bs_dist] = escape_result;
|
|
src += bs_dist + 2;
|
|
dst += bs_dist + 1;
|
|
}
|
|
} else {
|
|
/* they are the same. Since they can't co-occur, it means we
|
|
* encountered neither. */
|
|
src += backslash_and_quote::BYTES_PROCESSED;
|
|
dst += backslash_and_quote::BYTES_PROCESSED;
|
|
}
|
|
}
|
|
}
|
|
|
|
// Same contract as parse_string (unescape src into dst up to the terminating
// unescaped quote; nullptr on a bad escape), except that \u escapes go through
// handle_unicode_codepoint_wobbly.
// It is not ideal that this function is nearly identical to parse_string.
simdjson_warn_unused simdjson_inline uint8_t *parse_wobbly_string(const uint8_t *src, uint8_t *dst) {
  while (true) {
    // Copy the next chunk of bytes and locate any backslash or quote in it.
    auto found = backslash_and_quote::copy_and_find(src, dst);

    if (found.has_quote_first()) {
      // The closing quote comes first: point at it in the output and stop.
      return dst + found.quote_index();
    }

    if (!found.has_backslash()) {
      // No quote and no backslash: the whole chunk is plain content.
      src += backslash_and_quote::BYTES_PROCESSED;
      dst += backslash_and_quote::BYTES_PROCESSED;
    } else {
      // A backslash comes first; locate it and inspect the escaped character.
      auto escape_offset = found.backslash_index();
      const uint8_t escape_char = src[escape_offset + 1];

      if (escape_char != 'u') {
        // Simple 1:1 escape: eats escape_offset+2 input bytes and writes
        // escape_offset+1 output bytes. This may reach beyond the part of the
        // buffer already examined; that is believed to be fine.
        const uint8_t replacement = escape_map[escape_char];
        if (replacement == 0u) {
          return nullptr; /* bogus escape value is an error */
        }
        dst[escape_offset] = replacement;
        src += escape_offset + 2;
        dst += escape_offset + 1;
      } else {
        // Move src/dst onto the backslash; the code point handler advances
        // them further.
        src += escape_offset;
        dst += escape_offset;
        if (!handle_unicode_codepoint_wobbly(&src, &dst)) {
          return nullptr;
        }
      }
    }
  }
}
|
|
|
|
} // namespace stringparsing
|
|
} // unnamed namespace
|
|
} // namespace arm64
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE2_STRINGPARSING_H
|
|
/* end file generic/stage2/stringparsing.h for arm64 */
|
|
/* including generic/stage2/structural_iterator.h for arm64: #include <generic/stage2/structural_iterator.h> */
|
|
/* begin file generic/stage2/structural_iterator.h for arm64 */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE2_STRUCTURAL_ITERATOR_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE2_STRUCTURAL_ITERATOR_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/generic/dom_parser_implementation.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace arm64 {
|
|
namespace {
|
|
namespace stage2 {
|
|
|
|
class structural_iterator {
|
|
public:
|
|
const uint8_t* const buf;
|
|
uint32_t *next_structural;
|
|
dom_parser_implementation &dom_parser;
|
|
|
|
// Start a structural
|
|
simdjson_inline structural_iterator(dom_parser_implementation &_dom_parser, size_t start_structural_index)
|
|
: buf{_dom_parser.buf},
|
|
next_structural{&_dom_parser.structural_indexes[start_structural_index]},
|
|
dom_parser{_dom_parser} {
|
|
}
|
|
// Get the buffer position of the current structural character
|
|
simdjson_inline const uint8_t* current() {
|
|
return &buf[*(next_structural-1)];
|
|
}
|
|
// Get the current structural character
|
|
simdjson_inline char current_char() {
|
|
return buf[*(next_structural-1)];
|
|
}
|
|
// Get the next structural character without advancing
|
|
simdjson_inline char peek_next_char() {
|
|
return buf[*next_structural];
|
|
}
|
|
simdjson_inline const uint8_t* peek() {
|
|
return &buf[*next_structural];
|
|
}
|
|
simdjson_inline const uint8_t* advance() {
|
|
return &buf[*(next_structural++)];
|
|
}
|
|
simdjson_inline char advance_char() {
|
|
return buf[*(next_structural++)];
|
|
}
|
|
simdjson_inline size_t remaining_len() {
|
|
return dom_parser.len - *(next_structural-1);
|
|
}
|
|
|
|
simdjson_inline bool at_end() {
|
|
return next_structural == &dom_parser.structural_indexes[dom_parser.n_structural_indexes];
|
|
}
|
|
simdjson_inline bool at_beginning() {
|
|
return next_structural == dom_parser.structural_indexes.get();
|
|
}
|
|
};
|
|
|
|
} // namespace stage2
|
|
} // unnamed namespace
|
|
} // namespace arm64
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE2_STRUCTURAL_ITERATOR_H
|
|
/* end file generic/stage2/structural_iterator.h for arm64 */
|
|
/* including generic/stage2/tape_builder.h for arm64: #include <generic/stage2/tape_builder.h> */
|
|
/* begin file generic/stage2/tape_builder.h for arm64 */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE2_TAPE_BUILDER_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE2_TAPE_BUILDER_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/json_iterator.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/stringparsing.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/tape_writer.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/dom/document.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/generic/atomparsing.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/generic/dom_parser_implementation.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/generic/numberparsing.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
|
|
namespace simdjson {
|
|
namespace arm64 {
|
|
namespace {
|
|
namespace stage2 {
|
|
|
|
struct tape_builder {
|
|
template<bool STREAMING>
|
|
simdjson_warn_unused static simdjson_inline error_code parse_document(
|
|
dom_parser_implementation &dom_parser,
|
|
dom::document &doc) noexcept;
|
|
|
|
/** Called when a non-empty document starts. */
|
|
simdjson_warn_unused simdjson_inline error_code visit_document_start(json_iterator &iter) noexcept;
|
|
/** Called when a non-empty document ends without error. */
|
|
simdjson_warn_unused simdjson_inline error_code visit_document_end(json_iterator &iter) noexcept;
|
|
|
|
/** Called when a non-empty array starts. */
|
|
simdjson_warn_unused simdjson_inline error_code visit_array_start(json_iterator &iter) noexcept;
|
|
/** Called when a non-empty array ends. */
|
|
simdjson_warn_unused simdjson_inline error_code visit_array_end(json_iterator &iter) noexcept;
|
|
/** Called when an empty array is found. */
|
|
simdjson_warn_unused simdjson_inline error_code visit_empty_array(json_iterator &iter) noexcept;
|
|
|
|
/** Called when a non-empty object starts. */
|
|
simdjson_warn_unused simdjson_inline error_code visit_object_start(json_iterator &iter) noexcept;
|
|
/**
|
|
* Called when a key in a field is encountered.
|
|
*
|
|
* primitive, visit_object_start, visit_empty_object, visit_array_start, or visit_empty_array
|
|
* will be called after this with the field value.
|
|
*/
|
|
simdjson_warn_unused simdjson_inline error_code visit_key(json_iterator &iter, const uint8_t *key) noexcept;
|
|
/** Called when a non-empty object ends. */
|
|
simdjson_warn_unused simdjson_inline error_code visit_object_end(json_iterator &iter) noexcept;
|
|
/** Called when an empty object is found. */
|
|
simdjson_warn_unused simdjson_inline error_code visit_empty_object(json_iterator &iter) noexcept;
|
|
|
|
/**
|
|
* Called when a string, number, boolean or null is found.
|
|
*/
|
|
simdjson_warn_unused simdjson_inline error_code visit_primitive(json_iterator &iter, const uint8_t *value) noexcept;
|
|
/**
|
|
* Called when a string, number, boolean or null is found at the top level of a document (i.e.
|
|
* when there is no array or object and the entire document is a single string, number, boolean or
|
|
* null.
|
|
*
|
|
* This is separate from primitive() because simdjson's normal primitive parsing routines assume
|
|
* there is at least one more token after the value, which is only true in an array or object.
|
|
*/
|
|
simdjson_warn_unused simdjson_inline error_code visit_root_primitive(json_iterator &iter, const uint8_t *value) noexcept;
|
|
|
|
simdjson_warn_unused simdjson_inline error_code visit_string(json_iterator &iter, const uint8_t *value, bool key = false) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code visit_number(json_iterator &iter, const uint8_t *value) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code visit_true_atom(json_iterator &iter, const uint8_t *value) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code visit_false_atom(json_iterator &iter, const uint8_t *value) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code visit_null_atom(json_iterator &iter, const uint8_t *value) noexcept;
|
|
|
|
simdjson_warn_unused simdjson_inline error_code visit_root_string(json_iterator &iter, const uint8_t *value) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code visit_root_number(json_iterator &iter, const uint8_t *value) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code visit_root_true_atom(json_iterator &iter, const uint8_t *value) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code visit_root_false_atom(json_iterator &iter, const uint8_t *value) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code visit_root_null_atom(json_iterator &iter, const uint8_t *value) noexcept;
|
|
|
|
/** Called each time a new field or element in an array or object is found. */
|
|
simdjson_warn_unused simdjson_inline error_code increment_count(json_iterator &iter) noexcept;
|
|
|
|
/** Next location to write to tape */
|
|
tape_writer tape;
|
|
private:
|
|
/** Next write location in the string buf for stage 2 parsing */
|
|
uint8_t *current_string_buf_loc;
|
|
|
|
simdjson_inline tape_builder(dom::document &doc) noexcept;
|
|
|
|
simdjson_inline uint32_t next_tape_index(json_iterator &iter) const noexcept;
|
|
simdjson_inline void start_container(json_iterator &iter) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code end_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code empty_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept;
|
|
simdjson_inline uint8_t *on_start_string(json_iterator &iter) noexcept;
|
|
simdjson_inline void on_end_string(uint8_t *dst) noexcept;
|
|
}; // struct tape_builder
|
|
|
|
template<bool STREAMING>
|
|
simdjson_warn_unused simdjson_inline error_code tape_builder::parse_document(
|
|
dom_parser_implementation &dom_parser,
|
|
dom::document &doc) noexcept {
|
|
dom_parser.doc = &doc;
|
|
json_iterator iter(dom_parser, STREAMING ? dom_parser.next_structural_index : 0);
|
|
tape_builder builder(doc);
|
|
return iter.walk_document<STREAMING>(builder);
|
|
}
|
|
|
|
// Top-level scalar: let the iterator dispatch to the matching visit_root_* handler
// on this builder.
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_primitive(json_iterator &iter, const uint8_t *value) noexcept {
  tape_builder &self = *this;
  return iter.visit_root_primitive(self, value);
}
|
|
// Nested scalar: let the iterator dispatch to the matching visit_* handler on
// this builder.
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_primitive(json_iterator &iter, const uint8_t *value) noexcept {
  tape_builder &self = *this;
  return iter.visit_primitive(self, value);
}
|
|
// "{}": write the START_OBJECT / END_OBJECT pair in one go.
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_empty_object(json_iterator &iter) noexcept {
  return empty_container(
    iter,
    internal::tape_type::START_OBJECT,
    internal::tape_type::END_OBJECT);
}
|
|
// "[]": write the START_ARRAY / END_ARRAY pair in one go.
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_empty_array(json_iterator &iter) noexcept {
  return empty_container(
    iter,
    internal::tape_type::START_ARRAY,
    internal::tape_type::END_ARRAY);
}
|
|
|
|
// Document start: open a container record (the root tape slot is filled in by
// visit_document_end). Always succeeds.
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_document_start(json_iterator &iter) noexcept {
  start_container(iter);
  return SUCCESS;
}
|
|
// Non-empty object start: open a container record; the start tape entry is
// written later by end_container. Always succeeds.
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_object_start(json_iterator &iter) noexcept {
  start_container(iter);
  return SUCCESS;
}
|
|
// Non-empty array start: open a container record; the start tape entry is
// written later by end_container. Always succeeds.
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_array_start(json_iterator &iter) noexcept {
  start_container(iter);
  return SUCCESS;
}
|
|
|
|
// Non-empty object end: close the container opened by visit_object_start.
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_object_end(json_iterator &iter) noexcept {
  return end_container(
    iter,
    internal::tape_type::START_OBJECT,
    internal::tape_type::END_OBJECT);
}
|
|
// Non-empty array end: close the container opened by visit_array_start.
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_array_end(json_iterator &iter) noexcept {
  return end_container(
    iter,
    internal::tape_type::START_ARRAY,
    internal::tape_type::END_ARRAY);
}
|
|
// Document end: append the closing ROOT entry (pointing back at tape index 0),
// then fill tape slot 0 with a ROOT entry pointing at the index just past it.
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_document_end(json_iterator &iter) noexcept {
  constexpr uint32_t root_index = 0;
  tape.append(root_index, internal::tape_type::ROOT);
  // Must be read after the append above so that it points past the closing entry.
  const uint32_t end_index = next_tape_index(iter);
  tape_writer::write(iter.dom_parser.doc->tape[root_index], end_index, internal::tape_type::ROOT);
  return SUCCESS;
}
|
|
// An object key is stored exactly like a string value; the flag only changes
// what visit_string logs.
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_key(json_iterator &iter, const uint8_t *key) noexcept {
  constexpr bool is_key = true;
  return visit_string(iter, key, is_key);
}
|
|
|
|
// One more field/element was found in the container recorded at iter.depth.
simdjson_warn_unused simdjson_inline error_code tape_builder::increment_count(json_iterator &iter) noexcept {
  auto &container = iter.dom_parser.open_containers[iter.depth];
  container.count++;
  return SUCCESS;
}
|
|
|
|
// Start writing at the beginning of the document's tape and string buffer.
simdjson_inline tape_builder::tape_builder(dom::document &doc) noexcept
  : tape{doc.tape.get()},
    current_string_buf_loc{doc.string_buf.get()} {
}
|
|
|
|
// Unescape the string starting at `value` (which points at the opening quote)
// into the document's string buffer and record it on the tape.
// Returns STRING_ERROR when the string contains an invalid escape.
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_string(json_iterator &iter, const uint8_t *value, bool key) noexcept {
  const char *kind = key ? "key" : "string";
  iter.log_value(kind);

  uint8_t *const write_begin = on_start_string(iter);
  // Skip the opening quote. Replacement is not allowed when the escape
  // characters are invalid.
  uint8_t *const write_end = stringparsing::parse_string(value + 1, write_begin, false);
  if (write_end == nullptr) {
    iter.log_error("Invalid escape in string");
    return STRING_ERROR;
  }

  on_end_string(write_end);
  return SUCCESS;
}
|
|
|
|
// A top-level string needs no special treatment: handle it as a regular
// (non-key) string.
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_string(json_iterator &iter, const uint8_t *value) noexcept {
  return visit_string(iter, value);
}
|
|
|
|
// Parse the number at `value` and let numberparsing write it to the tape.
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_number(json_iterator &iter, const uint8_t *value) noexcept {
  iter.log_value("number");
  const error_code result = numberparsing::parse_number(value, tape);
  return result;
}
|
|
|
|
// Parse a number that is the entire document.
//
// The number is first copied into a temporary buffer that is padded with
// spaces. The input itself should already be padded up to
// len + SIMDJSON_PADDING, but we have no control over *how* it was padded:
// it is common for input to be followed by a null character (C string).
// We do not want to accept 9\0 inside a JSON document, yet the string "9\0" on
// its own is fine, so when there is a single root element we re-pad with
// spaces ourselves. The copy is relatively expensive, but it is only made for
// documents that consist of a single number.
//
// Returns MEMALLOC when the temporary buffer cannot be allocated.
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_number(json_iterator &iter, const uint8_t *value) noexcept {
  std::unique_ptr<uint8_t[]> padded_copy(new (std::nothrow) uint8_t[iter.remaining_len() + SIMDJSON_PADDING]);
  if (padded_copy.get() == nullptr) {
    return MEMALLOC;
  }

  uint8_t *const bytes = padded_copy.get();
  std::memcpy(bytes, value, iter.remaining_len());
  std::memset(bytes + iter.remaining_len(), ' ', SIMDJSON_PADDING);

  return visit_number(iter, bytes);
}
|
|
|
|
// Validate a nested `true` literal and append TRUE_VALUE to the tape.
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_true_atom(json_iterator &iter, const uint8_t *value) noexcept {
  iter.log_value("true");
  const bool valid = atomparsing::is_valid_true_atom(value);
  if (!valid) {
    return T_ATOM_ERROR;
  }
  tape.append(0, internal::tape_type::TRUE_VALUE);
  return SUCCESS;
}
|
|
|
|
// Validate a top-level `true` literal (using the length-aware check, bounded
// by the remaining input) and append TRUE_VALUE to the tape.
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_true_atom(json_iterator &iter, const uint8_t *value) noexcept {
  iter.log_value("true");
  const bool valid = atomparsing::is_valid_true_atom(value, iter.remaining_len());
  if (!valid) {
    return T_ATOM_ERROR;
  }
  tape.append(0, internal::tape_type::TRUE_VALUE);
  return SUCCESS;
}
|
|
|
|
// Validate a nested `false` literal and append FALSE_VALUE to the tape.
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_false_atom(json_iterator &iter, const uint8_t *value) noexcept {
  iter.log_value("false");
  const bool valid = atomparsing::is_valid_false_atom(value);
  if (!valid) {
    return F_ATOM_ERROR;
  }
  tape.append(0, internal::tape_type::FALSE_VALUE);
  return SUCCESS;
}
|
|
|
|
// Validate a top-level `false` literal (using the length-aware check, bounded
// by the remaining input) and append FALSE_VALUE to the tape.
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_false_atom(json_iterator &iter, const uint8_t *value) noexcept {
  iter.log_value("false");
  const bool valid = atomparsing::is_valid_false_atom(value, iter.remaining_len());
  if (!valid) {
    return F_ATOM_ERROR;
  }
  tape.append(0, internal::tape_type::FALSE_VALUE);
  return SUCCESS;
}
|
|
|
|
// Validate a nested `null` literal and append NULL_VALUE to the tape.
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_null_atom(json_iterator &iter, const uint8_t *value) noexcept {
  iter.log_value("null");
  const bool valid = atomparsing::is_valid_null_atom(value);
  if (!valid) {
    return N_ATOM_ERROR;
  }
  tape.append(0, internal::tape_type::NULL_VALUE);
  return SUCCESS;
}
|
|
|
|
// Validate a top-level `null` literal (using the length-aware check, bounded
// by the remaining input) and append NULL_VALUE to the tape.
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_null_atom(json_iterator &iter, const uint8_t *value) noexcept {
  iter.log_value("null");
  const bool valid = atomparsing::is_valid_null_atom(value, iter.remaining_len());
  if (!valid) {
    return N_ATOM_ERROR;
  }
  tape.append(0, internal::tape_type::NULL_VALUE);
  return SUCCESS;
}
|
|
|
|
// private:
|
|
|
|
// Index (in tape elements) of the next slot the tape writer will fill,
// measured from the start of the document's tape.
simdjson_inline uint32_t tape_builder::next_tape_index(json_iterator &iter) const noexcept {
  const auto elements_written = tape.next_tape_loc - iter.dom_parser.doc->tape.get();
  return uint32_t(elements_written);
}
|
|
|
|
// Write both tape entries of an empty array/object: the start entry points
// two slots ahead (just past the end entry) and the end entry points back at
// the start entry.
simdjson_warn_unused simdjson_inline error_code tape_builder::empty_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept {
  const auto first_slot = next_tape_index(iter);
  const auto slot_after_end = first_slot + 2;
  tape.append(slot_after_end, start);
  tape.append(first_slot, end);
  return SUCCESS;
}
|
|
|
|
// Remember where the container's start entry belongs, reset its element
// count, and leave that tape slot blank for now.
simdjson_inline void tape_builder::start_container(json_iterator &iter) noexcept {
  auto &container = iter.dom_parser.open_containers[iter.depth];
  container.tape_index = next_tape_index(iter);
  container.count = 0;
  // We don't actually *write* the start element until the end.
  tape.skip();
}
|
|
|
|
// Close the container recorded at iter.depth: append its end entry, then go
// back and fill in the start entry that start_container left blank.
simdjson_warn_unused simdjson_inline error_code tape_builder::end_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept {
  const auto &container = iter.dom_parser.open_containers[iter.depth];

  // End entry: points back at the start location.
  const uint32_t start_tape_index = container.tape_index;
  tape.append(start_tape_index, end);

  // The count is stored in 24 bits, so it is saturated: a stored value of
  // 0xFFFFFF means "0xFFFFFF or more" (i.e. undetermined).
  uint32_t saturated_count = container.count;
  if (saturated_count > 0xFFFFFF) {
    saturated_count = 0xFFFFFF;
  }

  // Start entry: points at the slot after the end entry, with the count in
  // the bits above the low 32.
  const uint64_t start_payload = next_tape_index(iter) | (uint64_t(saturated_count) << 32);
  tape_writer::write(iter.dom_parser.doc->tape[start_tape_index], start_payload, start);
  return SUCCESS;
}
|
|
|
|
// Record a STRING tape entry holding the string's offset in the string
// buffer, and return where the string's bytes should be written. The first
// sizeof(uint32_t) bytes at the current location are skipped; on_end_string
// stores the length there.
simdjson_inline uint8_t *tape_builder::on_start_string(json_iterator &iter) noexcept {
  const auto buffer_offset = current_string_buf_loc - iter.dom_parser.doc->string_buf.get();
  tape.append(buffer_offset, internal::tape_type::STRING);
  return current_string_buf_loc + sizeof(uint32_t);
}
|
|
|
|
// Finish the string whose bytes end at `dst`: store its length in the
// sizeof(uint32_t) bytes reserved in front of it, NUL-terminate it, and move
// the write position just past the terminator.
simdjson_inline void tape_builder::on_end_string(uint8_t *dst) noexcept {
  uint8_t *const payload_begin = current_string_buf_loc + sizeof(uint32_t);
  // TODO check for overflow in case someone has a crazy string (>=4GB?)
  // But only add the overflow check when the document itself exceeds 4GB.
  // Currently unneeded because we refuse to parse docs larger or equal to 4GB.
  const uint32_t str_length = uint32_t(dst - payload_begin);
  std::memcpy(current_string_buf_loc, &str_length, sizeof(uint32_t));

  // NUL termination is still handy for callers that expect all strings to be
  // NUL terminated; it comes at a small cost.
  *dst = 0;
  current_string_buf_loc = dst + 1;
}
|
|
|
|
} // namespace stage2
|
|
} // unnamed namespace
|
|
} // namespace arm64
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE2_TAPE_BUILDER_H
|
|
/* end file generic/stage2/tape_builder.h for arm64 */
|
|
/* end file generic/stage2/amalgamated.h for arm64 */
|
|
|
|
//
|
|
// Stage 1
|
|
//
|
|
namespace simdjson {
|
|
namespace arm64 {
|
|
|
|
// Allocate an arm64 dom_parser_implementation into `dst` and size it for the
// requested capacity and nesting depth.
// Returns MEMALLOC when the allocation fails, otherwise the first error
// reported by set_capacity / set_max_depth, or SUCCESS.
simdjson_warn_unused error_code implementation::create_dom_parser_implementation(
    size_t capacity,
    size_t max_depth,
    std::unique_ptr<internal::dom_parser_implementation>& dst
) const noexcept {
  dst.reset(new (std::nothrow) dom_parser_implementation());
  if (!dst) {
    return MEMALLOC;
  }

  const auto capacity_error = dst->set_capacity(capacity);
  if (capacity_error) {
    return capacity_error;
  }

  const auto depth_error = dst->set_max_depth(max_depth);
  if (depth_error) {
    return depth_error;
  }

  return SUCCESS;
}
|
|
|
|
namespace {
|
|
|
|
using namespace simd;
|
|
|
|
// Classify 64 input bytes into a whitespace bitmask and an "op" bitmask.
//
// Each byte is classified by looking up its low nibble in one 16-entry table
// and its high nibble in another, then ANDing the two results. In the
// resulting class byte, any of the bits 0x7 marks an op character and any of
// the bits 0x18 marks whitespace.
simdjson_inline json_character_block json_character_block::classify(const simd::simd8x64<uint8_t>& in) {
  // A map()-with-lambda formulation over the chunks would be nicer, but
  // functional programming causes trouble with Visual Studio, so the four
  // chunks are spelled out explicitly.
  const simd8<uint8_t> low_nibble_table(16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0);
  const simd8<uint8_t> high_nibble_table(8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0);

  const auto class0 = (in.chunks[0] & 0xf).lookup_16(low_nibble_table) & (in.chunks[0].shr<4>()).lookup_16(high_nibble_table);
  const auto class1 = (in.chunks[1] & 0xf).lookup_16(low_nibble_table) & (in.chunks[1].shr<4>()).lookup_16(high_nibble_table);
  const auto class2 = (in.chunks[2] & 0xf).lookup_16(low_nibble_table) & (in.chunks[2].shr<4>()).lookup_16(high_nibble_table);
  const auto class3 = (in.chunks[3] & 0xf).lookup_16(low_nibble_table) & (in.chunks[3].shr<4>()).lookup_16(high_nibble_table);
  simd8x64<uint8_t> v(class0, class1, class2, class3);

  // Whitespace and op are computed separately. Since everything is
  // aggressively inlined, we can hope that a caller needing only one of them
  // (e.g. minification only needs whitespace) gets the other optimized away.
  // *However*, 'v' above is still likely to be computed with two lookup_16
  // per chunk even when only whitespace is needed; that could be done a bit
  // cheaper. The x64 implementations can match whitespace and structurals
  // separately and efficiently: on ARM NEON the table lookups either zero or
  // leave unchanged the characters exceeding 0xF, whereas the x64 equivalent
  // (pshufb) automatically masks away the 4 most significant bits, so the x64
  // code is optimized differently. There is therefore a small untaken
  // optimization opportunity here for pure minification or pure structural
  // detection. We deliberately do not pick it up.
  const uint64_t op = simd8x64<bool>(
    v.chunks[0].any_bits_set(0x7),
    v.chunks[1].any_bits_set(0x7),
    v.chunks[2].any_bits_set(0x7),
    v.chunks[3].any_bits_set(0x7)
  ).to_bitmask();

  const uint64_t whitespace = simd8x64<bool>(
    v.chunks[0].any_bits_set(0x18),
    v.chunks[1].any_bits_set(0x18),
    v.chunks[2].any_bits_set(0x18),
    v.chunks[3].any_bits_set(0x18)
  ).to_bitmask();

  return { whitespace, op };
}
|
|
|
|
// True when no byte of the 64-byte block has its top bit set: OR the chunks
// together and check that the largest resulting byte is below 0x80.
simdjson_inline bool is_ascii(const simd8x64<uint8_t>& input) {
  const simd8<uint8_t> combined = input.reduce_or();
  const auto largest = combined.max_val();
  return largest < 0x80u;
}
|
|
|
|
// For each byte position, report whether the byte must be a UTF-8
// continuation byte, based on the bytes 1, 2 and 3 positions before it.
simdjson_unused simdjson_inline simd8<bool> must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
  // A lead byte >= 0xc0 one position back starts a 2+ byte sequence,
  // >= 0xe0 two positions back a 3+ byte sequence,
  // >= 0xf0 three positions back a 4 byte sequence.
  const simd8<bool> follows_2plus_lead = prev1 >= uint8_t(0xc0u);
  const simd8<bool> follows_3plus_lead = prev2 >= uint8_t(0xe0u);
  const simd8<bool> follows_4_lead = prev3 >= uint8_t(0xf0u);

  // Use ^ instead of | to combine, because ^ is commutative and the caller
  // uses ^ as well. This is fine because errors only have to be reported for
  // cases with 0-1 lead bytes: multiple lead bytes imply two overlapping
  // multibyte characters, and then at least *one* lead byte is guaranteed to
  // be part of only one other multibyte character, where the error will be
  // detected.
  return follows_2plus_lead ^ follows_3plus_lead ^ follows_4_lead;
}
|
|
|
|
// For each byte position, produce a byte whose top bit (>= 0x80) is set when
// the position must hold the third or fourth byte of a multibyte sequence,
// judged from the bytes 2 and 3 positions before it.
simdjson_inline simd8<uint8_t> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
  // After the saturating subtraction only 111_____ remains >= 0x80.
  const simd8<uint8_t> third_byte_marker = prev2.saturating_sub(0xe0u-0x80);
  // After the saturating subtraction only 1111____ remains >= 0x80.
  const simd8<uint8_t> fourth_byte_marker = prev3.saturating_sub(0xf0u-0x80);
  return third_byte_marker | fourth_byte_marker;
}
|
|
|
|
} // unnamed namespace
|
|
} // namespace arm64
|
|
} // namespace simdjson
|
|
|
|
//
|
|
// Stage 2
|
|
//
|
|
|
|
//
|
|
// Implementation-specific overrides
|
|
//
|
|
namespace simdjson {
|
|
namespace arm64 {
|
|
|
|
// arm64 override: minify `buf` into `dst` using the generic stage 1 minifier
// instantiated with a step size of 64.
simdjson_warn_unused error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept {
  const error_code result = arm64::stage1::json_minifier::minify<64>(buf, len, dst, dst_len);
  return result;
}
|
|
|
|
// arm64 override: remember the input buffer and its length on the parser,
// then run the generic structural indexer (step size 64) over it.
simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, stage1_mode streaming) noexcept {
  buf = _buf;
  len = _len;
  return arm64::stage1::json_structural_indexer::index<64>(_buf, _len, *this, streaming);
}
|
|
|
|
// arm64 override: validate `len` bytes at `buf` with the generic stage 1
// UTF-8 validator.
simdjson_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
  const bool is_valid = arm64::stage1::generic_validate_utf8(buf, len);
  return is_valid;
}
|
|
|
|
// arm64 override: build the tape for a whole document (non-streaming mode).
simdjson_warn_unused error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept {
  constexpr bool streaming = false;
  return stage2::tape_builder::parse_document<streaming>(*this, _doc);
}
|
|
|
|
// arm64 override: build the tape for the next document in streaming mode.
simdjson_warn_unused error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept {
  constexpr bool streaming = true;
  return stage2::tape_builder::parse_document<streaming>(*this, _doc);
}
|
|
|
|
// arm64 override: forward to the arm64 string parser. Returns what it returns
// (the end of the written output, or nullptr on a bad escape).
simdjson_warn_unused uint8_t *dom_parser_implementation::parse_string(const uint8_t *src, uint8_t *dst, bool allow_replacement) const noexcept {
  uint8_t *const end_of_output = arm64::stringparsing::parse_string(src, dst, allow_replacement);
  return end_of_output;
}
|
|
|
|
// arm64 override: forward to the arm64 wobbly (WTF-8) string parser. Returns
// what it returns (the end of the written output, or nullptr on a bad escape).
simdjson_warn_unused uint8_t *dom_parser_implementation::parse_wobbly_string(const uint8_t *src, uint8_t *dst) const noexcept {
  uint8_t *const end_of_output = arm64::stringparsing::parse_wobbly_string(src, dst);
  return end_of_output;
}
|
|
|
|
// Parse a complete document: run stage 1 in regular mode and, if it reports
// no error, run stage 2 into `_doc`.
simdjson_warn_unused error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept {
  if (auto stage1_error = stage1(_buf, _len, stage1_mode::regular)) {
    return stage1_error;
  }
  return stage2(_doc);
}
|
|
|
|
} // namespace arm64
|
|
} // namespace simdjson
|
|
|
|
/* including simdjson/arm64/end.h: #include <simdjson/arm64/end.h> */
|
|
/* begin file simdjson/arm64/end.h */
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/arm64/base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
#undef SIMDJSON_SKIP_BACKSLASH_SHORT_CIRCUIT
|
|
/* undefining SIMDJSON_IMPLEMENTATION from "arm64" */
|
|
#undef SIMDJSON_IMPLEMENTATION
|
|
/* end file simdjson/arm64/end.h */
|
|
|
|
#endif // SIMDJSON_SRC_ARM64_CPP
|
|
/* end file arm64.cpp */
|
|
#endif
|
|
#if SIMDJSON_IMPLEMENTATION_HASWELL
|
|
/* including haswell.cpp: #include <haswell.cpp> */
|
|
/* begin file haswell.cpp */
|
|
#ifndef SIMDJSON_SRC_HASWELL_CPP
|
|
#define SIMDJSON_SRC_HASWELL_CPP
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include <base.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
/* including simdjson/haswell.h: #include <simdjson/haswell.h> */
|
|
/* begin file simdjson/haswell.h */
|
|
#ifndef SIMDJSON_HASWELL_H
|
|
#define SIMDJSON_HASWELL_H
|
|
|
|
/* including simdjson/haswell/begin.h: #include "simdjson/haswell/begin.h" */
|
|
/* begin file simdjson/haswell/begin.h */
|
|
/* defining SIMDJSON_IMPLEMENTATION to "haswell" */
|
|
#define SIMDJSON_IMPLEMENTATION haswell
|
|
|
|
/* including simdjson/haswell/base.h: #include "simdjson/haswell/base.h" */
|
|
/* begin file simdjson/haswell/base.h */
|
|
#ifndef SIMDJSON_HASWELL_BASE_H
|
|
#define SIMDJSON_HASWELL_BASE_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
// The constructor may be executed on any host, so we take care not to use SIMDJSON_TARGET_HASWELL
|
|
namespace simdjson {
|
|
/**
|
|
* Implementation for Haswell (Intel AVX2).
|
|
*/
|
|
namespace haswell {
|
|
|
|
class implementation;
|
|
|
|
namespace {
|
|
namespace simd {
|
|
template <typename T> struct simd8;
|
|
template <typename T> struct simd8x64;
|
|
} // namespace simd
|
|
} // unnamed namespace
|
|
|
|
} // namespace haswell
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_HASWELL_BASE_H
|
|
/* end file simdjson/haswell/base.h */
|
|
/* including simdjson/haswell/intrinsics.h: #include "simdjson/haswell/intrinsics.h" */
|
|
/* begin file simdjson/haswell/intrinsics.h */
|
|
#ifndef SIMDJSON_HASWELL_INTRINSICS_H
|
|
#define SIMDJSON_HASWELL_INTRINSICS_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/haswell/base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
#if SIMDJSON_VISUAL_STUDIO
|
|
// under clang within visual studio, this will include <x86intrin.h>
|
|
#include <intrin.h> // visual studio or clang
|
|
#else
|
|
#include <x86intrin.h> // elsewhere
|
|
#endif // SIMDJSON_VISUAL_STUDIO
|
|
|
|
#if SIMDJSON_CLANG_VISUAL_STUDIO
|
|
/**
|
|
* You are not supposed, normally, to include these
|
|
* headers directly. Instead you should either include intrin.h
|
|
* or x86intrin.h. However, when compiling with clang
|
|
* under Windows (i.e., when _MSC_VER is set), these headers
|
|
* only get included *if* the corresponding features are detected
|
|
* from macros:
|
|
* e.g., if __AVX2__ is set... in turn, we normally set these
|
|
* macros by compiling against the corresponding architecture
|
|
* (e.g., arch:AVX2, -mavx2, etc.) which compiles the whole
|
|
* software with these advanced instructions. In simdjson, we
|
|
* want to compile the whole program for a generic target,
|
|
* and only target our specific kernels. As a workaround,
|
|
* we directly include the needed headers. These headers would
|
|
* normally guard against such usage, but we carefully included
|
|
* <x86intrin.h> (or <intrin.h>) before, so the headers
|
|
* are fooled.
|
|
*/
|
|
#include <bmiintrin.h> // for _blsr_u64
|
|
#include <lzcntintrin.h> // for __lzcnt64
|
|
#include <immintrin.h> // for most things (AVX2, AVX512, _popcnt64)
|
|
#include <smmintrin.h>
|
|
#include <tmmintrin.h>
|
|
#include <avxintrin.h>
|
|
#include <avx2intrin.h>
|
|
#include <wmmintrin.h> // for _mm_clmulepi64_si128
|
|
// unfortunately, we may not get _blsr_u64, but, thankfully, clang
// has it as a macro.
#ifndef _blsr_u64
// We roll our own: clear the lowest set bit via (n - 1) & n.
// The argument is parenthesized so that expression arguments such as
// `a | b` keep their meaning after macro expansion.
#define _blsr_u64(n) (((n) - 1) & (n))
#endif // _blsr_u64
|
|
#endif // SIMDJSON_CLANG_VISUAL_STUDIO
|
|
|
|
static_assert(sizeof(__m256i) <= simdjson::SIMDJSON_PADDING, "insufficient padding for haswell kernel.");
|
|
|
|
#endif // SIMDJSON_HASWELL_INTRINSICS_H
|
|
/* end file simdjson/haswell/intrinsics.h */
|
|
|
|
#if !SIMDJSON_CAN_ALWAYS_RUN_HASWELL
|
|
SIMDJSON_TARGET_REGION("avx2,bmi,pclmul,lzcnt,popcnt")
|
|
#endif
|
|
|
|
/* including simdjson/haswell/bitmanipulation.h: #include "simdjson/haswell/bitmanipulation.h" */
|
|
/* begin file simdjson/haswell/bitmanipulation.h */
|
|
#ifndef SIMDJSON_HASWELL_BITMANIPULATION_H
|
|
#define SIMDJSON_HASWELL_BITMANIPULATION_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/haswell/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/haswell/intrinsics.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/haswell/bitmask.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace haswell {
|
|
namespace {
|
|
|
|
// We sometimes call trailing_zero on inputs that are zero,
|
|
// but the algorithms do not end up using the returned value.
|
|
// Sadly, sanitizers are not smart enough to figure it out.
|
|
SIMDJSON_NO_SANITIZE_UNDEFINED
|
|
// This function can be used safely even if not all bytes have been
|
|
// initialized.
|
|
// See issue https://github.com/simdjson/simdjson/issues/1965
|
|
SIMDJSON_NO_SANITIZE_MEMORY
|
|
simdjson_inline int trailing_zeroes(uint64_t input_num) {
|
|
#if SIMDJSON_REGULAR_VISUAL_STUDIO
|
|
return (int)_tzcnt_u64(input_num);
|
|
#else // SIMDJSON_REGULAR_VISUAL_STUDIO
|
|
////////
|
|
// You might expect the next line to be equivalent to
|
|
// return (int)_tzcnt_u64(input_num);
|
|
// but the generated code differs and might be less efficient?
|
|
////////
|
|
return __builtin_ctzll(input_num);
|
|
#endif // SIMDJSON_REGULAR_VISUAL_STUDIO
|
|
}
|
|
|
|
/* result might be undefined when input_num is zero */
|
|
simdjson_inline uint64_t clear_lowest_bit(uint64_t input_num) {
|
|
return _blsr_u64(input_num);
|
|
}
|
|
|
|
/* result might be undefined when input_num is zero */
|
|
simdjson_inline int leading_zeroes(uint64_t input_num) {
|
|
return int(_lzcnt_u64(input_num));
|
|
}
|
|
|
|
#if SIMDJSON_REGULAR_VISUAL_STUDIO
|
|
simdjson_inline unsigned __int64 count_ones(uint64_t input_num) {
|
|
// note: we do not support legacy 32-bit Windows in this kernel
|
|
return __popcnt64(input_num);// Visual Studio wants two underscores
|
|
}
|
|
#else
|
|
simdjson_inline long long int count_ones(uint64_t input_num) {
|
|
return _popcnt64(input_num);
|
|
}
|
|
#endif
|
|
|
|
// Adds value1 and value2, storing the sum through `result`.
// The return value is the carry/overflow indication reported by the
// toolchain primitive used (_addcarry_u64 or __builtin_uaddll_overflow).
simdjson_inline bool add_overflow(uint64_t value1, uint64_t value2,
                                  uint64_t *result) {
#if SIMDJSON_REGULAR_VISUAL_STUDIO
  unsigned __int64 *const sum_out = reinterpret_cast<unsigned __int64 *>(result);
  return _addcarry_u64(0, value1, value2, sum_out);
#else
  unsigned long long *const sum_out = reinterpret_cast<unsigned long long *>(result);
  return __builtin_uaddll_overflow(value1, value2, sum_out);
#endif
}
|
|
|
|
} // unnamed namespace
|
|
} // namespace haswell
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_HASWELL_BITMANIPULATION_H
|
|
/* end file simdjson/haswell/bitmanipulation.h */
|
|
/* including simdjson/haswell/bitmask.h: #include "simdjson/haswell/bitmask.h" */
|
|
/* begin file simdjson/haswell/bitmask.h */
|
|
#ifndef SIMDJSON_HASWELL_BITMASK_H
|
|
#define SIMDJSON_HASWELL_BITMASK_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/haswell/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/haswell/intrinsics.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace haswell {
|
|
namespace {
|
|
|
|
//
|
|
// Perform a "cumulative bitwise xor," flipping bits each time a 1 is encountered.
|
|
//
|
|
// For example, prefix_xor(00100100) == 00011100
|
|
//
|
|
simdjson_inline uint64_t prefix_xor(const uint64_t bitmask) {
|
|
// There should be no such thing with a processor supporting avx2
|
|
// but not clmul.
|
|
__m128i all_ones = _mm_set1_epi8('\xFF');
|
|
__m128i result = _mm_clmulepi64_si128(_mm_set_epi64x(0ULL, bitmask), all_ones, 0);
|
|
return _mm_cvtsi128_si64(result);
|
|
}
|
|
|
|
} // unnamed namespace
|
|
} // namespace haswell
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_HASWELL_BITMASK_H
|
|
/* end file simdjson/haswell/bitmask.h */
|
|
/* including simdjson/haswell/numberparsing_defs.h: #include "simdjson/haswell/numberparsing_defs.h" */
|
|
/* begin file simdjson/haswell/numberparsing_defs.h */
|
|
#ifndef SIMDJSON_HASWELL_NUMBERPARSING_DEFS_H
|
|
#define SIMDJSON_HASWELL_NUMBERPARSING_DEFS_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/haswell/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/haswell/intrinsics.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/numberparsing_tables.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace haswell {
|
|
namespace numberparsing {
|
|
|
|
/** @private */
|
|
static simdjson_inline uint32_t parse_eight_digits_unrolled(const uint8_t *chars) {
|
|
// this actually computes *16* values so we are being wasteful.
|
|
const __m128i ascii0 = _mm_set1_epi8('0');
|
|
const __m128i mul_1_10 =
|
|
_mm_setr_epi8(10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1);
|
|
const __m128i mul_1_100 = _mm_setr_epi16(100, 1, 100, 1, 100, 1, 100, 1);
|
|
const __m128i mul_1_10000 =
|
|
_mm_setr_epi16(10000, 1, 10000, 1, 10000, 1, 10000, 1);
|
|
const __m128i input = _mm_sub_epi8(
|
|
_mm_loadu_si128(reinterpret_cast<const __m128i *>(chars)), ascii0);
|
|
const __m128i t1 = _mm_maddubs_epi16(input, mul_1_10);
|
|
const __m128i t2 = _mm_madd_epi16(t1, mul_1_100);
|
|
const __m128i t3 = _mm_packus_epi32(t2, t2);
|
|
const __m128i t4 = _mm_madd_epi16(t3, mul_1_10000);
|
|
return _mm_cvtsi128_si32(
|
|
t4); // only captures the sum of the first 8 digits, drop the rest
|
|
}
|
|
|
|
/** @private */
|
|
simdjson_inline internal::value128 full_multiplication(uint64_t value1, uint64_t value2) {
|
|
internal::value128 answer;
|
|
#if SIMDJSON_REGULAR_VISUAL_STUDIO || SIMDJSON_IS_32BITS
|
|
#if SIMDJSON_IS_ARM64
|
|
// ARM64 has native support for 64-bit multiplications, no need to emultate
|
|
answer.high = __umulh(value1, value2);
|
|
answer.low = value1 * value2;
|
|
#else
|
|
answer.low = _umul128(value1, value2, &answer.high); // _umul128 not available on ARM64
|
|
#endif // SIMDJSON_IS_ARM64
|
|
#else // SIMDJSON_REGULAR_VISUAL_STUDIO || SIMDJSON_IS_32BITS
|
|
__uint128_t r = (static_cast<__uint128_t>(value1)) * value2;
|
|
answer.low = uint64_t(r);
|
|
answer.high = uint64_t(r >> 64);
|
|
#endif
|
|
return answer;
|
|
}
|
|
|
|
} // namespace numberparsing
|
|
} // namespace haswell
|
|
} // namespace simdjson
|
|
|
|
#define SIMDJSON_SWAR_NUMBER_PARSING 1
|
|
|
|
#endif // SIMDJSON_HASWELL_NUMBERPARSING_DEFS_H
|
|
/* end file simdjson/haswell/numberparsing_defs.h */
|
|
/* including simdjson/haswell/simd.h: #include "simdjson/haswell/simd.h" */
|
|
/* begin file simdjson/haswell/simd.h */
|
|
#ifndef SIMDJSON_HASWELL_SIMD_H
|
|
#define SIMDJSON_HASWELL_SIMD_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/haswell/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/haswell/intrinsics.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/haswell/bitmanipulation.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/simdprune_tables.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace haswell {
|
|
namespace {
|
|
namespace simd {
|
|
|
|
// Forward-declared so they can be used by splat and friends.
|
|
template<typename Child>
|
|
struct base {
|
|
__m256i value;
|
|
|
|
// Zero constructor
|
|
simdjson_inline base() : value{__m256i()} {}
|
|
|
|
// Conversion from SIMD register
|
|
simdjson_inline base(const __m256i _value) : value(_value) {}
|
|
|
|
// Conversion to SIMD register
|
|
simdjson_inline operator const __m256i&() const { return this->value; }
|
|
simdjson_inline operator __m256i&() { return this->value; }
|
|
|
|
// Bit operations
|
|
simdjson_inline Child operator|(const Child other) const { return _mm256_or_si256(*this, other); }
|
|
simdjson_inline Child operator&(const Child other) const { return _mm256_and_si256(*this, other); }
|
|
simdjson_inline Child operator^(const Child other) const { return _mm256_xor_si256(*this, other); }
|
|
simdjson_inline Child bit_andnot(const Child other) const { return _mm256_andnot_si256(other, *this); }
|
|
simdjson_inline Child& operator|=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast | other; return *this_cast; }
|
|
simdjson_inline Child& operator&=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast & other; return *this_cast; }
|
|
simdjson_inline Child& operator^=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast ^ other; return *this_cast; }
|
|
};
|
|
|
|
// Forward-declared so they can be used by splat and friends.
|
|
template<typename T>
|
|
struct simd8;
|
|
|
|
template<typename T, typename Mask=simd8<bool>>
|
|
struct base8: base<simd8<T>> {
|
|
typedef uint32_t bitmask_t;
|
|
typedef uint64_t bitmask2_t;
|
|
|
|
simdjson_inline base8() : base<simd8<T>>() {}
|
|
simdjson_inline base8(const __m256i _value) : base<simd8<T>>(_value) {}
|
|
|
|
friend simdjson_really_inline Mask operator==(const simd8<T> lhs, const simd8<T> rhs) { return _mm256_cmpeq_epi8(lhs, rhs); }
|
|
|
|
static const int SIZE = sizeof(base<T>::value);
|
|
|
|
template<int N=1>
|
|
simdjson_inline simd8<T> prev(const simd8<T> prev_chunk) const {
|
|
return _mm256_alignr_epi8(*this, _mm256_permute2x128_si256(prev_chunk, *this, 0x21), 16 - N);
|
|
}
|
|
};
|
|
|
|
// SIMD byte mask type (returned by things like eq and gt)
|
|
template<>
|
|
struct simd8<bool>: base8<bool> {
|
|
static simdjson_inline simd8<bool> splat(bool _value) { return _mm256_set1_epi8(uint8_t(-(!!_value))); }
|
|
|
|
simdjson_inline simd8() : base8() {}
|
|
simdjson_inline simd8(const __m256i _value) : base8<bool>(_value) {}
|
|
// Splat constructor
|
|
simdjson_inline simd8(bool _value) : base8<bool>(splat(_value)) {}
|
|
|
|
simdjson_inline int to_bitmask() const { return _mm256_movemask_epi8(*this); }
|
|
simdjson_inline bool any() const { return !_mm256_testz_si256(*this, *this); }
|
|
simdjson_inline simd8<bool> operator~() const { return *this ^ true; }
|
|
};
|
|
|
|
template<typename T>
|
|
struct base8_numeric: base8<T> {
|
|
static simdjson_inline simd8<T> splat(T _value) { return _mm256_set1_epi8(_value); }
|
|
static simdjson_inline simd8<T> zero() { return _mm256_setzero_si256(); }
|
|
static simdjson_inline simd8<T> load(const T values[32]) {
|
|
return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(values));
|
|
}
|
|
// Repeat 16 values as many times as necessary (usually for lookup tables)
|
|
static simdjson_inline simd8<T> repeat_16(
|
|
T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7,
|
|
T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15
|
|
) {
|
|
return simd8<T>(
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15,
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15
|
|
);
|
|
}
|
|
|
|
simdjson_inline base8_numeric() : base8<T>() {}
|
|
simdjson_inline base8_numeric(const __m256i _value) : base8<T>(_value) {}
|
|
|
|
// Store to array
|
|
simdjson_inline void store(T dst[32]) const { return _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), *this); }
|
|
|
|
// Addition/subtraction are the same for signed and unsigned
|
|
simdjson_inline simd8<T> operator+(const simd8<T> other) const { return _mm256_add_epi8(*this, other); }
|
|
simdjson_inline simd8<T> operator-(const simd8<T> other) const { return _mm256_sub_epi8(*this, other); }
|
|
simdjson_inline simd8<T>& operator+=(const simd8<T> other) { *this = *this + other; return *static_cast<simd8<T>*>(this); }
|
|
simdjson_inline simd8<T>& operator-=(const simd8<T> other) { *this = *this - other; return *static_cast<simd8<T>*>(this); }
|
|
|
|
// Override to distinguish from bool version
|
|
simdjson_inline simd8<T> operator~() const { return *this ^ 0xFFu; }
|
|
|
|
// Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values)
|
|
template<typename L>
|
|
simdjson_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
|
|
return _mm256_shuffle_epi8(lookup_table, *this);
|
|
}
|
|
|
|
// Copies to 'output" all bytes corresponding to a 0 in the mask (interpreted as a bitset).
|
|
// Passing a 0 value for mask would be equivalent to writing out every byte to output.
|
|
// Only the first 32 - count_ones(mask) bytes of the result are significant but 32 bytes
|
|
// get written.
|
|
// Design consideration: it seems like a function with the
|
|
// signature simd8<L> compress(uint32_t mask) would be
|
|
// sensible, but the AVX ISA makes this kind of approach difficult.
|
|
template<typename L>
|
|
simdjson_inline void compress(uint32_t mask, L * output) const {
|
|
using internal::thintable_epi8;
|
|
using internal::BitsSetTable256mul2;
|
|
using internal::pshufb_combine_table;
|
|
// this particular implementation was inspired by work done by @animetosho
|
|
// we do it in four steps, first 8 bytes and then second 8 bytes...
|
|
uint8_t mask1 = uint8_t(mask); // least significant 8 bits
|
|
uint8_t mask2 = uint8_t(mask >> 8); // second least significant 8 bits
|
|
uint8_t mask3 = uint8_t(mask >> 16); // ...
|
|
uint8_t mask4 = uint8_t(mask >> 24); // ...
|
|
// next line just loads the 64-bit values thintable_epi8[mask1] and
|
|
// thintable_epi8[mask2] into a 128-bit register, using only
|
|
// two instructions on most compilers.
|
|
__m256i shufmask = _mm256_set_epi64x(thintable_epi8[mask4], thintable_epi8[mask3],
|
|
thintable_epi8[mask2], thintable_epi8[mask1]);
|
|
// we increment by 0x08 the second half of the mask and so forth
|
|
shufmask =
|
|
_mm256_add_epi8(shufmask, _mm256_set_epi32(0x18181818, 0x18181818,
|
|
0x10101010, 0x10101010, 0x08080808, 0x08080808, 0, 0));
|
|
// this is the version "nearly pruned"
|
|
__m256i pruned = _mm256_shuffle_epi8(*this, shufmask);
|
|
// we still need to put the pieces back together.
|
|
// we compute the popcount of the first words:
|
|
int pop1 = BitsSetTable256mul2[mask1];
|
|
int pop3 = BitsSetTable256mul2[mask3];
|
|
|
|
// then load the corresponding mask
|
|
// could be done with _mm256_loadu2_m128i but many standard libraries omit this intrinsic.
|
|
__m256i v256 = _mm256_castsi128_si256(
|
|
_mm_loadu_si128(reinterpret_cast<const __m128i *>(pshufb_combine_table + pop1 * 8)));
|
|
__m256i compactmask = _mm256_insertf128_si256(v256,
|
|
_mm_loadu_si128(reinterpret_cast<const __m128i *>(pshufb_combine_table + pop3 * 8)), 1);
|
|
__m256i almostthere = _mm256_shuffle_epi8(pruned, compactmask);
|
|
// We just need to write out the result.
|
|
// This is the tricky bit that is hard to do
|
|
// if we want to return a SIMD register, since there
|
|
// is no single-instruction approach to recombine
|
|
// the two 128-bit lanes with an offset.
|
|
__m128i v128;
|
|
v128 = _mm256_castsi256_si128(almostthere);
|
|
_mm_storeu_si128( reinterpret_cast<__m128i *>(output), v128);
|
|
v128 = _mm256_extractf128_si256(almostthere, 1);
|
|
_mm_storeu_si128( reinterpret_cast<__m128i *>(output + 16 - count_ones(mask & 0xFFFF)), v128);
|
|
}
|
|
|
|
template<typename L>
|
|
simdjson_inline simd8<L> lookup_16(
|
|
L replace0, L replace1, L replace2, L replace3,
|
|
L replace4, L replace5, L replace6, L replace7,
|
|
L replace8, L replace9, L replace10, L replace11,
|
|
L replace12, L replace13, L replace14, L replace15) const {
|
|
return lookup_16(simd8<L>::repeat_16(
|
|
replace0, replace1, replace2, replace3,
|
|
replace4, replace5, replace6, replace7,
|
|
replace8, replace9, replace10, replace11,
|
|
replace12, replace13, replace14, replace15
|
|
));
|
|
}
|
|
};
|
|
|
|
// Signed bytes
|
|
template<>
|
|
struct simd8<int8_t> : base8_numeric<int8_t> {
|
|
simdjson_inline simd8() : base8_numeric<int8_t>() {}
|
|
simdjson_inline simd8(const __m256i _value) : base8_numeric<int8_t>(_value) {}
|
|
// Splat constructor
|
|
simdjson_inline simd8(int8_t _value) : simd8(splat(_value)) {}
|
|
// Array constructor
|
|
simdjson_inline simd8(const int8_t values[32]) : simd8(load(values)) {}
|
|
// Member-by-member initialization
|
|
simdjson_inline simd8(
|
|
int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
|
|
int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15,
|
|
int8_t v16, int8_t v17, int8_t v18, int8_t v19, int8_t v20, int8_t v21, int8_t v22, int8_t v23,
|
|
int8_t v24, int8_t v25, int8_t v26, int8_t v27, int8_t v28, int8_t v29, int8_t v30, int8_t v31
|
|
) : simd8(_mm256_setr_epi8(
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15,
|
|
v16,v17,v18,v19,v20,v21,v22,v23,
|
|
v24,v25,v26,v27,v28,v29,v30,v31
|
|
)) {}
|
|
// Repeat 16 values as many times as necessary (usually for lookup tables)
|
|
simdjson_inline static simd8<int8_t> repeat_16(
|
|
int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
|
|
int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
|
|
) {
|
|
return simd8<int8_t>(
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15,
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15
|
|
);
|
|
}
|
|
|
|
// Order-sensitive comparisons
|
|
simdjson_inline simd8<int8_t> max_val(const simd8<int8_t> other) const { return _mm256_max_epi8(*this, other); }
|
|
simdjson_inline simd8<int8_t> min_val(const simd8<int8_t> other) const { return _mm256_min_epi8(*this, other); }
|
|
simdjson_inline simd8<bool> operator>(const simd8<int8_t> other) const { return _mm256_cmpgt_epi8(*this, other); }
|
|
simdjson_inline simd8<bool> operator<(const simd8<int8_t> other) const { return _mm256_cmpgt_epi8(other, *this); }
|
|
};
|
|
|
|
// Unsigned bytes
|
|
template<>
|
|
struct simd8<uint8_t>: base8_numeric<uint8_t> {
|
|
simdjson_inline simd8() : base8_numeric<uint8_t>() {}
|
|
simdjson_inline simd8(const __m256i _value) : base8_numeric<uint8_t>(_value) {}
|
|
// Splat constructor
|
|
simdjson_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
|
|
// Array constructor
|
|
simdjson_inline simd8(const uint8_t values[32]) : simd8(load(values)) {}
|
|
// Member-by-member initialization
|
|
simdjson_inline simd8(
|
|
uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
|
|
uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15,
|
|
uint8_t v16, uint8_t v17, uint8_t v18, uint8_t v19, uint8_t v20, uint8_t v21, uint8_t v22, uint8_t v23,
|
|
uint8_t v24, uint8_t v25, uint8_t v26, uint8_t v27, uint8_t v28, uint8_t v29, uint8_t v30, uint8_t v31
|
|
) : simd8(_mm256_setr_epi8(
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15,
|
|
v16,v17,v18,v19,v20,v21,v22,v23,
|
|
v24,v25,v26,v27,v28,v29,v30,v31
|
|
)) {}
|
|
// Repeat 16 values as many times as necessary (usually for lookup tables)
|
|
simdjson_inline static simd8<uint8_t> repeat_16(
|
|
uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
|
|
uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
|
|
) {
|
|
return simd8<uint8_t>(
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15,
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15
|
|
);
|
|
}
|
|
|
|
// Saturated math
|
|
simdjson_inline simd8<uint8_t> saturating_add(const simd8<uint8_t> other) const { return _mm256_adds_epu8(*this, other); }
|
|
simdjson_inline simd8<uint8_t> saturating_sub(const simd8<uint8_t> other) const { return _mm256_subs_epu8(*this, other); }
|
|
|
|
// Order-specific operations
|
|
simdjson_inline simd8<uint8_t> max_val(const simd8<uint8_t> other) const { return _mm256_max_epu8(*this, other); }
|
|
simdjson_inline simd8<uint8_t> min_val(const simd8<uint8_t> other) const { return _mm256_min_epu8(other, *this); }
|
|
// Same as >, but only guarantees true is nonzero (< guarantees true = -1)
|
|
simdjson_inline simd8<uint8_t> gt_bits(const simd8<uint8_t> other) const { return this->saturating_sub(other); }
|
|
// Same as <, but only guarantees true is nonzero (< guarantees true = -1)
|
|
simdjson_inline simd8<uint8_t> lt_bits(const simd8<uint8_t> other) const { return other.saturating_sub(*this); }
|
|
simdjson_inline simd8<bool> operator<=(const simd8<uint8_t> other) const { return other.max_val(*this) == other; }
|
|
simdjson_inline simd8<bool> operator>=(const simd8<uint8_t> other) const { return other.min_val(*this) == other; }
|
|
simdjson_inline simd8<bool> operator>(const simd8<uint8_t> other) const { return this->gt_bits(other).any_bits_set(); }
|
|
simdjson_inline simd8<bool> operator<(const simd8<uint8_t> other) const { return this->lt_bits(other).any_bits_set(); }
|
|
|
|
// Bit-specific operations
|
|
simdjson_inline simd8<bool> bits_not_set() const { return *this == uint8_t(0); }
|
|
simdjson_inline simd8<bool> bits_not_set(simd8<uint8_t> bits) const { return (*this & bits).bits_not_set(); }
|
|
simdjson_inline simd8<bool> any_bits_set() const { return ~this->bits_not_set(); }
|
|
simdjson_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const { return ~this->bits_not_set(bits); }
|
|
simdjson_inline bool is_ascii() const { return _mm256_movemask_epi8(*this) == 0; }
|
|
simdjson_inline bool bits_not_set_anywhere() const { return _mm256_testz_si256(*this, *this); }
|
|
simdjson_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); }
|
|
simdjson_inline bool bits_not_set_anywhere(simd8<uint8_t> bits) const { return _mm256_testz_si256(*this, bits); }
|
|
simdjson_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const { return !bits_not_set_anywhere(bits); }
|
|
template<int N>
|
|
simdjson_inline simd8<uint8_t> shr() const { return simd8<uint8_t>(_mm256_srli_epi16(*this, N)) & uint8_t(0xFFu >> N); }
|
|
template<int N>
|
|
simdjson_inline simd8<uint8_t> shl() const { return simd8<uint8_t>(_mm256_slli_epi16(*this, N)) & uint8_t(0xFFu << N); }
|
|
// Get one of the bits and make a bitmask out of it.
|
|
// e.g. value.get_bit<7>() gets the high bit
|
|
template<int N>
|
|
simdjson_inline int get_bit() const { return _mm256_movemask_epi8(_mm256_slli_epi16(*this, 7-N)); }
|
|
};
|
|
|
|
template<typename T>
|
|
struct simd8x64 {
|
|
static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
|
|
static_assert(NUM_CHUNKS == 2, "Haswell kernel should use two registers per 64-byte block.");
|
|
const simd8<T> chunks[NUM_CHUNKS];
|
|
|
|
simd8x64(const simd8x64<T>& o) = delete; // no copy allowed
|
|
simd8x64<T>& operator=(const simd8<T>& other) = delete; // no assignment allowed
|
|
simd8x64() = delete; // no default constructor allowed
|
|
|
|
simdjson_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1) : chunks{chunk0, chunk1} {}
|
|
simdjson_inline simd8x64(const T ptr[64]) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+32)} {}
|
|
|
|
simdjson_inline uint64_t compress(uint64_t mask, T * output) const {
|
|
uint32_t mask1 = uint32_t(mask);
|
|
uint32_t mask2 = uint32_t(mask >> 32);
|
|
this->chunks[0].compress(mask1, output);
|
|
this->chunks[1].compress(mask2, output + 32 - count_ones(mask1));
|
|
return 64 - count_ones(mask);
|
|
}
|
|
|
|
simdjson_inline void store(T ptr[64]) const {
|
|
this->chunks[0].store(ptr+sizeof(simd8<T>)*0);
|
|
this->chunks[1].store(ptr+sizeof(simd8<T>)*1);
|
|
}
|
|
|
|
simdjson_inline uint64_t to_bitmask() const {
|
|
uint64_t r_lo = uint32_t(this->chunks[0].to_bitmask());
|
|
uint64_t r_hi = this->chunks[1].to_bitmask();
|
|
return r_lo | (r_hi << 32);
|
|
}
|
|
|
|
simdjson_inline simd8<T> reduce_or() const {
|
|
return this->chunks[0] | this->chunks[1];
|
|
}
|
|
|
|
simdjson_inline simd8x64<T> bit_or(const T m) const {
|
|
const simd8<T> mask = simd8<T>::splat(m);
|
|
return simd8x64<T>(
|
|
this->chunks[0] | mask,
|
|
this->chunks[1] | mask
|
|
);
|
|
}
|
|
|
|
simdjson_inline uint64_t eq(const T m) const {
|
|
const simd8<T> mask = simd8<T>::splat(m);
|
|
return simd8x64<bool>(
|
|
this->chunks[0] == mask,
|
|
this->chunks[1] == mask
|
|
).to_bitmask();
|
|
}
|
|
|
|
simdjson_inline uint64_t eq(const simd8x64<uint8_t> &other) const {
|
|
return simd8x64<bool>(
|
|
this->chunks[0] == other.chunks[0],
|
|
this->chunks[1] == other.chunks[1]
|
|
).to_bitmask();
|
|
}
|
|
|
|
simdjson_inline uint64_t lteq(const T m) const {
|
|
const simd8<T> mask = simd8<T>::splat(m);
|
|
return simd8x64<bool>(
|
|
this->chunks[0] <= mask,
|
|
this->chunks[1] <= mask
|
|
).to_bitmask();
|
|
}
|
|
}; // struct simd8x64<T>
|
|
|
|
} // namespace simd
|
|
|
|
} // unnamed namespace
|
|
} // namespace haswell
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_HASWELL_SIMD_H
|
|
/* end file simdjson/haswell/simd.h */
|
|
/* including simdjson/haswell/stringparsing_defs.h: #include "simdjson/haswell/stringparsing_defs.h" */
|
|
/* begin file simdjson/haswell/stringparsing_defs.h */
|
|
#ifndef SIMDJSON_HASWELL_STRINGPARSING_DEFS_H
|
|
#define SIMDJSON_HASWELL_STRINGPARSING_DEFS_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/haswell/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/haswell/simd.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/haswell/bitmanipulation.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace haswell {
|
|
namespace {
|
|
|
|
using namespace simd;
|
|
|
|
// Holds backslashes and quotes locations.
|
|
struct backslash_and_quote {
|
|
public:
|
|
static constexpr uint32_t BYTES_PROCESSED = 32;
|
|
simdjson_inline static backslash_and_quote copy_and_find(const uint8_t *src, uint8_t *dst);
|
|
|
|
simdjson_inline bool has_quote_first() { return ((bs_bits - 1) & quote_bits) != 0; }
|
|
simdjson_inline bool has_backslash() { return ((quote_bits - 1) & bs_bits) != 0; }
|
|
simdjson_inline int quote_index() { return trailing_zeroes(quote_bits); }
|
|
simdjson_inline int backslash_index() { return trailing_zeroes(bs_bits); }
|
|
|
|
uint32_t bs_bits;
|
|
uint32_t quote_bits;
|
|
}; // struct backslash_and_quote
|
|
|
|
// Copies BYTES_PROCESSED bytes from src to dst and reports, as bitmasks,
// which of those bytes are backslashes and which are double quotes.
simdjson_inline backslash_and_quote backslash_and_quote::copy_and_find(const uint8_t *src, uint8_t *dst) {
  // this can read up to BYTES_PROCESSED - 1 (31) bytes beyond the buffer size,
  // but we require SIMDJSON_PADDING of padding
  static_assert(SIMDJSON_PADDING >= (BYTES_PROCESSED - 1), "backslash and quote finder must process fewer than SIMDJSON_PADDING bytes");
  const simd8<uint8_t> chunk(src);
  // store to dest unconditionally - we can overwrite the bits we don't like later
  chunk.store(dst);
  const uint32_t backslashes = static_cast<uint32_t>((chunk == '\\').to_bitmask());
  const uint32_t quotes = static_cast<uint32_t>((chunk == '"').to_bitmask());
  return {
    backslashes, // bs_bits
    quotes,      // quote_bits
  };
}
|
|
|
|
} // unnamed namespace
|
|
} // namespace haswell
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_HASWELL_STRINGPARSING_DEFS_H
|
|
/* end file simdjson/haswell/stringparsing_defs.h */
|
|
/* end file simdjson/haswell/begin.h */
|
|
/* including simdjson/generic/amalgamated.h for haswell: #include "simdjson/generic/amalgamated.h" */
|
|
/* begin file simdjson/generic/amalgamated.h for haswell */
|
|
#if defined(SIMDJSON_CONDITIONAL_INCLUDE) && !defined(SIMDJSON_GENERIC_DEPENDENCIES_H)
|
|
#error simdjson/generic/dependencies.h must be included before simdjson/generic/amalgamated.h!
|
|
#endif
|
|
|
|
/* including simdjson/generic/base.h for haswell: #include "simdjson/generic/base.h" */
|
|
/* begin file simdjson/generic/base.h for haswell */
|
|
#ifndef SIMDJSON_GENERIC_BASE_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_GENERIC_BASE_H */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/base.h" */
|
|
/* amalgamation skipped (editor-only): // If we haven't got an implementation yet, we're in the editor, editing a generic file! Just */
|
|
/* amalgamation skipped (editor-only): // use the most advanced one we can so the most possible stuff can be tested. */
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_IMPLEMENTATION */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/implementation_detection.h" */
|
|
/* amalgamation skipped (editor-only): #if SIMDJSON_IMPLEMENTATION_ICELAKE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/icelake/begin.h" */
|
|
/* amalgamation skipped (editor-only): #elif SIMDJSON_IMPLEMENTATION_HASWELL */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/haswell/begin.h" */
|
|
/* amalgamation skipped (editor-only): #elif SIMDJSON_IMPLEMENTATION_WESTMERE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/westmere/begin.h" */
|
|
/* amalgamation skipped (editor-only): #elif SIMDJSON_IMPLEMENTATION_ARM64 */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/arm64/begin.h" */
|
|
/* amalgamation skipped (editor-only): #elif SIMDJSON_IMPLEMENTATION_PPC64 */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/ppc64/begin.h" */
|
|
/* amalgamation skipped (editor-only): #elif SIMDJSON_IMPLEMENTATION_LSX */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/lsx/begin.h" */
|
|
/* amalgamation skipped (editor-only): #elif SIMDJSON_IMPLEMENTATION_LASX */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/lasx/begin.h" */
|
|
/* amalgamation skipped (editor-only): #elif SIMDJSON_IMPLEMENTATION_FALLBACK */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/fallback/begin.h" */
|
|
/* amalgamation skipped (editor-only): #else */
|
|
/* amalgamation skipped (editor-only): #error "All possible implementations (including fallback) have been disabled! simdjson will not run." */
|
|
/* amalgamation skipped (editor-only): #endif */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_IMPLEMENTATION */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace haswell {
|
|
|
|
struct open_container;
|
|
class dom_parser_implementation;
|
|
|
|
/**
 * The type of a JSON number
 */
enum class number_type {
    floating_point_number=1, ///< a binary64 number
    signed_integer,          ///< a signed integer that fits in a 64-bit word using two's complement
    unsigned_integer,        ///< a positive integer larger or equal to 1<<63
    big_integer              ///< a big integer that does not fit in a 64-bit word
};
|
|
|
|
} // namespace haswell
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_GENERIC_BASE_H
|
|
/* end file simdjson/generic/base.h for haswell */
|
|
/* including simdjson/generic/jsoncharutils.h for haswell: #include "simdjson/generic/jsoncharutils.h" */
|
|
/* begin file simdjson/generic/jsoncharutils.h for haswell */
|
|
#ifndef SIMDJSON_GENERIC_JSONCHARUTILS_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_GENERIC_JSONCHARUTILS_H */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/generic/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/jsoncharutils_tables.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/numberparsing_tables.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace haswell {
|
|
namespace {
|
|
namespace jsoncharutils {
|
|
|
|
// return non-zero if not a structural or whitespace char
|
|
// zero otherwise
|
|
simdjson_inline uint32_t is_not_structural_or_whitespace(uint8_t c) {
|
|
return internal::structural_or_whitespace_negated[c];
|
|
}
|
|
|
|
// Returns the internal::structural_or_whitespace table entry for c
// (presumably non-zero exactly when c is a structural or whitespace char,
// mirroring is_not_structural_or_whitespace; the table is defined elsewhere).
simdjson_inline uint32_t is_structural_or_whitespace(uint8_t c) {
  return internal::structural_or_whitespace[c];
}
|
|
|
|
// returns a value with the high 16 bits set if not valid
|
|
// otherwise returns the conversion of the 4 hex digits at src into the bottom
|
|
// 16 bits of the 32-bit return register
|
|
//
|
|
// see
|
|
// https://lemire.me/blog/2019/04/17/parsing-short-hexadecimal-strings-efficiently/
|
|
static inline uint32_t hex_to_u32_nocheck(
|
|
const uint8_t *src) { // strictly speaking, static inline is a C-ism
|
|
uint32_t v1 = internal::digit_to_val32[630 + src[0]];
|
|
uint32_t v2 = internal::digit_to_val32[420 + src[1]];
|
|
uint32_t v3 = internal::digit_to_val32[210 + src[2]];
|
|
uint32_t v4 = internal::digit_to_val32[0 + src[3]];
|
|
return v1 | v2 | v3 | v4;
|
|
}
|
|
|
|
// given a code point cp, writes to c
|
|
// the utf-8 code, outputting the length in
|
|
// bytes, if the length is zero, the code point
|
|
// is invalid
|
|
//
|
|
// This can possibly be made faster using pdep
|
|
// and clz and table lookups, but JSON documents
|
|
// have few escaped code points, and the following
|
|
// function looks cheap.
|
|
//
|
|
// Note: we assume that surrogates are treated separately
|
|
//
|
|
simdjson_inline size_t codepoint_to_utf8(uint32_t cp, uint8_t *c) {
|
|
if (cp <= 0x7F) {
|
|
c[0] = uint8_t(cp);
|
|
return 1; // ascii
|
|
}
|
|
if (cp <= 0x7FF) {
|
|
c[0] = uint8_t((cp >> 6) + 192);
|
|
c[1] = uint8_t((cp & 63) + 128);
|
|
return 2; // universal plane
|
|
// Surrogates are treated elsewhere...
|
|
//} //else if (0xd800 <= cp && cp <= 0xdfff) {
|
|
// return 0; // surrogates // could put assert here
|
|
} else if (cp <= 0xFFFF) {
|
|
c[0] = uint8_t((cp >> 12) + 224);
|
|
c[1] = uint8_t(((cp >> 6) & 63) + 128);
|
|
c[2] = uint8_t((cp & 63) + 128);
|
|
return 3;
|
|
} else if (cp <= 0x10FFFF) { // if you know you have a valid code point, this
|
|
// is not needed
|
|
c[0] = uint8_t((cp >> 18) + 240);
|
|
c[1] = uint8_t(((cp >> 12) & 63) + 128);
|
|
c[2] = uint8_t(((cp >> 6) & 63) + 128);
|
|
c[3] = uint8_t((cp & 63) + 128);
|
|
return 4;
|
|
}
|
|
// will return 0 when the code point was too large.
|
|
return 0; // bad r
|
|
}
|
|
|
|
#if SIMDJSON_IS_32BITS // _umul128 for x86, arm
// this is a slow emulation routine for 32-bit
//
// Full 32x32 -> 64-bit unsigned product of x and y.
static simdjson_inline uint64_t __emulu(uint32_t x, uint32_t y) {
  return x * (uint64_t)y;
}
// 64x64 -> 128-bit unsigned product built from four 32x32 partial products.
// Writing ab = a*2^32 + b and cd = c*2^32 + d, the product is
//   a*c*2^64 + (a*d + b*c)*2^32 + b*d.
// Returns the low 64 bits and stores the high 64 bits in *hi.
static simdjson_inline uint64_t _umul128(uint64_t ab, uint64_t cd, uint64_t *hi) {
  uint64_t ad = __emulu((uint32_t)(ab >> 32), (uint32_t)cd);
  uint64_t bd = __emulu((uint32_t)ab, (uint32_t)cd);
  uint64_t adbc = ad + __emulu((uint32_t)ab, (uint32_t)(cd >> 32));
  // 1 when the sum a*d + b*c wrapped around 64 bits; that carry is worth 2^32 in *hi
  uint64_t adbc_carry = !!(adbc < ad);
  uint64_t lo = bd + (adbc << 32);
  // the trailing !!(lo < bd) is the carry out of the low 64-bit word
  *hi = __emulu((uint32_t)(ab >> 32), (uint32_t)(cd >> 32)) + (adbc >> 32) +
        (adbc_carry << 32) + !!(lo < bd);
  return lo;
}
#endif
|
|
|
|
} // namespace jsoncharutils
|
|
} // unnamed namespace
|
|
} // namespace haswell
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_GENERIC_JSONCHARUTILS_H
|
|
/* end file simdjson/generic/jsoncharutils.h for haswell */
|
|
/* including simdjson/generic/atomparsing.h for haswell: #include "simdjson/generic/atomparsing.h" */
|
|
/* begin file simdjson/generic/atomparsing.h for haswell */
|
|
#ifndef SIMDJSON_GENERIC_ATOMPARSING_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_GENERIC_ATOMPARSING_H */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/generic/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/generic/jsoncharutils.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
#include <cstring>
|
|
|
|
namespace simdjson {
|
|
namespace haswell {
|
|
namespace {
|
|
/// @private
|
|
namespace atomparsing {
|
|
|
|
// The string_to_uint32 is exclusively used to map literal strings to 32-bit values.
|
|
// We use memcpy instead of a pointer cast to avoid undefined behaviors since we cannot
|
|
// be certain that the character pointer will be properly aligned.
|
|
// You might think that using memcpy makes this function expensive, but you'd be wrong.
|
|
// All decent optimizing compilers (GCC, clang, Visual Studio) will compile string_to_uint32("false");
|
|
// to the compile-time constant 1936482662.
|
|
simdjson_inline uint32_t string_to_uint32(const char* str) { uint32_t val; std::memcpy(&val, str, sizeof(uint32_t)); return val; }
|
|
|
|
|
|
// Again in str4ncmp we use a memcpy to avoid undefined behavior. The memcpy may appear expensive.
|
|
// Yet all decent optimizing compilers will compile memcpy to a single instruction, just about.
|
|
simdjson_warn_unused
|
|
simdjson_inline uint32_t str4ncmp(const uint8_t *src, const char* atom) {
|
|
uint32_t srcval; // we want to avoid unaligned 32-bit loads (undefined in C/C++)
|
|
static_assert(sizeof(uint32_t) <= SIMDJSON_PADDING, "SIMDJSON_PADDING must be larger than 4 bytes");
|
|
std::memcpy(&srcval, src, sizeof(uint32_t));
|
|
return srcval ^ string_to_uint32(atom);
|
|
}
|
|
|
|
// True when src starts with "true" and src[4] is a structural or whitespace
// character. Reads src[0..4], so at least five bytes must be readable.
simdjson_warn_unused
simdjson_inline bool is_valid_true_atom(const uint8_t *src) {
  return (str4ncmp(src, "true") | jsoncharutils::is_not_structural_or_whitespace(src[4])) == 0;
}
|
|
|
|
// Length-aware variant: len is the number of bytes available at src.
//  - len > 4: defers to the unbounded check (which also inspects src[4]);
//  - len == 4: the four bytes must be exactly "true";
//  - len < 4: cannot be the atom.
simdjson_warn_unused
simdjson_inline bool is_valid_true_atom(const uint8_t *src, size_t len) {
  if (len > 4) { return is_valid_true_atom(src); }
  else if (len == 4) { return !str4ncmp(src, "true"); }
  else { return false; }
}
|
|
|
|
// True when src[1..4] is "alse" and src[5] is a structural or whitespace
// character. src[0] is not examined here (presumably the caller has already
// dispatched on the leading 'f' -- confirm at call sites). Reads src[1..5].
simdjson_warn_unused
simdjson_inline bool is_valid_false_atom(const uint8_t *src) {
  return (str4ncmp(src+1, "alse") | jsoncharutils::is_not_structural_or_whitespace(src[5])) == 0;
}
|
|
|
|
// Length-aware variant: len is the number of bytes available at src.
//  - len > 5: defers to the unbounded check (which also inspects src[5]);
//  - len == 5: src[1..4] must be exactly "alse" (src[0] is not examined);
//  - len < 5: cannot be the atom.
simdjson_warn_unused
simdjson_inline bool is_valid_false_atom(const uint8_t *src, size_t len) {
  if (len > 5) { return is_valid_false_atom(src); }
  else if (len == 5) { return !str4ncmp(src+1, "alse"); }
  else { return false; }
}
|
|
|
|
// True when src starts with "null" and src[4] is a structural or whitespace
// character. Reads src[0..4], so at least five bytes must be readable.
simdjson_warn_unused
simdjson_inline bool is_valid_null_atom(const uint8_t *src) {
  return (str4ncmp(src, "null") | jsoncharutils::is_not_structural_or_whitespace(src[4])) == 0;
}
|
|
|
|
// Length-aware variant: len is the number of bytes available at src.
//  - len > 4: defers to the unbounded check (which also inspects src[4]);
//  - len == 4: the four bytes must be exactly "null";
//  - len < 4: cannot be the atom.
simdjson_warn_unused
simdjson_inline bool is_valid_null_atom(const uint8_t *src, size_t len) {
  if (len > 4) { return is_valid_null_atom(src); }
  else if (len == 4) { return !str4ncmp(src, "null"); }
  else { return false; }
}
|
|
|
|
} // namespace atomparsing
|
|
} // unnamed namespace
|
|
} // namespace haswell
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_GENERIC_ATOMPARSING_H
|
|
/* end file simdjson/generic/atomparsing.h for haswell */
|
|
/* including simdjson/generic/dom_parser_implementation.h for haswell: #include "simdjson/generic/dom_parser_implementation.h" */
|
|
/* begin file simdjson/generic/dom_parser_implementation.h for haswell */
|
|
#ifndef SIMDJSON_GENERIC_DOM_PARSER_IMPLEMENTATION_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_GENERIC_DOM_PARSER_IMPLEMENTATION_H */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/generic/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/dom_parser_implementation.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace haswell {
|
|
|
|
// Bookkeeping for one currently-open array or object.
// expectation: sizeof(open_container) = 64/8.
struct open_container {
  uint32_t tape_index; // where, on the tape, does the scope ([,{) begins
  uint32_t count; // how many elements in the scope
}; // struct open_container

static_assert(sizeof(open_container) == 64/8, "Open container must be 64 bits");
|
|
|
|
class dom_parser_implementation final : public internal::dom_parser_implementation {
|
|
public:
|
|
/** Tape location of each open { or [ */
|
|
std::unique_ptr<open_container[]> open_containers{};
|
|
/** Whether each open container is a [ or { */
|
|
std::unique_ptr<bool[]> is_array{};
|
|
/** Buffer passed to stage 1 */
|
|
const uint8_t *buf{};
|
|
/** Length passed to stage 1 */
|
|
size_t len{0};
|
|
/** Document passed to stage 2 */
|
|
dom::document *doc{};
|
|
|
|
inline dom_parser_implementation() noexcept;
|
|
inline dom_parser_implementation(dom_parser_implementation &&other) noexcept;
|
|
inline dom_parser_implementation &operator=(dom_parser_implementation &&other) noexcept;
|
|
dom_parser_implementation(const dom_parser_implementation &) = delete;
|
|
dom_parser_implementation &operator=(const dom_parser_implementation &) = delete;
|
|
|
|
simdjson_warn_unused error_code parse(const uint8_t *buf, size_t len, dom::document &doc) noexcept final;
|
|
simdjson_warn_unused error_code stage1(const uint8_t *buf, size_t len, stage1_mode partial) noexcept final;
|
|
simdjson_warn_unused error_code stage2(dom::document &doc) noexcept final;
|
|
simdjson_warn_unused error_code stage2_next(dom::document &doc) noexcept final;
|
|
simdjson_warn_unused uint8_t *parse_string(const uint8_t *src, uint8_t *dst, bool allow_replacement) const noexcept final;
|
|
simdjson_warn_unused uint8_t *parse_wobbly_string(const uint8_t *src, uint8_t *dst) const noexcept final;
|
|
inline simdjson_warn_unused error_code set_capacity(size_t capacity) noexcept final;
|
|
inline simdjson_warn_unused error_code set_max_depth(size_t max_depth) noexcept final;
|
|
private:
|
|
simdjson_inline simdjson_warn_unused error_code set_capacity_stage1(size_t capacity);
|
|
|
|
};
|
|
|
|
} // namespace haswell
|
|
} // namespace simdjson
|
|
|
|
namespace simdjson {
|
|
namespace haswell {
|
|
|
|
// Default construction and move operations are compiler-generated; they are
// defaulted out of class so the in-class declarations can stay `inline ... noexcept`.
inline dom_parser_implementation::dom_parser_implementation() noexcept = default;
inline dom_parser_implementation::dom_parser_implementation(dom_parser_implementation &&other) noexcept = default;
inline dom_parser_implementation &dom_parser_implementation::operator=(dom_parser_implementation &&other) noexcept = default;
|
|
|
|
// Leaving these here so they can be inlined if so desired
|
|
inline simdjson_warn_unused error_code dom_parser_implementation::set_capacity(size_t capacity) noexcept {
|
|
if(capacity > SIMDJSON_MAXSIZE_BYTES) { return CAPACITY; }
|
|
// Stage 1 index output
|
|
size_t max_structures = SIMDJSON_ROUNDUP_N(capacity, 64) + 2 + 7;
|
|
structural_indexes.reset( new (std::nothrow) uint32_t[max_structures] );
|
|
if (!structural_indexes) { _capacity = 0; return MEMALLOC; }
|
|
structural_indexes[0] = 0;
|
|
n_structural_indexes = 0;
|
|
|
|
_capacity = capacity;
|
|
return SUCCESS;
|
|
}
|
|
|
|
// Allocates the two stage 2 stacks (open_containers and is_array) with
// `max_depth` entries each.
//
// Returns MEMALLOC (with _max_depth reset to 0) when either allocation fails
// and SUCCESS otherwise. Both resets run before the check, so a failure may
// leave one of the two buffers allocated; that is harmless because
// _max_depth == 0 marks them unusable and the unique_ptrs still own them.
inline simdjson_warn_unused error_code dom_parser_implementation::set_max_depth(size_t max_depth) noexcept {
  // Stage 2 stacks
  open_containers.reset(new (std::nothrow) open_container[max_depth]);
  is_array.reset(new (std::nothrow) bool[max_depth]);
  if (!is_array || !open_containers) { _max_depth = 0; return MEMALLOC; }

  _max_depth = max_depth;
  return SUCCESS;
}
|
|
|
|
} // namespace haswell
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_GENERIC_DOM_PARSER_IMPLEMENTATION_H
|
|
/* end file simdjson/generic/dom_parser_implementation.h for haswell */
|
|
/* including simdjson/generic/implementation_simdjson_result_base.h for haswell: #include "simdjson/generic/implementation_simdjson_result_base.h" */
|
|
/* begin file simdjson/generic/implementation_simdjson_result_base.h for haswell */
|
|
#ifndef SIMDJSON_GENERIC_IMPLEMENTATION_SIMDJSON_RESULT_BASE_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_GENERIC_IMPLEMENTATION_SIMDJSON_RESULT_BASE_H */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/generic/base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace haswell {
|
|
|
|
// This is a near copy of include/error.h's implementation_simdjson_result_base, except it doesn't use std::pair
|
|
// so we can avoid inlining errors
|
|
// TODO reconcile these!
|
|
/**
|
|
* The result of a simdjson operation that could fail.
|
|
*
|
|
* Gives the option of reading error codes, or throwing an exception by casting to the desired result.
|
|
*
|
|
* This is a base class for implementations that want to add functions to the result type for
|
|
* chaining.
|
|
*
|
|
* Override like:
|
|
*
|
|
* struct simdjson_result<T> : public internal::implementation_simdjson_result_base<T> {
|
|
* simdjson_result() noexcept : internal::implementation_simdjson_result_base<T>() {}
|
|
* simdjson_result(error_code error) noexcept : internal::implementation_simdjson_result_base<T>(error) {}
|
|
* simdjson_result(T &&value) noexcept : internal::implementation_simdjson_result_base<T>(std::forward<T>(value)) {}
|
|
* simdjson_result(T &&value, error_code error) noexcept : internal::implementation_simdjson_result_base<T>(value, error) {}
|
|
* // Your extra methods here
|
|
* }
|
|
*
|
|
* Then any method returning simdjson_result<T> will be chainable with your methods.
|
|
*/
|
|
template<typename T>
|
|
struct implementation_simdjson_result_base {
|
|
|
|
/**
|
|
* Create a new empty result with error = UNINITIALIZED.
|
|
*/
|
|
simdjson_inline implementation_simdjson_result_base() noexcept = default;
|
|
|
|
/**
|
|
* Create a new error result.
|
|
*/
|
|
simdjson_inline implementation_simdjson_result_base(error_code error) noexcept;
|
|
|
|
/**
|
|
* Create a new successful result.
|
|
*/
|
|
simdjson_inline implementation_simdjson_result_base(T &&value) noexcept;
|
|
|
|
/**
|
|
* Create a new result with both things (use if you don't want to branch when creating the result).
|
|
*/
|
|
simdjson_inline implementation_simdjson_result_base(T &&value, error_code error) noexcept;
|
|
|
|
/**
|
|
* Move the value and the error to the provided variables.
|
|
*
|
|
* @param value The variable to assign the value to. May not be set if there is an error.
|
|
* @param error The variable to assign the error to. Set to SUCCESS if there is no error.
|
|
*/
|
|
simdjson_inline void tie(T &value, error_code &error) && noexcept;
|
|
|
|
/**
|
|
* Move the value to the provided variable.
|
|
*
|
|
* @param value The variable to assign the value to. May not be set if there is an error.
|
|
*/
|
|
simdjson_inline error_code get(T &value) && noexcept;
|
|
|
|
/**
|
|
* The error.
|
|
*/
|
|
simdjson_inline error_code error() const noexcept;
|
|
|
|
#if SIMDJSON_EXCEPTIONS
|
|
|
|
/**
|
|
* Get the result value.
|
|
*
|
|
* @throw simdjson_error if there was an error.
|
|
*/
|
|
simdjson_inline T& value() & noexcept(false);
|
|
|
|
/**
|
|
* Take the result value (move it).
|
|
*
|
|
* @throw simdjson_error if there was an error.
|
|
*/
|
|
simdjson_inline T&& value() && noexcept(false);
|
|
|
|
/**
|
|
* Take the result value (move it).
|
|
*
|
|
* @throw simdjson_error if there was an error.
|
|
*/
|
|
simdjson_inline T&& take_value() && noexcept(false);
|
|
|
|
/**
|
|
* Cast to the value (will throw on error).
|
|
*
|
|
* @throw simdjson_error if there was an error.
|
|
*/
|
|
simdjson_inline operator T&&() && noexcept(false);
|
|
|
|
|
|
#endif // SIMDJSON_EXCEPTIONS
|
|
|
|
/**
|
|
* Get the result value. This function is safe if and only
|
|
* the error() method returns a value that evaluates to false.
|
|
*/
|
|
simdjson_inline const T& value_unsafe() const& noexcept;
|
|
/**
|
|
* Get the result value. This function is safe if and only
|
|
* the error() method returns a value that evaluates to false.
|
|
*/
|
|
simdjson_inline T& value_unsafe() & noexcept;
|
|
/**
|
|
* Take the result value (move it). This function is safe if and only
|
|
* the error() method returns a value that evaluates to false.
|
|
*/
|
|
simdjson_inline T&& value_unsafe() && noexcept;
|
|
protected:
|
|
/** users should never directly access first and second. **/
|
|
T first{}; /** Users should never directly access 'first'. **/
|
|
error_code second{UNINITIALIZED}; /** Users should never directly access 'second'. **/
|
|
}; // struct implementation_simdjson_result_base
|
|
|
|
} // namespace haswell
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_GENERIC_IMPLEMENTATION_SIMDJSON_RESULT_BASE_H
|
|
/* end file simdjson/generic/implementation_simdjson_result_base.h for haswell */
|
|
/* including simdjson/generic/numberparsing.h for haswell: #include "simdjson/generic/numberparsing.h" */
|
|
/* begin file simdjson/generic/numberparsing.h for haswell */
|
|
#ifndef SIMDJSON_GENERIC_NUMBERPARSING_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_GENERIC_NUMBERPARSING_H */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/generic/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/generic/jsoncharutils.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/numberparsing_tables.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
#include <limits>
|
|
#include <ostream>
|
|
#include <cstring>
|
|
|
|
namespace simdjson {
|
|
namespace haswell {
|
|
namespace numberparsing {
|
|
|
|
// Number-parsing result macros.
//
// With JSON_TEST_NUMBERS defined, each macro first reports the parsed value
// (or the failure) together with its source pointer to a found_*() hook --
// those hooks are not defined here -- and then performs the normal action.
// Without it, the SRC argument is ignored and only the normal action remains:
// yield NUMBER_ERROR / BIGINT_ERROR, or append the value through WRITER.
#ifdef JSON_TEST_NUMBERS
#define INVALID_NUMBER(SRC) (found_invalid_number((SRC)), NUMBER_ERROR)
#define WRITE_INTEGER(VALUE, SRC, WRITER) (found_integer((VALUE), (SRC)), (WRITER).append_s64((VALUE)))
#define WRITE_UNSIGNED(VALUE, SRC, WRITER) (found_unsigned_integer((VALUE), (SRC)), (WRITER).append_u64((VALUE)))
#define WRITE_DOUBLE(VALUE, SRC, WRITER) (found_float((VALUE), (SRC)), (WRITER).append_double((VALUE)))
#define BIGINT_NUMBER(SRC) (found_invalid_number((SRC)), BIGINT_ERROR)
#else
#define INVALID_NUMBER(SRC) (NUMBER_ERROR)
#define WRITE_INTEGER(VALUE, SRC, WRITER) (WRITER).append_s64((VALUE))
#define WRITE_UNSIGNED(VALUE, SRC, WRITER) (WRITER).append_u64((VALUE))
#define WRITE_DOUBLE(VALUE, SRC, WRITER) (WRITER).append_double((VALUE))
#define BIGINT_NUMBER(SRC) (BIGINT_ERROR)
#endif
|
|
|
|
namespace {
|
|
|
|
// Convert a mantissa, an exponent and a sign bit into an ieee64 double.
|
|
// The real_exponent needs to be in [0, 2046] (technically real_exponent = 2047 would be acceptable).
|
|
// The mantissa should be in [0,1<<53). The bit at index (1ULL << 52) while be zeroed.
|
|
simdjson_inline double to_double(uint64_t mantissa, uint64_t real_exponent, bool negative) {
|
|
double d;
|
|
mantissa &= ~(1ULL << 52);
|
|
mantissa |= real_exponent << 52;
|
|
mantissa |= ((static_cast<uint64_t>(negative)) << 63);
|
|
std::memcpy(&d, &mantissa, sizeof(d));
|
|
return d;
|
|
}
|
|
|
|
// Attempts to compute i * 10^(power) exactly; and if "negative" is
|
|
// true, negate the result.
|
|
// This function will only work in some cases, when it does not work, success is
|
|
// set to false. This should work *most of the time* (like 99% of the time).
|
|
// We assume that power is in the [smallest_power,
|
|
// largest_power] interval: the caller is responsible for this check.
|
|
simdjson_inline bool compute_float_64(int64_t power, uint64_t i, bool negative, double &d) {
|
|
// we start with a fast path
|
|
// It was described in
|
|
// Clinger WD. How to read floating point numbers accurately.
|
|
// ACM SIGPLAN Notices. 1990
|
|
#ifndef FLT_EVAL_METHOD
|
|
#error "FLT_EVAL_METHOD should be defined, please include cfloat."
|
|
#endif
|
|
#if (FLT_EVAL_METHOD != 1) && (FLT_EVAL_METHOD != 0)
|
|
// We cannot be certain that x/y is rounded to nearest.
|
|
if (0 <= power && power <= 22 && i <= 9007199254740991)
|
|
#else
|
|
if (-22 <= power && power <= 22 && i <= 9007199254740991)
|
|
#endif
|
|
{
|
|
// convert the integer into a double. This is lossless since
|
|
// 0 <= i <= 2^53 - 1.
|
|
d = double(i);
|
|
//
|
|
// The general idea is as follows.
|
|
// If 0 <= s < 2^53 and if 10^0 <= p <= 10^22 then
|
|
// 1) Both s and p can be represented exactly as 64-bit floating-point
|
|
// values
|
|
// (binary64).
|
|
// 2) Because s and p can be represented exactly as floating-point values,
|
|
// then s * p
|
|
// and s / p will produce correctly rounded values.
|
|
//
|
|
if (power < 0) {
|
|
d = d / simdjson::internal::power_of_ten[-power];
|
|
} else {
|
|
d = d * simdjson::internal::power_of_ten[power];
|
|
}
|
|
if (negative) {
|
|
d = -d;
|
|
}
|
|
return true;
|
|
}
|
|
// When 22 < power && power < 22 + 16, we could
|
|
// hope for another, secondary fast path. It was
|
|
// described by David M. Gay in "Correctly rounded
|
|
// binary-decimal and decimal-binary conversions." (1990)
|
|
// If you need to compute i * 10^(22 + x) for x < 16,
|
|
// first compute i * 10^x, if you know that result is exact
|
|
// (e.g., when i * 10^x < 2^53),
|
|
// then you can still proceed and do (i * 10^x) * 10^22.
|
|
// Is this worth your time?
|
|
// You need 22 < power *and* power < 22 + 16 *and* (i * 10^(x-22) < 2^53)
|
|
// for this second fast path to work.
|
|
// If you have 22 < power *and* power < 22 + 16, and then you
|
|
// optimistically compute "i * 10^(x-22)", there is still a chance that you
|
|
// have wasted your time if i * 10^(x-22) >= 2^53. It makes the use cases of
|
|
// this optimization maybe less common than we would like. Source:
|
|
// http://www.exploringbinary.com/fast-path-decimal-to-floating-point-conversion/
|
|
// also used in RapidJSON: https://rapidjson.org/strtod_8h_source.html
|
|
|
|
// The fast path has now failed, so we are failing back on the slower path.
|
|
|
|
// In the slow path, we need to adjust i so that it is > 1<<63 which is always
|
|
// possible, except if i == 0, so we handle i == 0 separately.
|
|
if(i == 0) {
|
|
d = negative ? -0.0 : 0.0;
|
|
return true;
|
|
}
|
|
|
|
|
|
// The exponent is 1024 + 63 + power
|
|
// + floor(log(5**power)/log(2)).
|
|
// The 1024 comes from the ieee64 standard.
|
|
// The 63 comes from the fact that we use a 64-bit word.
|
|
//
|
|
// Computing floor(log(5**power)/log(2)) could be
|
|
// slow. Instead we use a fast function.
|
|
//
|
|
// For power in (-400,350), we have that
|
|
// (((152170 + 65536) * power ) >> 16);
|
|
// is equal to
|
|
// floor(log(5**power)/log(2)) + power when power >= 0
|
|
// and it is equal to
|
|
// ceil(log(5**-power)/log(2)) + power when power < 0
|
|
//
|
|
// The 65536 is (1<<16) and corresponds to
|
|
// (65536 * power) >> 16 ---> power
|
|
//
|
|
// ((152170 * power ) >> 16) is equal to
|
|
// floor(log(5**power)/log(2))
|
|
//
|
|
// Note that this is not magic: 152170/(1<<16) is
|
|
// approximatively equal to log(5)/log(2).
|
|
// The 1<<16 value is a power of two; we could use a
|
|
// larger power of 2 if we wanted to.
|
|
//
|
|
int64_t exponent = (((152170 + 65536) * power) >> 16) + 1024 + 63;
|
|
|
|
|
|
// We want the most significant bit of i to be 1. Shift if needed.
|
|
int lz = leading_zeroes(i);
|
|
i <<= lz;
|
|
|
|
|
|
// We are going to need to do some 64-bit arithmetic to get a precise product.
|
|
// We use a table lookup approach.
|
|
// It is safe because
|
|
// power >= smallest_power
|
|
// and power <= largest_power
|
|
// We recover the mantissa of the power, it has a leading 1. It is always
|
|
// rounded down.
|
|
//
|
|
// We want the most significant 64 bits of the product. We know
|
|
// this will be non-zero because the most significant bit of i is
|
|
// 1.
|
|
const uint32_t index = 2 * uint32_t(power - simdjson::internal::smallest_power);
|
|
// Optimization: It may be that materializing the index as a variable might confuse some compilers and prevent effective complex-addressing loads. (Done for code clarity.)
|
|
//
|
|
// The full_multiplication function computes the 128-bit product of two 64-bit words
|
|
// with a returned value of type value128 with a "low component" corresponding to the
|
|
// 64-bit least significant bits of the product and with a "high component" corresponding
|
|
// to the 64-bit most significant bits of the product.
|
|
simdjson::internal::value128 firstproduct = full_multiplication(i, simdjson::internal::power_of_five_128[index]);
|
|
// Both i and power_of_five_128[index] have their most significant bit set to 1 which
|
|
// implies that either the most or the second most significant bit of the product
|
|
// is 1. We pack values in this manner for efficiency reasons: it maximizes the use
|
|
// we make of the product. It also makes it easy to reason about the product: there
|
|
// is 0 or 1 leading zero in the product.
|
|
|
|
// Unless the least significant 9 bits of the high (64-bit) part of the full
|
|
// product are all 1s, then we know that the most significant 55 bits are
|
|
// exact and no further work is needed. Having 55 bits is necessary because
|
|
// we need 53 bits for the mantissa but we have to have one rounding bit and
|
|
// we can waste a bit if the most significant bit of the product is zero.
|
|
if((firstproduct.high & 0x1FF) == 0x1FF) {
|
|
// We want to compute i * 5^q, but only care about the top 55 bits at most.
|
|
// Consider the scenario where q>=0. Then 5^q may not fit in 64-bits. Doing
|
|
// the full computation is wasteful. So we do what is called a "truncated
|
|
// multiplication".
|
|
// We take the most significant 64-bits, and we put them in
|
|
// power_of_five_128[index]. Usually, that's good enough to approximate i * 5^q
|
|
// to the desired approximation using one multiplication. Sometimes it does not suffice.
|
|
// Then we store the next most significant 64 bits in power_of_five_128[index + 1], and
|
|
// then we get a better approximation to i * 5^q.
|
|
//
|
|
// That's for when q>=0. The logic for q<0 is somewhat similar but it is somewhat
|
|
// more complicated.
|
|
//
|
|
// There is an extra layer of complexity in that we need more than 55 bits of
|
|
// accuracy in the round-to-even scenario.
|
|
//
|
|
// The full_multiplication function computes the 128-bit product of two 64-bit words
|
|
// with a returned value of type value128 with a "low component" corresponding to the
|
|
// 64-bit least significant bits of the product and with a "high component" corresponding
|
|
// to the 64-bit most significant bits of the product.
|
|
simdjson::internal::value128 secondproduct = full_multiplication(i, simdjson::internal::power_of_five_128[index + 1]);
|
|
firstproduct.low += secondproduct.high;
|
|
if(secondproduct.high > firstproduct.low) { firstproduct.high++; }
|
|
// As it has been proven by Noble Mushtak and Daniel Lemire in "Fast Number Parsing Without
|
|
// Fallback" (https://arxiv.org/abs/2212.06644), at this point we are sure that the product
|
|
// is sufficiently accurate, and more computation is not needed.
|
|
}
|
|
uint64_t lower = firstproduct.low;
|
|
uint64_t upper = firstproduct.high;
|
|
// The final mantissa should be 53 bits with a leading 1.
|
|
// We shift it so that it occupies 54 bits with a leading 1.
|
|
///////
|
|
uint64_t upperbit = upper >> 63;
|
|
uint64_t mantissa = upper >> (upperbit + 9);
|
|
lz += int(1 ^ upperbit);
|
|
|
|
// Here we have mantissa < (1<<54).
|
|
int64_t real_exponent = exponent - lz;
|
|
if (simdjson_unlikely(real_exponent <= 0)) { // we have a subnormal?
|
|
// Here have that real_exponent <= 0 so -real_exponent >= 0
|
|
if(-real_exponent + 1 >= 64) { // if we have more than 64 bits below the minimum exponent, you have a zero for sure.
|
|
d = negative ? -0.0 : 0.0;
|
|
return true;
|
|
}
|
|
// next line is safe because -real_exponent + 1 < 64
|
|
mantissa >>= -real_exponent + 1;
|
|
// Thankfully, we can't have both "round-to-even" and subnormals because
|
|
// "round-to-even" only occurs for powers close to 0.
|
|
mantissa += (mantissa & 1); // round up
|
|
mantissa >>= 1;
|
|
// There is a weird scenario where we don't have a subnormal but just.
|
|
// Suppose we start with 2.2250738585072013e-308, we end up
|
|
// with 0x3fffffffffffff x 2^-1023-53 which is technically subnormal
|
|
// whereas 0x40000000000000 x 2^-1023-53 is normal. Now, we need to round
|
|
// up 0x3fffffffffffff x 2^-1023-53 and once we do, we are no longer
|
|
// subnormal, but we can only know this after rounding.
|
|
// So we only declare a subnormal if we are smaller than the threshold.
|
|
real_exponent = (mantissa < (uint64_t(1) << 52)) ? 0 : 1;
|
|
d = to_double(mantissa, real_exponent, negative);
|
|
return true;
|
|
}
|
|
// We have to round to even. The "to even" part
|
|
// is only a problem when we are right in between two floats
|
|
// which we guard against.
|
|
// If we have lots of trailing zeros, we may fall right between two
|
|
// floating-point values.
|
|
//
|
|
// The round-to-even cases take the form of a number 2m+1 which is in (2^53,2^54]
|
|
// times a power of two. That is, it is right between a number with binary significand
|
|
// m and another number with binary significand m+1; and it must be the case
|
|
// that it cannot be represented by a float itself.
|
|
//
|
|
// We must have that w * 10 ^q == (2m+1) * 2^p for some power of two 2^p.
|
|
// Recall that 10^q = 5^q * 2^q.
|
|
// When q >= 0, we must have that (2m+1) is divisible by 5^q, so 5^q <= 2^54. We have that
|
|
// 5^23 <= 2^54 and it is the last power of five to qualify, so q <= 23.
|
|
// When q<0, we have w >= (2m+1) x 5^{-q}. We must have that w<2^{64} so
|
|
// (2m+1) x 5^{-q} < 2^{64}. We have that 2m+1>2^{53}. Hence, we must have
|
|
// 2^{53} x 5^{-q} < 2^{64}.
|
|
// Hence we have 5^{-q} < 2^{11} or q >= -4.
|
|
//
|
|
// We require lower <= 1 and not lower == 0 because we could not prove that
|
|
// that lower == 0 is implied; but we could prove that lower <= 1 is a necessary and sufficient test.
|
|
if (simdjson_unlikely((lower <= 1) && (power >= -4) && (power <= 23) && ((mantissa & 3) == 1))) {
|
|
if((mantissa << (upperbit + 64 - 53 - 2)) == upper) {
|
|
mantissa &= ~1; // flip it so that we do not round up
|
|
}
|
|
}
|
|
|
|
mantissa += mantissa & 1;
|
|
mantissa >>= 1;
|
|
|
|
// Here we have mantissa < (1<<53), unless there was an overflow
|
|
if (mantissa >= (1ULL << 53)) {
|
|
//////////
|
|
// This will happen when parsing values such as 7.2057594037927933e+16
|
|
////////
|
|
mantissa = (1ULL << 52);
|
|
real_exponent++;
|
|
}
|
|
mantissa &= ~(1ULL << 52);
|
|
// we have to check that real_exponent is in range, otherwise we bail out
|
|
if (simdjson_unlikely(real_exponent > 2046)) {
|
|
// We have an infinite value!!! We could actually throw an error here if we could.
|
|
return false;
|
|
}
|
|
d = to_double(mantissa, real_exponent, negative);
|
|
return true;
|
|
}
|
|
|
|
// We call a fallback floating-point parser that might be slow. Note
|
|
// it will accept JSON numbers, but the JSON spec. is more restrictive so
|
|
// before you call parse_float_fallback, you need to have validated the input
|
|
// string with the JSON grammar.
|
|
// It will return an error (false) if the parsed number is infinite.
|
|
// The string parsing itself always succeeds. We know that there is at least
|
|
// one digit.
|
|
static bool parse_float_fallback(const uint8_t *ptr, double *outDouble) {
|
|
*outDouble = simdjson::internal::from_chars(reinterpret_cast<const char *>(ptr));
|
|
// We do not accept infinite values.
|
|
|
|
// Detecting finite values in a portable manner is ridiculously hard, ideally
|
|
// we would want to do:
|
|
// return !std::isfinite(*outDouble);
|
|
// but that mysteriously fails under legacy/old libc++ libraries, see
|
|
// https://github.com/simdjson/simdjson/issues/1286
|
|
//
|
|
// Therefore, fall back to this solution (the extra parens are there
|
|
// to handle that max may be a macro on windows).
|
|
return !(*outDouble > (std::numeric_limits<double>::max)() || *outDouble < std::numeric_limits<double>::lowest());
|
|
}
|
|
|
|
// Bounded variant of the slow-path floating-point parser: the text to parse
// is delimited by [ptr, end_ptr), both of which are forwarded to from_chars.
//
// The parsed value is always stored in *outDouble. The return value is false
// (an error) when that value is infinite, true otherwise.
static bool parse_float_fallback(const uint8_t *ptr, const uint8_t *end_ptr, double *outDouble) {
  const char *const first = reinterpret_cast<const char *>(ptr);
  const char *const last = reinterpret_cast<const char *>(end_ptr);
  const double parsed = simdjson::internal::from_chars(first, last);
  *outDouble = parsed;
  // We do not accept infinite values.
  //
  // Detecting finite values in a portable manner is ridiculously hard. Ideally
  // we would rely on std::isfinite, but that mysteriously fails under
  // legacy/old libc++ libraries, see
  // https://github.com/simdjson/simdjson/issues/1286
  //
  // Therefore we compare against the finite extremes instead (the extra
  // parentheses around max are there because max may be a macro on windows).
  const bool above_max = parsed > (std::numeric_limits<double>::max)();
  const bool below_lowest = parsed < std::numeric_limits<double>::lowest();
  return !(above_max || below_lowest);
}
|
|
|
|
// check quickly whether the next 8 chars are made of digits
|
|
// at a glance, it looks better than Mula's
|
|
// http://0x80.pl/articles/swar-digits-validate.html
|
|
simdjson_inline bool is_made_of_eight_digits_fast(const uint8_t *chars) {
|
|
uint64_t val;
|
|
// this can read up to 7 bytes beyond the buffer size, but we require
|
|
// SIMDJSON_PADDING of padding
|
|
static_assert(7 <= SIMDJSON_PADDING, "SIMDJSON_PADDING must be bigger than 7");
|
|
std::memcpy(&val, chars, 8);
|
|
// a branchy method might be faster:
|
|
// return (( val & 0xF0F0F0F0F0F0F0F0 ) == 0x3030303030303030)
|
|
// && (( (val + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0 ) ==
|
|
// 0x3030303030303030);
|
|
return (((val & 0xF0F0F0F0F0F0F0F0) |
|
|
(((val + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0) >> 4)) ==
|
|
0x3333333333333333);
|
|
}
|
|
|
|
template<typename I>
|
|
SIMDJSON_NO_SANITIZE_UNDEFINED // We deliberately allow overflow here and check later
|
|
simdjson_inline bool parse_digit(const uint8_t c, I &i) {
|
|
const uint8_t digit = static_cast<uint8_t>(c - '0');
|
|
if (digit > 9) {
|
|
return false;
|
|
}
|
|
// PERF NOTE: multiplication by 10 is cheaper than arbitrary integer multiplication
|
|
i = 10 * i + digit; // might overflow, we will handle the overflow later
|
|
return true;
|
|
}
|
|
|
|
// Returns true when c is one of the ASCII characters '0' through '9'.
simdjson_inline bool is_digit(const uint8_t c) {
  return (c >= '0') && (c <= '9');
}
|
|
|
|
// Consumes the digits found at p (the caller has already stepped over the
// decimal separator) and advances p to the first non-digit character.
//
// We continue with the fiction that we have an integer: the digits are
// appended to i, and exponent is set to minus the number of digits consumed.
// If the floating point number is representable as x * 10^z for some integer
// x that fits in 53 bits, then we will be able to convert the integer back
// into a float in a lossless manner.
//
// Returns INVALID_NUMBER(src) when no digit was found, SUCCESS otherwise.
simdjson_inline error_code parse_decimal_after_separator(simdjson_unused const uint8_t *const src, const uint8_t *&p, uint64_t &i, int64_t &exponent) {
  const uint8_t *const fraction_begin = p;

#ifdef SIMDJSON_SWAR_NUMBER_PARSING
#if SIMDJSON_SWAR_NUMBER_PARSING
  // Swallow eight digits at once: this helps if we have lots of decimals,
  // which turns out to be frequent enough.
  if (is_made_of_eight_digits_fast(p)) {
    i = i * 100000000 + parse_eight_digits_unrolled(p);
    p += 8;
  }
#endif // SIMDJSON_SWAR_NUMBER_PARSING
#endif // #ifdef SIMDJSON_SWAR_NUMBER_PARSING
  // Unrolling the first digit makes a small difference on some implementations (e.g. westmere)
  if (parse_digit(*p, i)) { ++p; }
  while (parse_digit(*p, i)) { ++p; }
  exponent = fraction_begin - p;
  // Decimal without digits (123.) is illegal
  if (p == fraction_begin) {
    return INVALID_NUMBER(src);
  }
  return SUCCESS;
}
|
|
|
|
// Parses the exponent that follows 'e' or 'E' (the caller has already stepped
// over that letter): an optional sign followed by at least one digit. The
// signed value is added to exponent and p is advanced past the last digit.
//
// Returns INVALID_NUMBER(src) when there is no digit, SUCCESS otherwise.
simdjson_inline error_code parse_exponent(simdjson_unused const uint8_t *const src, const uint8_t *&p, int64_t &exponent) {
  // Exp Sign: -123.456e[-]78
  const bool negative_exponent = ('-' == *p);
  if (negative_exponent || ('+' == *p)) { ++p; } // a '+' sign is skipped as well

  // Exponent: -123.456e-[78]
  const uint8_t *first_digit = p;
  int64_t magnitude = 0;
  while (parse_digit(*p, magnitude)) { ++p; }
  // parse_digit may have overflowed. In particular, the result could be
  // INT64_MIN, which cannot be negated. Thus we *must* rule out overflow
  // before we negate magnitude.

  // Performance notes: it may seem like combining the two "simdjson_unlikely checks" below into
  // a single simdjson_unlikely path would be faster. The reasoning is sound, but the compiler may
  // not oblige and may, in fact, generate two distinct paths in any case. It might be
  // possible to do uint64_t(p - first_digit - 1) >= 18 but it could end up trading off
  // instructions for a simdjson_likely branch, an inconclusive gain.

  // No digit at all: that is an error.
  if (simdjson_unlikely(p == first_digit)) {
    return INVALID_NUMBER(src);
  }
  // From here on magnitude holds a valid positive exponent, unless it overflowed.

  // With more than 18 digits, the integer may have overflowed.
  if (simdjson_unlikely(p > first_digit+18)) {
    // Skip leading zeroes: 1e000000000000000000001 is technically valid and does not overflow
    while (*first_digit == '0') { ++first_digit; }
    // 19 significant digits could overflow int64_t and is kind of absurd anyway.
    // We don't support exponents smaller than -999,999,999,999,999,999 and
    // bigger than 999,999,999,999,999,999, so we can truncate.
    // Note that 999999999999999999 is assuredly too large. The maximal ieee64 value before
    // infinity is ~1.8e308. The smallest subnormal is ~5e-324. So, actually, we could
    // truncate at 324.
    // Note that there is no reason to fail per se at this point in time.
    // E.g., 0e999999999999999999999 is a fine number.
    if (p > first_digit+18) { magnitude = 999999999999999999; }
  }
  // At this point, magnitude is a sane, positive, signed integer that is
  // <= 999,999,999,999,999,999. As long as 'exponent' is in
  // [-8223372036854775808, 8223372036854775808], we won't overflow. Because 'exponent'
  // is bounded in magnitude by the size of the JSON input, we are fine in this universe.
  // To sum it up: the addition below should never overflow.
  const int64_t signed_exponent = negative_exponent ? -magnitude : magnitude;
  exponent += signed_exponent;
  return SUCCESS;
}
|
|
|
|
// Scans at most max_length bytes starting at src and reports whether the text
// has the shape of an integer: an optional '-', then either a single '0' or a
// run of digits, after which we must either have reached the end of the range
// or be looking at a structural or whitespace character.
simdjson_inline bool check_if_integer(const uint8_t *const src, size_t max_length) {
  const uint8_t *const srcend = src + max_length;
  const bool negative = (*src == '-'); // we can always read at least one character after the '-'
  const uint8_t *cursor = src + uint8_t(negative);
  if (cursor == srcend) { return false; }
  if (*cursor == '0') {
    // A leading zero stands alone: only that one character is consumed.
    ++cursor;
  } else {
    while ((cursor != srcend) && is_digit(*cursor)) { ++cursor; }
  }
  // Whatever follows must be the end of the range or a legal terminator.
  return (cursor == srcend) || !jsoncharutils::is_not_structural_or_whitespace(*cursor);
}
|
|
|
|
// Returns digit_count minus the number of leading '0' and '.' characters found
// at start_digits. It is possible that the integer had an overflow, and we
// have to handle the case where we have 0.0000somenumber: those leading
// characters must not count against the digit budget.
simdjson_inline size_t significant_digits(const uint8_t * start_digits, size_t digit_count) {
  size_t skipped = 0;
  while ((start_digits[skipped] == '0') || (start_digits[skipped] == '.')) { ++skipped; }
  // we over-decrement by one when there is a '.'
  return digit_count - skipped;
}
|
|
|
|
} // unnamed namespace
|
|
|
|
/** @private */
|
|
static error_code slow_float_parsing(simdjson_unused const uint8_t * src, double* answer) {
|
|
if (parse_float_fallback(src, answer)) {
|
|
return SUCCESS;
|
|
}
|
|
return INVALID_NUMBER(src);
|
|
}
|
|
|
|
/** @private */
|
|
template<typename W>
|
|
simdjson_inline error_code write_float(const uint8_t *const src, bool negative, uint64_t i, const uint8_t * start_digits, size_t digit_count, int64_t exponent, W &writer) {
|
|
// If we frequently had to deal with long strings of digits,
|
|
// we could extend our code by using a 128-bit integer instead
|
|
// of a 64-bit integer. However, this is uncommon in practice.
|
|
//
|
|
// 9999999999999999999 < 2**64 so we can accommodate 19 digits.
|
|
// If we have a decimal separator, then digit_count - 1 is the number of digits, but we
|
|
// may not have a decimal separator!
|
|
if (simdjson_unlikely(digit_count > 19 && significant_digits(start_digits, digit_count) > 19)) {
|
|
// Ok, chances are good that we had an overflow!
|
|
// this is almost never going to get called!!!
|
|
// we start anew, going slowly!!!
|
|
// This will happen in the following examples:
|
|
// 10000000000000000000000000000000000000000000e+308
|
|
// 3.1415926535897932384626433832795028841971693993751
|
|
//
|
|
// NOTE: We do not pass a reference to the to slow_float_parsing. If we passed our writer
|
|
// reference to it, it would force it to be stored in memory, preventing the compiler from
|
|
// picking it apart and putting into registers. i.e. if we pass it as reference,
|
|
// it gets slow.
|
|
double d;
|
|
error_code error = slow_float_parsing(src, &d);
|
|
writer.append_double(d);
|
|
return error;
|
|
}
|
|
// NOTE: it's weird that the simdjson_unlikely() only wraps half the if, but it seems to get slower any other
|
|
// way we've tried: https://github.com/simdjson/simdjson/pull/990#discussion_r448497331
|
|
// To future reader: we'd love if someone found a better way, or at least could explain this result!
|
|
if (simdjson_unlikely(exponent < simdjson::internal::smallest_power) || (exponent > simdjson::internal::largest_power)) {
|
|
//
|
|
// Important: smallest_power is such that it leads to a zero value.
|
|
// Observe that 18446744073709551615e-343 == 0, i.e. (2**64 - 1) e -343 is zero
|
|
// so something x 10^-343 goes to zero, but not so with something x 10^-342.
|
|
static_assert(simdjson::internal::smallest_power <= -342, "smallest_power is not small enough");
|
|
//
|
|
if((exponent < simdjson::internal::smallest_power) || (i == 0)) {
|
|
// E.g. Parse "-0.0e-999" into the same value as "-0.0". See https://en.wikipedia.org/wiki/Signed_zero
|
|
WRITE_DOUBLE(negative ? -0.0 : 0.0, src, writer);
|
|
return SUCCESS;
|
|
} else { // (exponent > largest_power) and (i != 0)
|
|
// We have, for sure, an infinite value and simdjson refuses to parse infinite values.
|
|
return INVALID_NUMBER(src);
|
|
}
|
|
}
|
|
double d;
|
|
if (!compute_float_64(exponent, i, negative, d)) {
|
|
// we are almost never going to get here.
|
|
if (!parse_float_fallback(src, &d)) { return INVALID_NUMBER(src); }
|
|
}
|
|
WRITE_DOUBLE(d, src, writer);
|
|
return SUCCESS;
|
|
}
|
|
|
|
// parse the number at src
|
|
// define JSON_TEST_NUMBERS for unit testing
|
|
//
|
|
// It is assumed that the number is followed by a structural ({,},],[) character
|
|
// or a white space character. If that is not the case (e.g., when the JSON
|
|
// document is made of a single number), then it is necessary to copy the
|
|
// content and append a space before calling this function.
|
|
//
|
|
// Our objective is accurate parsing (ULP of 0) at high speed.
|
|
// Forward declaration. The definition follows further down in this file: a stub
// that always writes zero when SIMDJSON_SKIPNUMBERPARSING is defined, and the
// actual number parser otherwise.
template<typename W>
|
|
simdjson_inline error_code parse_number(const uint8_t *const src, W &writer);
|
|
|
|
// for performance analysis, it is sometimes useful to skip parsing
|
|
#ifdef SIMDJSON_SKIPNUMBERPARSING
|
|
|
|
template<typename W>
|
|
simdjson_inline error_code parse_number(const uint8_t *const, W &writer) {
|
|
writer.append_s64(0); // always write zero
|
|
return SUCCESS; // always succeeds
|
|
}
|
|
|
|
// Stand-ins used when SIMDJSON_SKIPNUMBERPARSING is defined. None of them
// looks at its input: the parsers return zero, the predicates return false and
// the number type is reported as a signed integer.
simdjson_unused simdjson_inline simdjson_result<uint64_t> parse_unsigned(const uint8_t * const src) noexcept {
  return 0;
}

simdjson_unused simdjson_inline simdjson_result<int64_t> parse_integer(const uint8_t * const src) noexcept {
  return 0;
}

simdjson_unused simdjson_inline simdjson_result<double> parse_double(const uint8_t * const src) noexcept {
  return 0;
}

simdjson_unused simdjson_inline simdjson_result<uint64_t> parse_unsigned_in_string(const uint8_t * const src) noexcept {
  return 0;
}

simdjson_unused simdjson_inline simdjson_result<int64_t> parse_integer_in_string(const uint8_t * const src) noexcept {
  return 0;
}

simdjson_unused simdjson_inline simdjson_result<double> parse_double_in_string(const uint8_t * const src) noexcept {
  return 0;
}

simdjson_unused simdjson_inline bool is_negative(const uint8_t * src) noexcept {
  return false;
}

simdjson_unused simdjson_inline simdjson_result<bool> is_integer(const uint8_t * src) noexcept {
  return false;
}

simdjson_unused simdjson_inline simdjson_result<number_type> get_number_type(const uint8_t * src) noexcept {
  return number_type::signed_integer;
}
|
|
#else
|
|
|
|
// parse the number at src
|
|
// define JSON_TEST_NUMBERS for unit testing
|
|
//
|
|
// It is assumed that the number is followed by a structural ({,},],[) character
|
|
// or a white space character. If that is not the case (e.g., when the JSON
|
|
// document is made of a single number), then it is necessary to copy the
|
|
// content and append a space before calling this function.
|
|
//
|
|
// Our objective is accurate parsing (ULP of 0) at high speed.
|
|
template<typename W>
|
|
simdjson_inline error_code parse_number(const uint8_t *const src, W &writer) {
|
|
|
|
//
|
|
// Check for minus sign
|
|
//
|
|
bool negative = (*src == '-');
|
|
const uint8_t *p = src + uint8_t(negative);
|
|
|
|
//
|
|
// Parse the integer part.
|
|
//
|
|
// PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
|
|
const uint8_t *const start_digits = p;
|
|
uint64_t i = 0;
|
|
while (parse_digit(*p, i)) { p++; }
|
|
|
|
// If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
|
|
// Optimization note: size_t is expected to be unsigned.
|
|
size_t digit_count = size_t(p - start_digits);
|
|
if (digit_count == 0 || ('0' == *start_digits && digit_count > 1)) { return INVALID_NUMBER(src); }
|
|
|
|
//
|
|
// Handle floats if there is a . or e (or both)
|
|
//
|
|
int64_t exponent = 0;
|
|
bool is_float = false;
|
|
if ('.' == *p) {
|
|
is_float = true;
|
|
++p;
|
|
SIMDJSON_TRY( parse_decimal_after_separator(src, p, i, exponent) );
|
|
digit_count = int(p - start_digits); // used later to guard against overflows
|
|
}
|
|
if (('e' == *p) || ('E' == *p)) {
|
|
is_float = true;
|
|
++p;
|
|
SIMDJSON_TRY( parse_exponent(src, p, exponent) );
|
|
}
|
|
if (is_float) {
|
|
const bool dirty_end = jsoncharutils::is_not_structural_or_whitespace(*p);
|
|
SIMDJSON_TRY( write_float(src, negative, i, start_digits, digit_count, exponent, writer) );
|
|
if (dirty_end) { return INVALID_NUMBER(src); }
|
|
return SUCCESS;
|
|
}
|
|
|
|
// The longest negative 64-bit number is 19 digits.
|
|
// The longest positive 64-bit number is 20 digits.
|
|
// We do it this way so we don't trigger this branch unless we must.
|
|
size_t longest_digit_count = negative ? 19 : 20;
|
|
if (digit_count > longest_digit_count) { return BIGINT_NUMBER(src); }
|
|
if (digit_count == longest_digit_count) {
|
|
if (negative) {
|
|
// Anything negative above INT64_MAX+1 is invalid
|
|
if (i > uint64_t(INT64_MAX)+1) { return BIGINT_NUMBER(src); }
|
|
WRITE_INTEGER(~i+1, src, writer);
|
|
if (jsoncharutils::is_not_structural_or_whitespace(*p)) { return INVALID_NUMBER(src); }
|
|
return SUCCESS;
|
|
// Positive overflow check:
|
|
// - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the
|
|
// biggest uint64_t.
|
|
// - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX.
|
|
// If we got here, it's a 20 digit number starting with the digit "1".
|
|
// - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller
|
|
// than 1,553,255,926,290,448,384.
|
|
// - That is smaller than the smallest possible 20-digit number the user could write:
|
|
// 10,000,000,000,000,000,000.
|
|
// - Therefore, if the number is positive and lower than that, it's overflow.
|
|
// - The value we are looking at is less than or equal to INT64_MAX.
|
|
//
|
|
} else if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INVALID_NUMBER(src); }
|
|
}
|
|
|
|
// Write unsigned if it does not fit in a signed integer.
|
|
if (i > uint64_t(INT64_MAX)) {
|
|
WRITE_UNSIGNED(i, src, writer);
|
|
} else {
|
|
WRITE_INTEGER(negative ? (~i+1) : i, src, writer);
|
|
}
|
|
if (jsoncharutils::is_not_structural_or_whitespace(*p)) { return INVALID_NUMBER(src); }
|
|
return SUCCESS;
|
|
}
|
|
|
|
// Inlineable functions
|
|
namespace {
|
|
|
|
// This table can be used to characterize the final character of an integer
|
|
// string. For JSON structural character and allowable white space characters,
|
|
// we return SUCCESS. For 'e', '.' and 'E', we return INCORRECT_TYPE. Otherwise
|
|
// we return NUMBER_ERROR.
|
|
// Optimization note: we could easily reduce the size of the table by half (to 128)
|
|
// at the cost of an extra branch.
|
|
// Optimization note: we want the values to use at most 8 bits (not, e.g., 32 bits):
|
|
static_assert(error_code(uint8_t(NUMBER_ERROR))== NUMBER_ERROR, "bad NUMBER_ERROR cast");
|
|
static_assert(error_code(uint8_t(SUCCESS))== SUCCESS, "bad NUMBER_ERROR cast");
|
|
static_assert(error_code(uint8_t(INCORRECT_TYPE))== INCORRECT_TYPE, "bad NUMBER_ERROR cast");
|
|
|
|
const uint8_t integer_string_finisher[256] = {
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, SUCCESS,
|
|
SUCCESS, NUMBER_ERROR, NUMBER_ERROR, SUCCESS, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, SUCCESS, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, SUCCESS,
|
|
NUMBER_ERROR, INCORRECT_TYPE, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, SUCCESS, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, INCORRECT_TYPE,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, SUCCESS, NUMBER_ERROR, SUCCESS, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, INCORRECT_TYPE, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, SUCCESS, NUMBER_ERROR,
|
|
SUCCESS, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR};
|
|
|
|
// Parse any number from 0 to 18,446,744,073,709,551,615
|
|
simdjson_unused simdjson_inline simdjson_result<uint64_t> parse_unsigned(const uint8_t * const src) noexcept {
|
|
const uint8_t *p = src;
|
|
//
|
|
// Parse the integer part.
|
|
//
|
|
// PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
|
|
const uint8_t *const start_digits = p;
|
|
uint64_t i = 0;
|
|
while (parse_digit(*p, i)) { p++; }
|
|
|
|
// If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
|
|
// Optimization note: size_t is expected to be unsigned.
|
|
size_t digit_count = size_t(p - start_digits);
|
|
// The longest positive 64-bit number is 20 digits.
|
|
// We do it this way so we don't trigger this branch unless we must.
|
|
// Optimization note: the compiler can probably merge
|
|
// ((digit_count == 0) || (digit_count > 20))
|
|
// into a single branch since digit_count is unsigned.
|
|
if ((digit_count == 0) || (digit_count > 20)) { return INCORRECT_TYPE; }
|
|
// Here digit_count > 0.
|
|
if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; }
|
|
// We can do the following...
|
|
// if (!jsoncharutils::is_structural_or_whitespace(*p)) {
|
|
// return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR;
|
|
// }
|
|
// as a single table lookup:
|
|
if (integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); }
|
|
|
|
if (digit_count == 20) {
|
|
// Positive overflow check:
|
|
// - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the
|
|
// biggest uint64_t.
|
|
// - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX.
|
|
// If we got here, it's a 20 digit number starting with the digit "1".
|
|
// - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller
|
|
// than 1,553,255,926,290,448,384.
|
|
// - That is smaller than the smallest possible 20-digit number the user could write:
|
|
// 10,000,000,000,000,000,000.
|
|
// - Therefore, if the number is positive and lower than that, it's overflow.
|
|
// - The value we are looking at is less than or equal to INT64_MAX.
|
|
//
|
|
if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INCORRECT_TYPE; }
|
|
}
|
|
|
|
return i;
|
|
}
|
|
|
|
|
|
// Parse any number from 0 to 18,446,744,073,709,551,615
|
|
// Never read at src_end or beyond
|
|
simdjson_unused simdjson_inline simdjson_result<uint64_t> parse_unsigned(const uint8_t * const src, const uint8_t * const src_end) noexcept {
|
|
const uint8_t *p = src;
|
|
//
|
|
// Parse the integer part.
|
|
//
|
|
// PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
|
|
const uint8_t *const start_digits = p;
|
|
uint64_t i = 0;
|
|
while ((p != src_end) && parse_digit(*p, i)) { p++; }
|
|
|
|
// If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
|
|
// Optimization note: size_t is expected to be unsigned.
|
|
size_t digit_count = size_t(p - start_digits);
|
|
// The longest positive 64-bit number is 20 digits.
|
|
// We do it this way so we don't trigger this branch unless we must.
|
|
// Optimization note: the compiler can probably merge
|
|
// ((digit_count == 0) || (digit_count > 20))
|
|
// into a single branch since digit_count is unsigned.
|
|
if ((digit_count == 0) || (digit_count > 20)) { return INCORRECT_TYPE; }
|
|
// Here digit_count > 0.
|
|
if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; }
|
|
// We can do the following...
|
|
// if (!jsoncharutils::is_structural_or_whitespace(*p)) {
|
|
// return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR;
|
|
// }
|
|
// as a single table lookup:
|
|
if ((p != src_end) && integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); }
|
|
|
|
if (digit_count == 20) {
|
|
// Positive overflow check:
|
|
// - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the
|
|
// biggest uint64_t.
|
|
// - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX.
|
|
// If we got here, it's a 20 digit number starting with the digit "1".
|
|
// - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller
|
|
// than 1,553,255,926,290,448,384.
|
|
// - That is smaller than the smallest possible 20-digit number the user could write:
|
|
// 10,000,000,000,000,000,000.
|
|
// - Therefore, if the number is positive and lower than that, it's overflow.
|
|
// - The value we are looking at is less than or equal to INT64_MAX.
|
|
//
|
|
if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INCORRECT_TYPE; }
|
|
}
|
|
|
|
return i;
|
|
}
|
|
|
|
// Parse any number from 0 to 18,446,744,073,709,551,615
|
|
simdjson_unused simdjson_inline simdjson_result<uint64_t> parse_unsigned_in_string(const uint8_t * const src) noexcept {
|
|
const uint8_t *p = src + 1;
|
|
//
|
|
// Parse the integer part.
|
|
//
|
|
// PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
|
|
const uint8_t *const start_digits = p;
|
|
uint64_t i = 0;
|
|
while (parse_digit(*p, i)) { p++; }
|
|
|
|
// If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
|
|
// Optimization note: size_t is expected to be unsigned.
|
|
size_t digit_count = size_t(p - start_digits);
|
|
// The longest positive 64-bit number is 20 digits.
|
|
// We do it this way so we don't trigger this branch unless we must.
|
|
// Optimization note: the compiler can probably merge
|
|
// ((digit_count == 0) || (digit_count > 20))
|
|
// into a single branch since digit_count is unsigned.
|
|
if ((digit_count == 0) || (digit_count > 20)) { return INCORRECT_TYPE; }
|
|
// Here digit_count > 0.
|
|
if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; }
|
|
// We can do the following...
|
|
// if (!jsoncharutils::is_structural_or_whitespace(*p)) {
|
|
// return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR;
|
|
// }
|
|
// as a single table lookup:
|
|
if (*p != '"') { return NUMBER_ERROR; }
|
|
|
|
if (digit_count == 20) {
|
|
// Positive overflow check:
|
|
// - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the
|
|
// biggest uint64_t.
|
|
// - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX.
|
|
// If we got here, it's a 20 digit number starting with the digit "1".
|
|
// - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller
|
|
// than 1,553,255,926,290,448,384.
|
|
// - That is smaller than the smallest possible 20-digit number the user could write:
|
|
// 10,000,000,000,000,000,000.
|
|
// - Therefore, if the number is positive and lower than that, it's overflow.
|
|
// - The value we are looking at is less than or equal to INT64_MAX.
|
|
//
|
|
// Note: we use src[1] and not src[0] because src[0] is the quote character in this
|
|
// instance.
|
|
if (src[1] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INCORRECT_TYPE; }
|
|
}
|
|
|
|
return i;
|
|
}
|
|
|
|
// Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
|
|
simdjson_unused simdjson_inline simdjson_result<int64_t> parse_integer(const uint8_t *src) noexcept {
|
|
//
|
|
// Check for minus sign
|
|
//
|
|
bool negative = (*src == '-');
|
|
const uint8_t *p = src + uint8_t(negative);
|
|
|
|
//
|
|
// Parse the integer part.
|
|
//
|
|
// PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
|
|
const uint8_t *const start_digits = p;
|
|
uint64_t i = 0;
|
|
while (parse_digit(*p, i)) { p++; }
|
|
|
|
// If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
|
|
// Optimization note: size_t is expected to be unsigned.
|
|
size_t digit_count = size_t(p - start_digits);
|
|
// We go from
|
|
// -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
|
|
// so we can never represent numbers that have more than 19 digits.
|
|
size_t longest_digit_count = 19;
|
|
// Optimization note: the compiler can probably merge
|
|
// ((digit_count == 0) || (digit_count > longest_digit_count))
|
|
// into a single branch since digit_count is unsigned.
|
|
if ((digit_count == 0) || (digit_count > longest_digit_count)) { return INCORRECT_TYPE; }
|
|
// Here digit_count > 0.
|
|
if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; }
|
|
// We can do the following...
|
|
// if (!jsoncharutils::is_structural_or_whitespace(*p)) {
|
|
// return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR;
|
|
// }
|
|
// as a single table lookup:
|
|
if(integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); }
|
|
// Negative numbers have can go down to - INT64_MAX - 1 whereas positive numbers are limited to INT64_MAX.
|
|
// Performance note: This check is only needed when digit_count == longest_digit_count but it is
|
|
// so cheap that we might as well always make it.
|
|
if(i > uint64_t(INT64_MAX) + uint64_t(negative)) { return INCORRECT_TYPE; }
|
|
return negative ? (~i+1) : i;
|
|
}
|
|
|
|
// Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
|
|
// Never read at src_end or beyond
|
|
simdjson_unused simdjson_inline simdjson_result<int64_t> parse_integer(const uint8_t * const src, const uint8_t * const src_end) noexcept {
|
|
//
|
|
// Check for minus sign
|
|
//
|
|
if(src == src_end) { return NUMBER_ERROR; }
|
|
bool negative = (*src == '-');
|
|
const uint8_t *p = src + uint8_t(negative);
|
|
|
|
//
|
|
// Parse the integer part.
|
|
//
|
|
// PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
|
|
const uint8_t *const start_digits = p;
|
|
uint64_t i = 0;
|
|
while ((p != src_end) && parse_digit(*p, i)) { p++; }
|
|
|
|
// If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
|
|
// Optimization note: size_t is expected to be unsigned.
|
|
size_t digit_count = size_t(p - start_digits);
|
|
// We go from
|
|
// -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
|
|
// so we can never represent numbers that have more than 19 digits.
|
|
size_t longest_digit_count = 19;
|
|
// Optimization note: the compiler can probably merge
|
|
// ((digit_count == 0) || (digit_count > longest_digit_count))
|
|
// into a single branch since digit_count is unsigned.
|
|
if ((digit_count == 0) || (digit_count > longest_digit_count)) { return INCORRECT_TYPE; }
|
|
// Here digit_count > 0.
|
|
if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; }
|
|
// We can do the following...
|
|
// if (!jsoncharutils::is_structural_or_whitespace(*p)) {
|
|
// return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR;
|
|
// }
|
|
// as a single table lookup:
|
|
if((p != src_end) && integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); }
|
|
// Negative numbers have can go down to - INT64_MAX - 1 whereas positive numbers are limited to INT64_MAX.
|
|
// Performance note: This check is only needed when digit_count == longest_digit_count but it is
|
|
// so cheap that we might as well always make it.
|
|
if(i > uint64_t(INT64_MAX) + uint64_t(negative)) { return INCORRECT_TYPE; }
|
|
return negative ? (~i+1) : i;
|
|
}
|
|
|
|
// Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
|
|
simdjson_unused simdjson_inline simdjson_result<int64_t> parse_integer_in_string(const uint8_t *src) noexcept {
|
|
//
|
|
// Check for minus sign
|
|
//
|
|
bool negative = (*(src + 1) == '-');
|
|
src += uint8_t(negative) + 1;
|
|
|
|
//
|
|
// Parse the integer part.
|
|
//
|
|
// PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
|
|
const uint8_t *const start_digits = src;
|
|
uint64_t i = 0;
|
|
while (parse_digit(*src, i)) { src++; }
|
|
|
|
// If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
|
|
// Optimization note: size_t is expected to be unsigned.
|
|
size_t digit_count = size_t(src - start_digits);
|
|
// We go from
|
|
// -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
|
|
// so we can never represent numbers that have more than 19 digits.
|
|
size_t longest_digit_count = 19;
|
|
// Optimization note: the compiler can probably merge
|
|
// ((digit_count == 0) || (digit_count > longest_digit_count))
|
|
// into a single branch since digit_count is unsigned.
|
|
if ((digit_count == 0) || (digit_count > longest_digit_count)) { return INCORRECT_TYPE; }
|
|
// Here digit_count > 0.
|
|
if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; }
|
|
// We can do the following...
|
|
// if (!jsoncharutils::is_structural_or_whitespace(*src)) {
|
|
// return (*src == '.' || *src == 'e' || *src == 'E') ? INCORRECT_TYPE : NUMBER_ERROR;
|
|
// }
|
|
// as a single table lookup:
|
|
if(*src != '"') { return NUMBER_ERROR; }
|
|
// Negative numbers have can go down to - INT64_MAX - 1 whereas positive numbers are limited to INT64_MAX.
|
|
// Performance note: This check is only needed when digit_count == longest_digit_count but it is
|
|
// so cheap that we might as well always make it.
|
|
if(i > uint64_t(INT64_MAX) + uint64_t(negative)) { return INCORRECT_TYPE; }
|
|
return negative ? (~i+1) : i;
|
|
}
|
|
|
|
// Parse a JSON number (optional minus sign, integer part, optional fraction, optional
// exponent) as a double.
//
// Returns INCORRECT_TYPE when there is no integer digit. Returns NUMBER_ERROR for a leading
// zero followed by more integer digits, a '.' without decimal digits, an exponent with no
// digit or with more than 19 digits, a number that is not followed by a structural or
// whitespace character, or when parse_float_fallback reports a failure.
simdjson_unused simdjson_inline simdjson_result<double> parse_double(const uint8_t * src) noexcept {
  //
  // Step over the optional minus sign: src now designates the first digit.
  //
  const bool negative = (*src == '-');
  src += uint8_t(negative);

  //
  // Integer part. All digits (integer and decimal) are accumulated in `mantissa`.
  //
  uint64_t mantissa = 0;
  const uint8_t *cursor = src;
  cursor += parse_digit(*cursor, mantissa);
  const bool leading_zero = (mantissa == 0);
  while (parse_digit(*cursor, mantissa)) { ++cursor; }
  // no integer digits
  if (cursor == src) { return INCORRECT_TYPE; }
  // 0123 (zero must be solo)
  if (leading_zero && (cursor != src + 1)) { return NUMBER_ERROR; }

  //
  // Decimal part.
  //
  int64_t exponent = 0;
  bool overflow;
  if (simdjson_likely(*cursor == '.')) {
    ++cursor;
    const uint8_t *const first_decimal = cursor;
    if (!parse_digit(*cursor, mantissa)) { return NUMBER_ERROR; } // no decimal digits
    ++cursor;
    while (parse_digit(*cursor, mantissa)) { ++cursor; }
    // Each decimal digit scales the mantissa down by one power of ten.
    exponent = -(cursor - first_decimal);

    // Overflow check. More than 19 digits (minus the decimal point) may be overflow.
    overflow = cursor - src - 1 > 19;
    if (simdjson_unlikely(overflow && leading_zero)) {
      // The text starts with "0.": skip the zeros after the point and count only the
      // digits from the first non-zero one to see if it still overflows.
      const uint8_t *first_significant = src + 2;
      while (*first_significant == '0') { ++first_significant; }
      overflow = cursor - first_significant > 19;
    }
  } else {
    overflow = cursor - src > 19;
  }

  //
  // Exponent.
  //
  if (*cursor == 'e' || *cursor == 'E') {
    ++cursor;
    const bool exp_negative = (*cursor == '-');
    cursor += exp_negative || *cursor == '+';

    uint64_t exp_magnitude = 0;
    const uint8_t *const first_exp_digit = cursor;
    while (parse_digit(*cursor, exp_magnitude)) { ++cursor; }
    // no exp digits, or 20+ exp digits
    if (cursor - first_exp_digit == 0 || cursor - first_exp_digit > 19) { return NUMBER_ERROR; }

    exponent += exp_negative ? 0 - exp_magnitude : exp_magnitude;
  }

  if (jsoncharutils::is_not_structural_or_whitespace(*cursor)) { return NUMBER_ERROR; }

  overflow = overflow || exponent < simdjson::internal::smallest_power || exponent > simdjson::internal::largest_power;

  //
  // Assemble the double from (exponent, mantissa, sign) when nothing overflowed and
  // compute_float_64 succeeds; otherwise re-parse the original text (sign included)
  // with parse_float_fallback.
  //
  double result;
  if (simdjson_likely(!overflow)) {
    if (compute_float_64(exponent, mantissa, negative, result)) { return result; }
  }
  if (!parse_float_fallback(src - uint8_t(negative), &result)) {
    return NUMBER_ERROR;
  }
  return result;
}
|
|
|
|
// Tell whether the number starting at src begins with a minus sign.
// Only the first byte is inspected; the rest of the number is not validated.
simdjson_unused simdjson_inline bool is_negative(const uint8_t * src) noexcept {
  return src[0] == '-';
}
|
|
|
|
// Tell whether the number starting at src is written as an integer: an optional minus
// sign, one or more digits, then a structural or whitespace character.
//
// Returns NUMBER_ERROR when there is no digit at all, and false when the digits are
// followed by anything else (e.g. '.', 'e' or 'E'). The digits themselves are not
// range-checked.
simdjson_unused simdjson_inline simdjson_result<bool> is_integer(const uint8_t * src) noexcept {
  // Skip the optional minus sign.
  const uint8_t *const first_digit = src + uint8_t(*src == '-');

  // Skip the run of ASCII digits.
  const uint8_t *cursor = first_digit;
  while (*cursor >= '0' && *cursor <= '9') { ++cursor; }

  if (cursor == first_digit) { return NUMBER_ERROR; }
  if (!jsoncharutils::is_structural_or_whitespace(*cursor)) { return false; }
  return true;
}
|
|
|
|
// Classify the number starting at src without computing its value.
//
// - NUMBER_ERROR when there is no digit after the optional minus sign.
// - floating_point_number when the digits are not directly followed by a structural or
//   whitespace character (hopefully because of '.', 'e' or 'E': this is not verified here).
// - Otherwise the number is an integer, classified by comparing its decimal text with
//   9223372036854775808 (2**63) and 18446744073709551616 (2**64):
//     signed_integer   : values in [-2**63, 2**63)
//     unsigned_integer : values in [2**63, 2**64)
//     big_integer      : everything else
//   Comparing equal-length digit strings with memcmp orders them numerically.
simdjson_unused simdjson_inline simdjson_result<number_type> get_number_type(const uint8_t * src) noexcept {
  const bool negative = (*src == '-');
  const uint8_t *const digits = src + uint8_t(negative);

  // Skip the run of ASCII digits.
  const uint8_t *cursor = digits;
  while (*cursor >= '0' && *cursor <= '9') { ++cursor; }
  const size_t num_digits = size_t(cursor - digits);
  if (num_digits == 0) { return NUMBER_ERROR; }

  if (!jsoncharutils::is_structural_or_whitespace(*cursor)) {
    // Hopefully, we have 'e' or 'E' or '.'.
    return number_type::floating_point_number;
  }

  // We have an integer. More than 20 digits never fits in 64 bits.
  if (simdjson_unlikely(num_digits > 20)) { return number_type::big_integer; }

  if (negative) {
    // If the number is negative and fits, it must be a signed integer: its magnitude
    // may be at most 9223372036854775808.
    if (simdjson_unlikely(num_digits > 19)) { return number_type::big_integer; }
    if (simdjson_unlikely(num_digits == 19 && memcmp(digits, "9223372036854775808", 19) > 0)) {
      return number_type::big_integer;
    }
    return number_type::signed_integer;
  }

  if (num_digits == 20) {
    // Positive with 20 digits: a big integer from 2**64 upward, unsigned below that.
    return (memcmp(digits, "18446744073709551616", 20) >= 0) ? number_type::big_integer
                                                            : number_type::unsigned_integer;
  }

  // Positive and smaller than 2**64: values larger than or equal to 9223372036854775808
  // are unsigned integers, the other values are signed integers.
  if (num_digits == 19 && memcmp(digits, "9223372036854775808", 19) >= 0) {
    return number_type::unsigned_integer;
  }
  return number_type::signed_integer;
}
|
|
|
|
// Never read at src_end or beyond
|
|
simdjson_unused simdjson_inline simdjson_result<double> parse_double(const uint8_t * src, const uint8_t * const src_end) noexcept {
|
|
if(src == src_end) { return NUMBER_ERROR; }
|
|
//
|
|
// Check for minus sign
|
|
//
|
|
bool negative = (*src == '-');
|
|
src += uint8_t(negative);
|
|
|
|
//
|
|
// Parse the integer part.
|
|
//
|
|
uint64_t i = 0;
|
|
const uint8_t *p = src;
|
|
if(p == src_end) { return NUMBER_ERROR; }
|
|
p += parse_digit(*p, i);
|
|
bool leading_zero = (i == 0);
|
|
while ((p != src_end) && parse_digit(*p, i)) { p++; }
|
|
// no integer digits, or 0123 (zero must be solo)
|
|
if ( p == src ) { return INCORRECT_TYPE; }
|
|
if ( (leading_zero && p != src+1)) { return NUMBER_ERROR; }
|
|
|
|
//
|
|
// Parse the decimal part.
|
|
//
|
|
int64_t exponent = 0;
|
|
bool overflow;
|
|
if (simdjson_likely((p != src_end) && (*p == '.'))) {
|
|
p++;
|
|
const uint8_t *start_decimal_digits = p;
|
|
if ((p == src_end) || !parse_digit(*p, i)) { return NUMBER_ERROR; } // no decimal digits
|
|
p++;
|
|
while ((p != src_end) && parse_digit(*p, i)) { p++; }
|
|
exponent = -(p - start_decimal_digits);
|
|
|
|
// Overflow check. More than 19 digits (minus the decimal) may be overflow.
|
|
overflow = p-src-1 > 19;
|
|
if (simdjson_unlikely(overflow && leading_zero)) {
|
|
// Skip leading 0.00000 and see if it still overflows
|
|
const uint8_t *start_digits = src + 2;
|
|
while (*start_digits == '0') { start_digits++; }
|
|
overflow = start_digits-src > 19;
|
|
}
|
|
} else {
|
|
overflow = p-src > 19;
|
|
}
|
|
|
|
//
|
|
// Parse the exponent
|
|
//
|
|
if ((p != src_end) && (*p == 'e' || *p == 'E')) {
|
|
p++;
|
|
if(p == src_end) { return NUMBER_ERROR; }
|
|
bool exp_neg = *p == '-';
|
|
p += exp_neg || *p == '+';
|
|
|
|
uint64_t exp = 0;
|
|
const uint8_t *start_exp_digits = p;
|
|
while ((p != src_end) && parse_digit(*p, exp)) { p++; }
|
|
// no exp digits, or 20+ exp digits
|
|
if (p-start_exp_digits == 0 || p-start_exp_digits > 19) { return NUMBER_ERROR; }
|
|
|
|
exponent += exp_neg ? 0-exp : exp;
|
|
}
|
|
|
|
if ((p != src_end) && jsoncharutils::is_not_structural_or_whitespace(*p)) { return NUMBER_ERROR; }
|
|
|
|
overflow = overflow || exponent < simdjson::internal::smallest_power || exponent > simdjson::internal::largest_power;
|
|
|
|
//
|
|
// Assemble (or slow-parse) the float
|
|
//
|
|
double d;
|
|
if (simdjson_likely(!overflow)) {
|
|
if (compute_float_64(exponent, i, negative, d)) { return d; }
|
|
}
|
|
if (!parse_float_fallback(src - uint8_t(negative), src_end, &d)) {
|
|
return NUMBER_ERROR;
|
|
}
|
|
return d;
|
|
}
|
|
|
|
// Parse a JSON number written inside a JSON string (i.e. surrounded by double quotes)
// as a double.
//
// src points at the opening quote; the optional minus sign, integer part, optional fraction
// and optional exponent follow, and the number must be terminated by the closing quote.
//
// Returns INCORRECT_TYPE when there is no integer digit. Returns NUMBER_ERROR for a leading
// zero followed by more integer digits, a '.' without decimal digits, an exponent with no
// digit or with more than 19 digits, a number that is not followed by a quote, or when
// parse_float_fallback reports a failure.
simdjson_unused simdjson_inline simdjson_result<double> parse_double_in_string(const uint8_t * src) noexcept {
  //
  // Step over the opening quote and the optional minus sign: src now designates the
  // first digit.
  //
  const bool negative = (src[1] == '-');
  src += uint8_t(negative) + 1;

  //
  // Integer part. All digits (integer and decimal) are accumulated in `mantissa`.
  //
  uint64_t mantissa = 0;
  const uint8_t *cursor = src;
  cursor += parse_digit(*cursor, mantissa);
  const bool leading_zero = (mantissa == 0);
  while (parse_digit(*cursor, mantissa)) { ++cursor; }
  // no integer digits
  if (cursor == src) { return INCORRECT_TYPE; }
  // 0123 (zero must be solo)
  if (leading_zero && (cursor != src + 1)) { return NUMBER_ERROR; }

  //
  // Decimal part.
  //
  int64_t exponent = 0;
  bool overflow;
  if (simdjson_likely(*cursor == '.')) {
    ++cursor;
    const uint8_t *const first_decimal = cursor;
    if (!parse_digit(*cursor, mantissa)) { return NUMBER_ERROR; } // no decimal digits
    ++cursor;
    while (parse_digit(*cursor, mantissa)) { ++cursor; }
    // Each decimal digit scales the mantissa down by one power of ten.
    exponent = -(cursor - first_decimal);

    // Overflow check. More than 19 digits (minus the decimal point) may be overflow.
    overflow = cursor - src - 1 > 19;
    if (simdjson_unlikely(overflow && leading_zero)) {
      // The text starts with "0.": skip the zeros after the point and count only the
      // digits from the first non-zero one to see if it still overflows.
      const uint8_t *first_significant = src + 2;
      while (*first_significant == '0') { ++first_significant; }
      overflow = cursor - first_significant > 19;
    }
  } else {
    overflow = cursor - src > 19;
  }

  //
  // Exponent.
  //
  if (*cursor == 'e' || *cursor == 'E') {
    ++cursor;
    const bool exp_negative = (*cursor == '-');
    cursor += exp_negative || *cursor == '+';

    uint64_t exp_magnitude = 0;
    const uint8_t *const first_exp_digit = cursor;
    while (parse_digit(*cursor, exp_magnitude)) { ++cursor; }
    // no exp digits, or 20+ exp digits
    if (cursor - first_exp_digit == 0 || cursor - first_exp_digit > 19) { return NUMBER_ERROR; }

    exponent += exp_negative ? 0 - exp_magnitude : exp_magnitude;
  }

  // Inside a string, the only valid terminator is the closing quote.
  if (*cursor != '"') { return NUMBER_ERROR; }

  overflow = overflow || exponent < simdjson::internal::smallest_power || exponent > simdjson::internal::largest_power;

  //
  // Assemble the double from (exponent, mantissa, sign) when nothing overflowed and
  // compute_float_64 succeeds; otherwise re-parse the text after the opening quote
  // (sign included) with parse_float_fallback.
  //
  double result;
  if (simdjson_likely(!overflow)) {
    if (compute_float_64(exponent, mantissa, negative, result)) { return result; }
  }
  if (!parse_float_fallback(src - uint8_t(negative), &result)) {
    return NUMBER_ERROR;
  }
  return result;
}
|
|
|
|
} // unnamed namespace
|
|
#endif // SIMDJSON_SKIPNUMBERPARSING
|
|
|
|
} // namespace numberparsing
|
|
|
|
inline std::ostream& operator<<(std::ostream& out, number_type type) noexcept {
|
|
switch (type) {
|
|
case number_type::signed_integer: out << "integer in [-9223372036854775808,9223372036854775808)"; break;
|
|
case number_type::unsigned_integer: out << "unsigned integer in [9223372036854775808,18446744073709551616)"; break;
|
|
case number_type::floating_point_number: out << "floating-point number (binary64)"; break;
|
|
case number_type::big_integer: out << "big integer"; break;
|
|
default: SIMDJSON_UNREACHABLE();
|
|
}
|
|
return out;
|
|
}
|
|
|
|
} // namespace haswell
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_GENERIC_NUMBERPARSING_H
|
|
/* end file simdjson/generic/numberparsing.h for haswell */
|
|
|
|
/* including simdjson/generic/implementation_simdjson_result_base-inl.h for haswell: #include "simdjson/generic/implementation_simdjson_result_base-inl.h" */
|
|
/* begin file simdjson/generic/implementation_simdjson_result_base-inl.h for haswell */
|
|
#ifndef SIMDJSON_GENERIC_IMPLEMENTATION_SIMDJSON_RESULT_BASE_INL_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_GENERIC_IMPLEMENTATION_SIMDJSON_RESULT_BASE_INL_H */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/generic/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/generic/implementation_simdjson_result_base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace haswell {
|
|
|
|
//
|
|
// internal::implementation_simdjson_result_base<T> inline implementation
|
|
//
|
|
|
|
template<typename T>
|
|
simdjson_inline void implementation_simdjson_result_base<T>::tie(T &value, error_code &error) && noexcept {
|
|
error = this->second;
|
|
if (!error) {
|
|
value = std::forward<implementation_simdjson_result_base<T>>(*this).first;
|
|
}
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_warn_unused simdjson_inline error_code implementation_simdjson_result_base<T>::get(T &value) && noexcept {
|
|
error_code error;
|
|
std::forward<implementation_simdjson_result_base<T>>(*this).tie(value, error);
|
|
return error;
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_inline error_code implementation_simdjson_result_base<T>::error() const noexcept {
|
|
return this->second;
|
|
}
|
|
|
|
#if SIMDJSON_EXCEPTIONS
|
|
|
|
template<typename T>
|
|
simdjson_inline T& implementation_simdjson_result_base<T>::value() & noexcept(false) {
|
|
if (error()) { throw simdjson_error(error()); }
|
|
return this->first;
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_inline T&& implementation_simdjson_result_base<T>::value() && noexcept(false) {
|
|
return std::forward<implementation_simdjson_result_base<T>>(*this).take_value();
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_inline T&& implementation_simdjson_result_base<T>::take_value() && noexcept(false) {
|
|
if (error()) { throw simdjson_error(error()); }
|
|
return std::forward<T>(this->first);
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_inline implementation_simdjson_result_base<T>::operator T&&() && noexcept(false) {
|
|
return std::forward<implementation_simdjson_result_base<T>>(*this).take_value();
|
|
}
|
|
|
|
#endif // SIMDJSON_EXCEPTIONS
|
|
|
|
template<typename T>
|
|
simdjson_inline const T& implementation_simdjson_result_base<T>::value_unsafe() const& noexcept {
|
|
return this->first;
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_inline T& implementation_simdjson_result_base<T>::value_unsafe() & noexcept {
|
|
return this->first;
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_inline T&& implementation_simdjson_result_base<T>::value_unsafe() && noexcept {
|
|
return std::forward<T>(this->first);
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_inline implementation_simdjson_result_base<T>::implementation_simdjson_result_base(T &&value, error_code error) noexcept
|
|
: first{std::forward<T>(value)}, second{error} {}
|
|
template<typename T>
|
|
simdjson_inline implementation_simdjson_result_base<T>::implementation_simdjson_result_base(error_code error) noexcept
|
|
: implementation_simdjson_result_base(T{}, error) {}
|
|
template<typename T>
|
|
simdjson_inline implementation_simdjson_result_base<T>::implementation_simdjson_result_base(T &&value) noexcept
|
|
: implementation_simdjson_result_base(std::forward<T>(value), SUCCESS) {}
|
|
|
|
} // namespace haswell
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_GENERIC_IMPLEMENTATION_SIMDJSON_RESULT_BASE_INL_H
|
|
/* end file simdjson/generic/implementation_simdjson_result_base-inl.h for haswell */
|
|
/* end file simdjson/generic/amalgamated.h for haswell */
|
|
/* including simdjson/haswell/end.h: #include "simdjson/haswell/end.h" */
|
|
/* begin file simdjson/haswell/end.h */
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/haswell/base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
#if !SIMDJSON_CAN_ALWAYS_RUN_HASWELL
|
|
SIMDJSON_UNTARGET_REGION
|
|
#endif
|
|
|
|
/* undefining SIMDJSON_IMPLEMENTATION from "haswell" */
|
|
#undef SIMDJSON_IMPLEMENTATION
|
|
/* end file simdjson/haswell/end.h */
|
|
|
|
#endif // SIMDJSON_HASWELL_H
|
|
/* end file simdjson/haswell.h */
|
|
/* including simdjson/haswell/implementation.h: #include <simdjson/haswell/implementation.h> */
|
|
/* begin file simdjson/haswell/implementation.h */
|
|
#ifndef SIMDJSON_HASWELL_IMPLEMENTATION_H
|
|
#define SIMDJSON_HASWELL_IMPLEMENTATION_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/haswell/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/implementation.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/instruction_set.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
// The constructor may be executed on any host, so we take care not to use SIMDJSON_TARGET_HASWELL
|
|
namespace simdjson {
|
|
namespace haswell {
|
|
|
|
/**
|
|
* @private
|
|
*/
|
|
class implementation final : public simdjson::implementation {
|
|
public:
|
|
simdjson_inline implementation() : simdjson::implementation(
|
|
"haswell",
|
|
"Intel/AMD AVX2",
|
|
internal::instruction_set::AVX2 | internal::instruction_set::PCLMULQDQ | internal::instruction_set::BMI1 | internal::instruction_set::BMI2
|
|
) {}
|
|
simdjson_warn_unused error_code create_dom_parser_implementation(
|
|
size_t capacity,
|
|
size_t max_length,
|
|
std::unique_ptr<internal::dom_parser_implementation>& dst
|
|
) const noexcept final;
|
|
simdjson_warn_unused error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final;
|
|
simdjson_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final;
|
|
};
|
|
|
|
} // namespace haswell
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_HASWELL_IMPLEMENTATION_H
|
|
/* end file simdjson/haswell/implementation.h */
|
|
|
|
/* including simdjson/haswell/begin.h: #include <simdjson/haswell/begin.h> */
|
|
/* begin file simdjson/haswell/begin.h */
|
|
/* defining SIMDJSON_IMPLEMENTATION to "haswell" */
|
|
#define SIMDJSON_IMPLEMENTATION haswell
|
|
|
|
/* including simdjson/haswell/base.h: #include "simdjson/haswell/base.h" */
|
|
/* begin file simdjson/haswell/base.h */
|
|
#ifndef SIMDJSON_HASWELL_BASE_H
|
|
#define SIMDJSON_HASWELL_BASE_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
// The constructor may be executed on any host, so we take care not to use SIMDJSON_TARGET_HASWELL
|
|
namespace simdjson {
|
|
/**
|
|
* Implementation for Haswell (Intel AVX2).
|
|
*/
|
|
namespace haswell {
|
|
|
|
class implementation;
|
|
|
|
namespace {
|
|
namespace simd {
|
|
template <typename T> struct simd8;
|
|
template <typename T> struct simd8x64;
|
|
} // namespace simd
|
|
} // unnamed namespace
|
|
|
|
} // namespace haswell
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_HASWELL_BASE_H
|
|
/* end file simdjson/haswell/base.h */
|
|
/* including simdjson/haswell/intrinsics.h: #include "simdjson/haswell/intrinsics.h" */
|
|
/* begin file simdjson/haswell/intrinsics.h */
|
|
#ifndef SIMDJSON_HASWELL_INTRINSICS_H
|
|
#define SIMDJSON_HASWELL_INTRINSICS_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/haswell/base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
#if SIMDJSON_VISUAL_STUDIO
|
|
// under clang within visual studio, this will include <x86intrin.h>
|
|
#include <intrin.h> // visual studio or clang
|
|
#else
|
|
#include <x86intrin.h> // elsewhere
|
|
#endif // SIMDJSON_VISUAL_STUDIO
|
|
|
|
#if SIMDJSON_CLANG_VISUAL_STUDIO
|
|
/**
|
|
* You are not supposed, normally, to include these
|
|
* headers directly. Instead you should either include intrin.h
|
|
* or x86intrin.h. However, when compiling with clang
|
|
* under Windows (i.e., when _MSC_VER is set), these headers
|
|
* only get included *if* the corresponding features are detected
|
|
* from macros:
|
|
* e.g., if __AVX2__ is set... in turn, we normally set these
|
|
* macros by compiling against the corresponding architecture
|
|
* (e.g., arch:AVX2, -mavx2, etc.) which compiles the whole
|
|
* software with these advanced instructions. In simdjson, we
|
|
* want to compile the whole program for a generic target,
|
|
* and only target our specific kernels. As a workaround,
|
|
* we directly include the needed headers. These headers would
|
|
* normally guard against such usage, but we carefully included
|
|
* <x86intrin.h> (or <intrin.h>) before, so the headers
|
|
* are fooled.
|
|
*/
|
|
#include <bmiintrin.h> // for _blsr_u64
|
|
#include <lzcntintrin.h> // for __lzcnt64
|
|
#include <immintrin.h> // for most things (AVX2, AVX512, _popcnt64)
|
|
#include <smmintrin.h>
|
|
#include <tmmintrin.h>
|
|
#include <avxintrin.h>
|
|
#include <avx2intrin.h>
|
|
#include <wmmintrin.h> // for _mm_clmulepi64_si128
|
|
// unfortunately, we may not get _blsr_u64, but, thankfully, clang
// has it as a macro.
#ifndef _blsr_u64
// we roll our own: clear the lowest set bit of n.
// The argument is fully parenthesized so that an expression such as
// _blsr_u64(a | b) expands with the intended precedence. Note that n is
// evaluated twice, so it must not have side effects.
#define _blsr_u64(n) (((n) - 1) & (n))
#endif // _blsr_u64
|
|
#endif // SIMDJSON_CLANG_VISUAL_STUDIO
|
|
|
|
static_assert(sizeof(__m256i) <= simdjson::SIMDJSON_PADDING, "insufficient padding for haswell kernel.");
|
|
|
|
#endif // SIMDJSON_HASWELL_INTRINSICS_H
|
|
/* end file simdjson/haswell/intrinsics.h */
|
|
|
|
#if !SIMDJSON_CAN_ALWAYS_RUN_HASWELL
|
|
SIMDJSON_TARGET_REGION("avx2,bmi,pclmul,lzcnt,popcnt")
|
|
#endif
|
|
|
|
/* including simdjson/haswell/bitmanipulation.h: #include "simdjson/haswell/bitmanipulation.h" */
|
|
/* begin file simdjson/haswell/bitmanipulation.h */
|
|
#ifndef SIMDJSON_HASWELL_BITMANIPULATION_H
|
|
#define SIMDJSON_HASWELL_BITMANIPULATION_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/haswell/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/haswell/intrinsics.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/haswell/bitmask.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace haswell {
|
|
namespace {
|
|
|
|
// We sometimes call trailing_zero on inputs that are zero,
|
|
// but the algorithms do not end up using the returned value.
|
|
// Sadly, sanitizers are not smart enough to figure it out.
|
|
SIMDJSON_NO_SANITIZE_UNDEFINED
|
|
// This function can be used safely even if not all bytes have been
|
|
// initialized.
|
|
// See issue https://github.com/simdjson/simdjson/issues/1965
|
|
SIMDJSON_NO_SANITIZE_MEMORY
|
|
simdjson_inline int trailing_zeroes(uint64_t input_num) {
|
|
#if SIMDJSON_REGULAR_VISUAL_STUDIO
|
|
return (int)_tzcnt_u64(input_num);
|
|
#else // SIMDJSON_REGULAR_VISUAL_STUDIO
|
|
////////
|
|
// You might expect the next line to be equivalent to
|
|
// return (int)_tzcnt_u64(input_num);
|
|
// but the generated code differs and might be less efficient?
|
|
////////
|
|
return __builtin_ctzll(input_num);
|
|
#endif // SIMDJSON_REGULAR_VISUAL_STUDIO
|
|
}
|
|
|
|
/* result might be undefined when input_num is zero */
|
|
simdjson_inline uint64_t clear_lowest_bit(uint64_t input_num) {
|
|
return _blsr_u64(input_num);
|
|
}
|
|
|
|
/* result might be undefined when input_num is zero */
|
|
simdjson_inline int leading_zeroes(uint64_t input_num) {
|
|
return int(_lzcnt_u64(input_num));
|
|
}
|
|
|
|
#if SIMDJSON_REGULAR_VISUAL_STUDIO
|
|
simdjson_inline unsigned __int64 count_ones(uint64_t input_num) {
|
|
// note: we do not support legacy 32-bit Windows in this kernel
|
|
return __popcnt64(input_num);// Visual Studio wants two underscores
|
|
}
|
|
#else
|
|
simdjson_inline long long int count_ones(uint64_t input_num) {
|
|
return _popcnt64(input_num);
|
|
}
|
|
#endif
|
|
|
|
// Store value1 + value2 in *result and return the carry/overflow indicator reported by
// the toolchain primitive (_addcarry_u64 or __builtin_uaddll_overflow).
simdjson_inline bool add_overflow(uint64_t value1, uint64_t value2,
                                  uint64_t *result) {
#if SIMDJSON_REGULAR_VISUAL_STUDIO
  unsigned __int64 *const sum = reinterpret_cast<unsigned __int64 *>(result);
  return _addcarry_u64(0, value1, value2, sum);
#else
  unsigned long long *const sum = reinterpret_cast<unsigned long long *>(result);
  return __builtin_uaddll_overflow(value1, value2, sum);
#endif
}
|
|
|
|
} // unnamed namespace
|
|
} // namespace haswell
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_HASWELL_BITMANIPULATION_H
|
|
/* end file simdjson/haswell/bitmanipulation.h */
|
|
/* including simdjson/haswell/bitmask.h: #include "simdjson/haswell/bitmask.h" */
|
|
/* begin file simdjson/haswell/bitmask.h */
|
|
#ifndef SIMDJSON_HASWELL_BITMASK_H
|
|
#define SIMDJSON_HASWELL_BITMASK_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/haswell/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/haswell/intrinsics.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace haswell {
|
|
namespace {
|
|
|
|
//
|
|
// Perform a "cumulative bitwise xor," flipping bits each time a 1 is encountered.
|
|
//
|
|
// For example, prefix_xor(00100100) == 00011100
|
|
//
|
|
simdjson_inline uint64_t prefix_xor(const uint64_t bitmask) {
|
|
// There should be no such thing with a processor supporting avx2
|
|
// but not clmul.
|
|
__m128i all_ones = _mm_set1_epi8('\xFF');
|
|
__m128i result = _mm_clmulepi64_si128(_mm_set_epi64x(0ULL, bitmask), all_ones, 0);
|
|
return _mm_cvtsi128_si64(result);
|
|
}
|
|
|
|
} // unnamed namespace
|
|
} // namespace haswell
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_HASWELL_BITMASK_H
|
|
/* end file simdjson/haswell/bitmask.h */
|
|
/* including simdjson/haswell/numberparsing_defs.h: #include "simdjson/haswell/numberparsing_defs.h" */
|
|
/* begin file simdjson/haswell/numberparsing_defs.h */
|
|
#ifndef SIMDJSON_HASWELL_NUMBERPARSING_DEFS_H
|
|
#define SIMDJSON_HASWELL_NUMBERPARSING_DEFS_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/haswell/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/haswell/intrinsics.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/numberparsing_tables.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace haswell {
|
|
namespace numberparsing {
|
|
|
|
/** @private */
|
|
static simdjson_inline uint32_t parse_eight_digits_unrolled(const uint8_t *chars) {
|
|
// this actually computes *16* values so we are being wasteful.
|
|
const __m128i ascii0 = _mm_set1_epi8('0');
|
|
const __m128i mul_1_10 =
|
|
_mm_setr_epi8(10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1);
|
|
const __m128i mul_1_100 = _mm_setr_epi16(100, 1, 100, 1, 100, 1, 100, 1);
|
|
const __m128i mul_1_10000 =
|
|
_mm_setr_epi16(10000, 1, 10000, 1, 10000, 1, 10000, 1);
|
|
const __m128i input = _mm_sub_epi8(
|
|
_mm_loadu_si128(reinterpret_cast<const __m128i *>(chars)), ascii0);
|
|
const __m128i t1 = _mm_maddubs_epi16(input, mul_1_10);
|
|
const __m128i t2 = _mm_madd_epi16(t1, mul_1_100);
|
|
const __m128i t3 = _mm_packus_epi32(t2, t2);
|
|
const __m128i t4 = _mm_madd_epi16(t3, mul_1_10000);
|
|
return _mm_cvtsi128_si32(
|
|
t4); // only captures the sum of the first 8 digits, drop the rest
|
|
}
|
|
|
|
/** @private */
|
|
simdjson_inline internal::value128 full_multiplication(uint64_t value1, uint64_t value2) {
|
|
internal::value128 answer;
|
|
#if SIMDJSON_REGULAR_VISUAL_STUDIO || SIMDJSON_IS_32BITS
|
|
#if SIMDJSON_IS_ARM64
|
|
// ARM64 has native support for 64-bit multiplications, no need to emultate
|
|
answer.high = __umulh(value1, value2);
|
|
answer.low = value1 * value2;
|
|
#else
|
|
answer.low = _umul128(value1, value2, &answer.high); // _umul128 not available on ARM64
|
|
#endif // SIMDJSON_IS_ARM64
|
|
#else // SIMDJSON_REGULAR_VISUAL_STUDIO || SIMDJSON_IS_32BITS
|
|
__uint128_t r = (static_cast<__uint128_t>(value1)) * value2;
|
|
answer.low = uint64_t(r);
|
|
answer.high = uint64_t(r >> 64);
|
|
#endif
|
|
return answer;
|
|
}
|
|
|
|
} // namespace numberparsing
|
|
} // namespace haswell
|
|
} // namespace simdjson
|
|
|
|
#define SIMDJSON_SWAR_NUMBER_PARSING 1
|
|
|
|
#endif // SIMDJSON_HASWELL_NUMBERPARSING_DEFS_H
|
|
/* end file simdjson/haswell/numberparsing_defs.h */
|
|
/* including simdjson/haswell/simd.h: #include "simdjson/haswell/simd.h" */
|
|
/* begin file simdjson/haswell/simd.h */
|
|
#ifndef SIMDJSON_HASWELL_SIMD_H
|
|
#define SIMDJSON_HASWELL_SIMD_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/haswell/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/haswell/intrinsics.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/haswell/bitmanipulation.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/simdprune_tables.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace haswell {
|
|
namespace {
|
|
namespace simd {
|
|
|
|
// Forward-declared so they can be used by splat and friends.
|
|
template<typename Child>
|
|
struct base {
|
|
__m256i value;
|
|
|
|
// Zero constructor
|
|
simdjson_inline base() : value{__m256i()} {}
|
|
|
|
// Conversion from SIMD register
|
|
simdjson_inline base(const __m256i _value) : value(_value) {}
|
|
|
|
// Conversion to SIMD register
|
|
simdjson_inline operator const __m256i&() const { return this->value; }
|
|
simdjson_inline operator __m256i&() { return this->value; }
|
|
|
|
// Bit operations
|
|
simdjson_inline Child operator|(const Child other) const { return _mm256_or_si256(*this, other); }
|
|
simdjson_inline Child operator&(const Child other) const { return _mm256_and_si256(*this, other); }
|
|
simdjson_inline Child operator^(const Child other) const { return _mm256_xor_si256(*this, other); }
|
|
simdjson_inline Child bit_andnot(const Child other) const { return _mm256_andnot_si256(other, *this); }
|
|
simdjson_inline Child& operator|=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast | other; return *this_cast; }
|
|
simdjson_inline Child& operator&=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast & other; return *this_cast; }
|
|
simdjson_inline Child& operator^=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast ^ other; return *this_cast; }
|
|
};
|
|
|
|
// Forward-declared so they can be used by splat and friends.
|
|
template<typename T>
|
|
struct simd8;
|
|
|
|
template<typename T, typename Mask=simd8<bool>>
|
|
struct base8: base<simd8<T>> {
|
|
typedef uint32_t bitmask_t;
|
|
typedef uint64_t bitmask2_t;
|
|
|
|
simdjson_inline base8() : base<simd8<T>>() {}
|
|
simdjson_inline base8(const __m256i _value) : base<simd8<T>>(_value) {}
|
|
|
|
friend simdjson_really_inline Mask operator==(const simd8<T> lhs, const simd8<T> rhs) { return _mm256_cmpeq_epi8(lhs, rhs); }
|
|
|
|
static const int SIZE = sizeof(base<T>::value);
|
|
|
|
template<int N=1>
|
|
simdjson_inline simd8<T> prev(const simd8<T> prev_chunk) const {
|
|
return _mm256_alignr_epi8(*this, _mm256_permute2x128_si256(prev_chunk, *this, 0x21), 16 - N);
|
|
}
|
|
};
|
|
|
|
// SIMD byte mask type (returned by things like eq and gt)
|
|
template<>
|
|
struct simd8<bool>: base8<bool> {
|
|
static simdjson_inline simd8<bool> splat(bool _value) { return _mm256_set1_epi8(uint8_t(-(!!_value))); }
|
|
|
|
simdjson_inline simd8() : base8() {}
|
|
simdjson_inline simd8(const __m256i _value) : base8<bool>(_value) {}
|
|
// Splat constructor
|
|
simdjson_inline simd8(bool _value) : base8<bool>(splat(_value)) {}
|
|
|
|
simdjson_inline int to_bitmask() const { return _mm256_movemask_epi8(*this); }
|
|
simdjson_inline bool any() const { return !_mm256_testz_si256(*this, *this); }
|
|
simdjson_inline simd8<bool> operator~() const { return *this ^ true; }
|
|
};
|
|
|
|
template<typename T>
|
|
struct base8_numeric: base8<T> {
|
|
static simdjson_inline simd8<T> splat(T _value) { return _mm256_set1_epi8(_value); }
|
|
static simdjson_inline simd8<T> zero() { return _mm256_setzero_si256(); }
|
|
static simdjson_inline simd8<T> load(const T values[32]) {
|
|
return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(values));
|
|
}
|
|
// Repeat 16 values as many times as necessary (usually for lookup tables)
|
|
static simdjson_inline simd8<T> repeat_16(
|
|
T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7,
|
|
T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15
|
|
) {
|
|
return simd8<T>(
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15,
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15
|
|
);
|
|
}
|
|
|
|
simdjson_inline base8_numeric() : base8<T>() {}
|
|
simdjson_inline base8_numeric(const __m256i _value) : base8<T>(_value) {}
|
|
|
|
// Store to array
|
|
simdjson_inline void store(T dst[32]) const { return _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), *this); }
|
|
|
|
// Addition/subtraction are the same for signed and unsigned
|
|
simdjson_inline simd8<T> operator+(const simd8<T> other) const { return _mm256_add_epi8(*this, other); }
|
|
simdjson_inline simd8<T> operator-(const simd8<T> other) const { return _mm256_sub_epi8(*this, other); }
|
|
simdjson_inline simd8<T>& operator+=(const simd8<T> other) { *this = *this + other; return *static_cast<simd8<T>*>(this); }
|
|
simdjson_inline simd8<T>& operator-=(const simd8<T> other) { *this = *this - other; return *static_cast<simd8<T>*>(this); }
|
|
|
|
// Override to distinguish from bool version
|
|
simdjson_inline simd8<T> operator~() const { return *this ^ 0xFFu; }
|
|
|
|
// Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values)
|
|
template<typename L>
|
|
simdjson_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
|
|
return _mm256_shuffle_epi8(lookup_table, *this);
|
|
}
|
|
|
|
// Copies to 'output" all bytes corresponding to a 0 in the mask (interpreted as a bitset).
|
|
// Passing a 0 value for mask would be equivalent to writing out every byte to output.
|
|
// Only the first 32 - count_ones(mask) bytes of the result are significant but 32 bytes
|
|
// get written.
|
|
// Design consideration: it seems like a function with the
|
|
// signature simd8<L> compress(uint32_t mask) would be
|
|
// sensible, but the AVX ISA makes this kind of approach difficult.
|
|
template<typename L>
|
|
simdjson_inline void compress(uint32_t mask, L * output) const {
|
|
using internal::thintable_epi8;
|
|
using internal::BitsSetTable256mul2;
|
|
using internal::pshufb_combine_table;
|
|
// this particular implementation was inspired by work done by @animetosho
|
|
// we do it in four steps, first 8 bytes and then second 8 bytes...
|
|
uint8_t mask1 = uint8_t(mask); // least significant 8 bits
|
|
uint8_t mask2 = uint8_t(mask >> 8); // second least significant 8 bits
|
|
uint8_t mask3 = uint8_t(mask >> 16); // ...
|
|
uint8_t mask4 = uint8_t(mask >> 24); // ...
|
|
// next line just loads the 64-bit values thintable_epi8[mask1] and
|
|
// thintable_epi8[mask2] into a 128-bit register, using only
|
|
// two instructions on most compilers.
|
|
__m256i shufmask = _mm256_set_epi64x(thintable_epi8[mask4], thintable_epi8[mask3],
|
|
thintable_epi8[mask2], thintable_epi8[mask1]);
|
|
// we increment by 0x08 the second half of the mask and so forth
|
|
shufmask =
|
|
_mm256_add_epi8(shufmask, _mm256_set_epi32(0x18181818, 0x18181818,
|
|
0x10101010, 0x10101010, 0x08080808, 0x08080808, 0, 0));
|
|
// this is the version "nearly pruned"
|
|
__m256i pruned = _mm256_shuffle_epi8(*this, shufmask);
|
|
// we still need to put the pieces back together.
|
|
// we compute the popcount of the first words:
|
|
int pop1 = BitsSetTable256mul2[mask1];
|
|
int pop3 = BitsSetTable256mul2[mask3];
|
|
|
|
// then load the corresponding mask
|
|
// could be done with _mm256_loadu2_m128i but many standard libraries omit this intrinsic.
|
|
__m256i v256 = _mm256_castsi128_si256(
|
|
_mm_loadu_si128(reinterpret_cast<const __m128i *>(pshufb_combine_table + pop1 * 8)));
|
|
__m256i compactmask = _mm256_insertf128_si256(v256,
|
|
_mm_loadu_si128(reinterpret_cast<const __m128i *>(pshufb_combine_table + pop3 * 8)), 1);
|
|
__m256i almostthere = _mm256_shuffle_epi8(pruned, compactmask);
|
|
// We just need to write out the result.
|
|
// This is the tricky bit that is hard to do
|
|
// if we want to return a SIMD register, since there
|
|
// is no single-instruction approach to recombine
|
|
// the two 128-bit lanes with an offset.
|
|
__m128i v128;
|
|
v128 = _mm256_castsi256_si128(almostthere);
|
|
_mm_storeu_si128( reinterpret_cast<__m128i *>(output), v128);
|
|
v128 = _mm256_extractf128_si256(almostthere, 1);
|
|
_mm_storeu_si128( reinterpret_cast<__m128i *>(output + 16 - count_ones(mask & 0xFFFF)), v128);
|
|
}
|
|
|
|
template<typename L>
|
|
simdjson_inline simd8<L> lookup_16(
|
|
L replace0, L replace1, L replace2, L replace3,
|
|
L replace4, L replace5, L replace6, L replace7,
|
|
L replace8, L replace9, L replace10, L replace11,
|
|
L replace12, L replace13, L replace14, L replace15) const {
|
|
return lookup_16(simd8<L>::repeat_16(
|
|
replace0, replace1, replace2, replace3,
|
|
replace4, replace5, replace6, replace7,
|
|
replace8, replace9, replace10, replace11,
|
|
replace12, replace13, replace14, replace15
|
|
));
|
|
}
|
|
};
|
|
|
|
// Signed bytes
|
|
template<>
|
|
struct simd8<int8_t> : base8_numeric<int8_t> {
|
|
simdjson_inline simd8() : base8_numeric<int8_t>() {}
|
|
simdjson_inline simd8(const __m256i _value) : base8_numeric<int8_t>(_value) {}
|
|
// Splat constructor
|
|
simdjson_inline simd8(int8_t _value) : simd8(splat(_value)) {}
|
|
// Array constructor
|
|
simdjson_inline simd8(const int8_t values[32]) : simd8(load(values)) {}
|
|
// Member-by-member initialization
|
|
simdjson_inline simd8(
|
|
int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
|
|
int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15,
|
|
int8_t v16, int8_t v17, int8_t v18, int8_t v19, int8_t v20, int8_t v21, int8_t v22, int8_t v23,
|
|
int8_t v24, int8_t v25, int8_t v26, int8_t v27, int8_t v28, int8_t v29, int8_t v30, int8_t v31
|
|
) : simd8(_mm256_setr_epi8(
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15,
|
|
v16,v17,v18,v19,v20,v21,v22,v23,
|
|
v24,v25,v26,v27,v28,v29,v30,v31
|
|
)) {}
|
|
// Repeat 16 values as many times as necessary (usually for lookup tables)
|
|
simdjson_inline static simd8<int8_t> repeat_16(
|
|
int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
|
|
int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
|
|
) {
|
|
return simd8<int8_t>(
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15,
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15
|
|
);
|
|
}
|
|
|
|
// Order-sensitive comparisons
|
|
simdjson_inline simd8<int8_t> max_val(const simd8<int8_t> other) const { return _mm256_max_epi8(*this, other); }
|
|
simdjson_inline simd8<int8_t> min_val(const simd8<int8_t> other) const { return _mm256_min_epi8(*this, other); }
|
|
simdjson_inline simd8<bool> operator>(const simd8<int8_t> other) const { return _mm256_cmpgt_epi8(*this, other); }
|
|
simdjson_inline simd8<bool> operator<(const simd8<int8_t> other) const { return _mm256_cmpgt_epi8(other, *this); }
|
|
};
|
|
|
|
// Unsigned bytes
|
|
template<>
|
|
struct simd8<uint8_t>: base8_numeric<uint8_t> {
|
|
simdjson_inline simd8() : base8_numeric<uint8_t>() {}
|
|
simdjson_inline simd8(const __m256i _value) : base8_numeric<uint8_t>(_value) {}
|
|
// Splat constructor
|
|
simdjson_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
|
|
// Array constructor
|
|
simdjson_inline simd8(const uint8_t values[32]) : simd8(load(values)) {}
|
|
// Member-by-member initialization
|
|
simdjson_inline simd8(
|
|
uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
|
|
uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15,
|
|
uint8_t v16, uint8_t v17, uint8_t v18, uint8_t v19, uint8_t v20, uint8_t v21, uint8_t v22, uint8_t v23,
|
|
uint8_t v24, uint8_t v25, uint8_t v26, uint8_t v27, uint8_t v28, uint8_t v29, uint8_t v30, uint8_t v31
|
|
) : simd8(_mm256_setr_epi8(
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15,
|
|
v16,v17,v18,v19,v20,v21,v22,v23,
|
|
v24,v25,v26,v27,v28,v29,v30,v31
|
|
)) {}
|
|
// Repeat 16 values as many times as necessary (usually for lookup tables)
|
|
simdjson_inline static simd8<uint8_t> repeat_16(
|
|
uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
|
|
uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
|
|
) {
|
|
return simd8<uint8_t>(
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15,
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15
|
|
);
|
|
}
|
|
|
|
// Saturated math
|
|
simdjson_inline simd8<uint8_t> saturating_add(const simd8<uint8_t> other) const { return _mm256_adds_epu8(*this, other); }
|
|
simdjson_inline simd8<uint8_t> saturating_sub(const simd8<uint8_t> other) const { return _mm256_subs_epu8(*this, other); }
|
|
|
|
// Order-specific operations
|
|
simdjson_inline simd8<uint8_t> max_val(const simd8<uint8_t> other) const { return _mm256_max_epu8(*this, other); }
|
|
simdjson_inline simd8<uint8_t> min_val(const simd8<uint8_t> other) const { return _mm256_min_epu8(other, *this); }
|
|
// Same as >, but only guarantees true is nonzero (< guarantees true = -1)
|
|
simdjson_inline simd8<uint8_t> gt_bits(const simd8<uint8_t> other) const { return this->saturating_sub(other); }
|
|
// Same as <, but only guarantees true is nonzero (< guarantees true = -1)
|
|
simdjson_inline simd8<uint8_t> lt_bits(const simd8<uint8_t> other) const { return other.saturating_sub(*this); }
|
|
simdjson_inline simd8<bool> operator<=(const simd8<uint8_t> other) const { return other.max_val(*this) == other; }
|
|
simdjson_inline simd8<bool> operator>=(const simd8<uint8_t> other) const { return other.min_val(*this) == other; }
|
|
simdjson_inline simd8<bool> operator>(const simd8<uint8_t> other) const { return this->gt_bits(other).any_bits_set(); }
|
|
simdjson_inline simd8<bool> operator<(const simd8<uint8_t> other) const { return this->lt_bits(other).any_bits_set(); }
|
|
|
|
// Bit-specific operations
|
|
simdjson_inline simd8<bool> bits_not_set() const { return *this == uint8_t(0); }
|
|
simdjson_inline simd8<bool> bits_not_set(simd8<uint8_t> bits) const { return (*this & bits).bits_not_set(); }
|
|
simdjson_inline simd8<bool> any_bits_set() const { return ~this->bits_not_set(); }
|
|
simdjson_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const { return ~this->bits_not_set(bits); }
|
|
simdjson_inline bool is_ascii() const { return _mm256_movemask_epi8(*this) == 0; }
|
|
simdjson_inline bool bits_not_set_anywhere() const { return _mm256_testz_si256(*this, *this); }
|
|
simdjson_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); }
|
|
simdjson_inline bool bits_not_set_anywhere(simd8<uint8_t> bits) const { return _mm256_testz_si256(*this, bits); }
|
|
simdjson_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const { return !bits_not_set_anywhere(bits); }
|
|
template<int N>
|
|
simdjson_inline simd8<uint8_t> shr() const { return simd8<uint8_t>(_mm256_srli_epi16(*this, N)) & uint8_t(0xFFu >> N); }
|
|
template<int N>
|
|
simdjson_inline simd8<uint8_t> shl() const { return simd8<uint8_t>(_mm256_slli_epi16(*this, N)) & uint8_t(0xFFu << N); }
|
|
// Get one of the bits and make a bitmask out of it.
|
|
// e.g. value.get_bit<7>() gets the high bit
|
|
template<int N>
|
|
simdjson_inline int get_bit() const { return _mm256_movemask_epi8(_mm256_slli_epi16(*this, 7-N)); }
|
|
};
|
|
|
|
template<typename T>
|
|
struct simd8x64 {
|
|
static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
|
|
static_assert(NUM_CHUNKS == 2, "Haswell kernel should use two registers per 64-byte block.");
|
|
const simd8<T> chunks[NUM_CHUNKS];
|
|
|
|
simd8x64(const simd8x64<T>& o) = delete; // no copy allowed
|
|
simd8x64<T>& operator=(const simd8<T>& other) = delete; // no assignment allowed
|
|
simd8x64() = delete; // no default constructor allowed
|
|
|
|
simdjson_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1) : chunks{chunk0, chunk1} {}
|
|
simdjson_inline simd8x64(const T ptr[64]) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+32)} {}
|
|
|
|
simdjson_inline uint64_t compress(uint64_t mask, T * output) const {
|
|
uint32_t mask1 = uint32_t(mask);
|
|
uint32_t mask2 = uint32_t(mask >> 32);
|
|
this->chunks[0].compress(mask1, output);
|
|
this->chunks[1].compress(mask2, output + 32 - count_ones(mask1));
|
|
return 64 - count_ones(mask);
|
|
}
|
|
|
|
simdjson_inline void store(T ptr[64]) const {
|
|
this->chunks[0].store(ptr+sizeof(simd8<T>)*0);
|
|
this->chunks[1].store(ptr+sizeof(simd8<T>)*1);
|
|
}
|
|
|
|
simdjson_inline uint64_t to_bitmask() const {
|
|
uint64_t r_lo = uint32_t(this->chunks[0].to_bitmask());
|
|
uint64_t r_hi = this->chunks[1].to_bitmask();
|
|
return r_lo | (r_hi << 32);
|
|
}
|
|
|
|
simdjson_inline simd8<T> reduce_or() const {
|
|
return this->chunks[0] | this->chunks[1];
|
|
}
|
|
|
|
simdjson_inline simd8x64<T> bit_or(const T m) const {
|
|
const simd8<T> mask = simd8<T>::splat(m);
|
|
return simd8x64<T>(
|
|
this->chunks[0] | mask,
|
|
this->chunks[1] | mask
|
|
);
|
|
}
|
|
|
|
simdjson_inline uint64_t eq(const T m) const {
|
|
const simd8<T> mask = simd8<T>::splat(m);
|
|
return simd8x64<bool>(
|
|
this->chunks[0] == mask,
|
|
this->chunks[1] == mask
|
|
).to_bitmask();
|
|
}
|
|
|
|
simdjson_inline uint64_t eq(const simd8x64<uint8_t> &other) const {
|
|
return simd8x64<bool>(
|
|
this->chunks[0] == other.chunks[0],
|
|
this->chunks[1] == other.chunks[1]
|
|
).to_bitmask();
|
|
}
|
|
|
|
simdjson_inline uint64_t lteq(const T m) const {
|
|
const simd8<T> mask = simd8<T>::splat(m);
|
|
return simd8x64<bool>(
|
|
this->chunks[0] <= mask,
|
|
this->chunks[1] <= mask
|
|
).to_bitmask();
|
|
}
|
|
}; // struct simd8x64<T>
|
|
|
|
} // namespace simd
|
|
|
|
} // unnamed namespace
|
|
} // namespace haswell
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_HASWELL_SIMD_H
|
|
/* end file simdjson/haswell/simd.h */
|
|
/* including simdjson/haswell/stringparsing_defs.h: #include "simdjson/haswell/stringparsing_defs.h" */
|
|
/* begin file simdjson/haswell/stringparsing_defs.h */
|
|
#ifndef SIMDJSON_HASWELL_STRINGPARSING_DEFS_H
|
|
#define SIMDJSON_HASWELL_STRINGPARSING_DEFS_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/haswell/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/haswell/simd.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/haswell/bitmanipulation.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace haswell {
|
|
namespace {
|
|
|
|
using namespace simd;
|
|
|
|
// Holds backslashes and quotes locations.
|
|
struct backslash_and_quote {
|
|
public:
|
|
static constexpr uint32_t BYTES_PROCESSED = 32;
|
|
simdjson_inline static backslash_and_quote copy_and_find(const uint8_t *src, uint8_t *dst);
|
|
|
|
simdjson_inline bool has_quote_first() { return ((bs_bits - 1) & quote_bits) != 0; }
|
|
simdjson_inline bool has_backslash() { return ((quote_bits - 1) & bs_bits) != 0; }
|
|
simdjson_inline int quote_index() { return trailing_zeroes(quote_bits); }
|
|
simdjson_inline int backslash_index() { return trailing_zeroes(bs_bits); }
|
|
|
|
uint32_t bs_bits;
|
|
uint32_t quote_bits;
|
|
}; // struct backslash_and_quote
|
|
|
|
// Copies BYTES_PROCESSED (32) bytes from src to dst and reports, as bitmasks,
// which of those bytes are backslashes and which are double quotes.
simdjson_inline backslash_and_quote backslash_and_quote::copy_and_find(const uint8_t *src, uint8_t *dst) {
  // The 32-byte load can read up to BYTES_PROCESSED - 1 bytes beyond the buffer
  // size, which is why SIMDJSON_PADDING bytes of padding are required.
  static_assert(SIMDJSON_PADDING >= (BYTES_PROCESSED - 1), "backslash and quote finder must process fewer than SIMDJSON_PADDING bytes");
  const simd8<uint8_t> chunk(src);
  // store to dest unconditionally - we can overwrite the bits we don't like later
  chunk.store(dst);
  const uint32_t backslashes = static_cast<uint32_t>((chunk == '\\').to_bitmask());
  const uint32_t quotes = static_cast<uint32_t>((chunk == '"').to_bitmask());
  return { backslashes, quotes }; // bs_bits, quote_bits
}
|
|
|
|
} // unnamed namespace
|
|
} // namespace haswell
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_HASWELL_STRINGPARSING_DEFS_H
|
|
/* end file simdjson/haswell/stringparsing_defs.h */
|
|
/* end file simdjson/haswell/begin.h */
|
|
/* including generic/amalgamated.h for haswell: #include <generic/amalgamated.h> */
|
|
/* begin file generic/amalgamated.h for haswell */
|
|
#if defined(SIMDJSON_CONDITIONAL_INCLUDE) && !defined(SIMDJSON_SRC_GENERIC_DEPENDENCIES_H)
|
|
#error generic/dependencies.h must be included before generic/amalgamated.h!
|
|
#endif
|
|
|
|
/* including generic/base.h for haswell: #include <generic/base.h> */
|
|
/* begin file generic/base.h for haswell */
|
|
#ifndef SIMDJSON_SRC_GENERIC_BASE_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_BASE_H */
|
|
/* amalgamation skipped (editor-only): #include <base.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/generic/base.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace haswell {
|
|
namespace {
|
|
|
|
struct json_character_block;
|
|
|
|
} // unnamed namespace
|
|
} // namespace haswell
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_BASE_H
|
|
/* end file generic/base.h for haswell */
|
|
/* including generic/dom_parser_implementation.h for haswell: #include <generic/dom_parser_implementation.h> */
|
|
/* begin file generic/dom_parser_implementation.h for haswell */
|
|
#ifndef SIMDJSON_SRC_GENERIC_DOM_PARSER_IMPLEMENTATION_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_DOM_PARSER_IMPLEMENTATION_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/generic/dom_parser_implementation.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
// Interface a dom parser implementation must fulfill
|
|
namespace simdjson {
|
|
namespace haswell {
|
|
namespace {
|
|
|
|
simdjson_inline simd8<uint8_t> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3);
|
|
simdjson_inline bool is_ascii(const simd8x64<uint8_t>& input);
|
|
|
|
} // unnamed namespace
|
|
} // namespace haswell
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_DOM_PARSER_IMPLEMENTATION_H
|
|
/* end file generic/dom_parser_implementation.h for haswell */
|
|
/* including generic/json_character_block.h for haswell: #include <generic/json_character_block.h> */
|
|
/* begin file generic/json_character_block.h for haswell */
|
|
#ifndef SIMDJSON_SRC_GENERIC_JSON_CHARACTER_BLOCK_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_JSON_CHARACTER_BLOCK_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/base.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace haswell {
|
|
namespace {
|
|
|
|
struct json_character_block {
|
|
static simdjson_inline json_character_block classify(const simd::simd8x64<uint8_t>& in);
|
|
|
|
simdjson_inline uint64_t whitespace() const noexcept { return _whitespace; }
|
|
simdjson_inline uint64_t op() const noexcept { return _op; }
|
|
simdjson_inline uint64_t scalar() const noexcept { return ~(op() | whitespace()); }
|
|
|
|
uint64_t _whitespace;
|
|
uint64_t _op;
|
|
};
|
|
|
|
} // unnamed namespace
|
|
} // namespace haswell
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_JSON_CHARACTER_BLOCK_H
|
|
/* end file generic/json_character_block.h for haswell */
|
|
/* end file generic/amalgamated.h for haswell */
|
|
/* including generic/stage1/amalgamated.h for haswell: #include <generic/stage1/amalgamated.h> */
|
|
/* begin file generic/stage1/amalgamated.h for haswell */
|
|
// Stuff other things depend on
|
|
/* including generic/stage1/base.h for haswell: #include <generic/stage1/base.h> */
|
|
/* begin file generic/stage1/base.h for haswell */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_BASE_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_BASE_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/base.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace haswell {
|
|
namespace {
|
|
namespace stage1 {
|
|
|
|
class bit_indexer;
|
|
template<size_t STEP_SIZE>
|
|
struct buf_block_reader;
|
|
struct json_block;
|
|
class json_minifier;
|
|
class json_scanner;
|
|
struct json_string_block;
|
|
class json_string_scanner;
|
|
class json_structural_indexer;
|
|
|
|
} // namespace stage1
|
|
|
|
namespace utf8_validation {
|
|
struct utf8_checker;
|
|
} // namespace utf8_validation
|
|
|
|
using utf8_validation::utf8_checker;
|
|
|
|
} // unnamed namespace
|
|
} // namespace haswell
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE1_BASE_H
|
|
/* end file generic/stage1/base.h for haswell */
|
|
/* including generic/stage1/buf_block_reader.h for haswell: #include <generic/stage1/buf_block_reader.h> */
|
|
/* begin file generic/stage1/buf_block_reader.h for haswell */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_BUF_BLOCK_READER_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_BUF_BLOCK_READER_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/base.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
#include <cstring>
|
|
|
|
namespace simdjson {
|
|
namespace haswell {
|
|
namespace {
|
|
namespace stage1 {
|
|
|
|
// Walks through a buffer in block-sized increments, loading the last part with spaces
|
|
template<size_t STEP_SIZE>
|
|
struct buf_block_reader {
|
|
public:
|
|
simdjson_inline buf_block_reader(const uint8_t *_buf, size_t _len);
|
|
simdjson_inline size_t block_index();
|
|
simdjson_inline bool has_full_block() const;
|
|
simdjson_inline const uint8_t *full_block() const;
|
|
/**
|
|
* Get the last block, padded with spaces.
|
|
*
|
|
* There will always be a last block, with at least 1 byte, unless len == 0 (in which case this
|
|
* function fills the buffer with spaces and returns 0. In particular, if len == STEP_SIZE there
|
|
* will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding.
|
|
*
|
|
* @return the number of effective characters in the last block.
|
|
*/
|
|
simdjson_inline size_t get_remainder(uint8_t *dst) const;
|
|
simdjson_inline void advance();
|
|
private:
|
|
const uint8_t *buf;
|
|
const size_t len;
|
|
const size_t lenminusstep;
|
|
size_t idx;
|
|
};
|
|
|
|
// Routines to print masks and text for debugging bitmask operations
|
|
simdjson_unused static char * format_input_text_64(const uint8_t *text) {
|
|
static char buf[sizeof(simd8x64<uint8_t>) + 1];
|
|
for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
|
|
buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]);
|
|
}
|
|
buf[sizeof(simd8x64<uint8_t>)] = '\0';
|
|
return buf;
|
|
}
|
|
|
|
// Routines to print masks and text for debugging bitmask operations
|
|
simdjson_unused static char * format_input_text(const simd8x64<uint8_t>& in) {
|
|
static char buf[sizeof(simd8x64<uint8_t>) + 1];
|
|
in.store(reinterpret_cast<uint8_t*>(buf));
|
|
for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
|
|
if (buf[i] < ' ') { buf[i] = '_'; }
|
|
}
|
|
buf[sizeof(simd8x64<uint8_t>)] = '\0';
|
|
return buf;
|
|
}
|
|
|
|
// Debugging helper: renders only the characters of a 64-byte block selected by mask.
// Selected characters at or below ' ' are shown as '_'; positions whose mask bit
// is 0 are shown as spaces.
// Returns a function-local static buffer that the next call overwrites.
simdjson_unused static char * format_input_text(const simd8x64<uint8_t>& in, uint64_t mask) {
  static char buf[sizeof(simd8x64<uint8_t>) + 1];
  in.store(reinterpret_cast<uint8_t*>(buf));
  for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
    if (buf[i] <= ' ') { buf[i] = '_'; }
    // Test the bit with a 64-bit one: shifting size_t(1) by up to 63 is
    // undefined wherever size_t is narrower than 64 bits.
    if (!(mask & (uint64_t(1) << i))) { buf[i] = ' '; }
  }
  buf[sizeof(simd8x64<uint8_t>)] = '\0';
  return buf;
}
|
|
|
|
// Debugging helper: renders a 64-bit mask as 64 characters, 'X' for a set bit and ' ' for a
// clear one, with bit 0 first. The result lives in a function-local static buffer, so every
// call overwrites the previous result.
simdjson_unused static char * format_mask(uint64_t mask) {
  // Size the buffer from the mask width itself, so it always matches the loop bound and the
  // terminator index below.
  constexpr size_t bits = 64;
  static char buf[bits + 1];
  for (size_t i = 0; i < bits; i++) {
    // Build the probe bit as uint64_t: shifting a size_t by i would be undefined for i >= 32
    // wherever size_t is only 32 bits wide.
    buf[i] = (mask & (uint64_t(1) << i)) ? 'X' : ' ';
  }
  buf[bits] = '\0';
  return buf;
}
|
|
|
|
template<size_t STEP_SIZE>
|
|
simdjson_inline buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {}
|
|
|
|
template<size_t STEP_SIZE>
|
|
simdjson_inline size_t buf_block_reader<STEP_SIZE>::block_index() { return idx; }
|
|
|
|
template<size_t STEP_SIZE>
|
|
simdjson_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const {
|
|
return idx < lenminusstep;
|
|
}
|
|
|
|
template<size_t STEP_SIZE>
|
|
simdjson_inline const uint8_t *buf_block_reader<STEP_SIZE>::full_block() const {
|
|
return &buf[idx];
|
|
}
|
|
|
|
template<size_t STEP_SIZE>
|
|
simdjson_inline size_t buf_block_reader<STEP_SIZE>::get_remainder(uint8_t *dst) const {
|
|
if(len == idx) { return 0; } // memcpy(dst, null, 0) will trigger an error with some sanitizers
|
|
std::memset(dst, 0x20, STEP_SIZE); // std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once.
|
|
std::memcpy(dst, buf + idx, len - idx);
|
|
return len - idx;
|
|
}
|
|
|
|
template<size_t STEP_SIZE>
|
|
simdjson_inline void buf_block_reader<STEP_SIZE>::advance() {
|
|
idx += STEP_SIZE;
|
|
}
|
|
|
|
} // namespace stage1
|
|
} // unnamed namespace
|
|
} // namespace haswell
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE1_BUF_BLOCK_READER_H
|
|
/* end file generic/stage1/buf_block_reader.h for haswell */
|
|
/* including generic/stage1/json_escape_scanner.h for haswell: #include <generic/stage1/json_escape_scanner.h> */
|
|
/* begin file generic/stage1/json_escape_scanner.h for haswell */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_JSON_ESCAPE_SCANNER_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_JSON_ESCAPE_SCANNER_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/buf_block_reader.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace haswell {
|
|
namespace {
|
|
namespace stage1 {
|
|
|
|
/**
|
|
* Scans for escape characters in JSON, taking care with multiple backslashes (\\n vs. \n).
|
|
*/
|
|
struct json_escape_scanner {
|
|
/** The actual escape characters (the backslashes themselves). */
|
|
uint64_t next_is_escaped = 0ULL;
|
|
|
|
struct escaped_and_escape {
|
|
/**
|
|
* Mask of escaped characters.
|
|
*
|
|
* ```
|
|
* \n \\n \\\n \\\\n \
|
|
* 0100100010100101000
|
|
* n \ \ n \ \
|
|
* ```
|
|
*/
|
|
uint64_t escaped;
|
|
/**
|
|
* Mask of escape characters.
|
|
*
|
|
* ```
|
|
* \n \\n \\\n \\\\n \
|
|
* 1001000101001010001
|
|
* \ \ \ \ \ \ \
|
|
* ```
|
|
*/
|
|
uint64_t escape;
|
|
};
|
|
|
|
/**
|
|
* Get a mask of both escape and escaped characters (the characters following a backslash).
|
|
*
|
|
* @param potential_escape A mask of the character that can escape others (but could be
|
|
* escaped itself). e.g. block.eq('\\')
|
|
*/
|
|
simdjson_really_inline escaped_and_escape next(uint64_t backslash) noexcept {
|
|
|
|
#if !SIMDJSON_SKIP_BACKSLASH_SHORT_CIRCUIT
|
|
if (!backslash) { return {next_escaped_without_backslashes(), 0}; }
|
|
#endif
|
|
|
|
// | | Mask (shows characters instead of 1's) | Depth | Instructions |
|
|
// |--------------------------------|----------------------------------------|-------|---------------------|
|
|
// | string | `\\n_\\\n___\\\n___\\\\___\\\\__\\\` | | |
|
|
// | | ` even odd even odd odd` | | |
|
|
// | potential_escape | ` \ \\\ \\\ \\\\ \\\\ \\\` | 1 | 1 (backslash & ~first_is_escaped)
|
|
// | escape_and_terminal_code | ` \n \ \n \ \n \ \ \ \ \ \` | 5 | 5 (next_escape_and_terminal_code())
|
|
// | escaped | `\ \ n \ n \ \ \ \ \ ` X | 6 | 7 (escape_and_terminal_code ^ (potential_escape | first_is_escaped))
|
|
// | escape | ` \ \ \ \ \ \ \ \ \ \` | 6 | 8 (escape_and_terminal_code & backslash)
|
|
// | first_is_escaped | `\ ` | 7 (*) | 9 (escape >> 63) ()
|
|
// (*) this is not needed until the next iteration
|
|
uint64_t escape_and_terminal_code = next_escape_and_terminal_code(backslash & ~this->next_is_escaped);
|
|
uint64_t escaped = escape_and_terminal_code ^ (backslash | this->next_is_escaped);
|
|
uint64_t escape = escape_and_terminal_code & backslash;
|
|
this->next_is_escaped = escape >> 63;
|
|
return {escaped, escape};
|
|
}
|
|
|
|
private:
|
|
static constexpr const uint64_t ODD_BITS = 0xAAAAAAAAAAAAAAAAULL;
|
|
|
|
simdjson_really_inline uint64_t next_escaped_without_backslashes() noexcept {
|
|
uint64_t escaped = this->next_is_escaped;
|
|
this->next_is_escaped = 0;
|
|
return escaped;
|
|
}
|
|
|
|
/**
|
|
* Returns a mask of the next escape characters (masking out escaped backslashes), along with
|
|
* any non-backslash escape codes.
|
|
*
|
|
* \n \\n \\\n \\\\n returns:
|
|
* \n \ \ \n \ \
|
|
* 11 100 1011 10100
|
|
*
|
|
* You are expected to mask out the first bit yourself if the previous block had a trailing
|
|
* escape.
|
|
*
|
|
* & the result with potential_escape to get just the escape characters.
|
|
* ^ the result with (potential_escape | first_is_escaped) to get escaped characters.
|
|
*/
|
|
static simdjson_really_inline uint64_t next_escape_and_terminal_code(uint64_t potential_escape) noexcept {
|
|
// If we were to just shift and mask out any odd bits, we'd actually get a *half* right answer:
|
|
// any even-aligned backslash runs would be correct! Odd-aligned backslash runs would be
|
|
// inverted (\\\ would be 010 instead of 101).
|
|
//
|
|
// ```
|
|
// string: | ____\\\\_\\\\_____ |
|
|
// maybe_escaped | ODD | \ \ \ \ |
|
|
// even-aligned ^^^ ^^^^ odd-aligned
|
|
// ```
|
|
//
|
|
// Taking that into account, our basic strategy is:
|
|
//
|
|
// 1. Use subtraction to produce a mask with 1's for even-aligned runs and 0's for
|
|
// odd-aligned runs.
|
|
// 2. XOR all odd bits, which masks out the odd bits in even-aligned runs, and brings IN the
|
|
// odd bits in odd-aligned runs.
|
|
// 3. & with backslash to clean up any stray bits.
|
|
// runs are set to 0, and then XORing with "odd":
|
|
//
|
|
// | | Mask (shows characters instead of 1's) | Instructions |
|
|
// |--------------------------------|----------------------------------------|---------------------|
|
|
// | string | `\\n_\\\n___\\\n___\\\\___\\\\__\\\` |
|
|
// | | ` even odd even odd odd` |
|
|
// | maybe_escaped | ` n \\n \\n \\\_ \\\_ \\` X | 1 (potential_escape << 1)
|
|
// | maybe_escaped_and_odd | ` \n_ \\n _ \\\n_ _ \\\__ _\\\_ \\\` | 1 (maybe_escaped | odd)
|
|
// | even_series_codes_and_odd | ` n_\\\ _ n_ _\\\\ _ _ ` | 1 (maybe_escaped_and_odd - potential_escape)
|
|
// | escape_and_terminal_code | ` \n \ \n \ \n \ \ \ \ \ \` | 1 (^ odd)
|
|
//
|
|
|
|
// Escaped characters are characters following an escape.
|
|
uint64_t maybe_escaped = potential_escape << 1;
|
|
|
|
// To distinguish odd from even escape sequences, therefore, we turn on any *starting*
|
|
// escapes that are on an odd byte. (We actually bring in all odd bits, for speed.)
|
|
// - Odd runs of backslashes are 0000, and the code at the end ("n" in \n or \\n) is 1.
|
|
// - Odd runs of backslashes are 1111, and the code at the end ("n" in \n or \\n) is 0.
|
|
// - All other odd bytes are 1, and even bytes are 0.
|
|
uint64_t maybe_escaped_and_odd_bits = maybe_escaped | ODD_BITS;
|
|
uint64_t even_series_codes_and_odd_bits = maybe_escaped_and_odd_bits - potential_escape;
|
|
|
|
// Now we flip all odd bytes back with xor. This:
|
|
// - Makes odd runs of backslashes go from 0000 to 1010
|
|
// - Makes even runs of backslashes go from 1111 to 1010
|
|
// - Sets actually-escaped codes to 1 (the n in \n and \\n: \n = 11, \\n = 100)
|
|
// - Resets all other bytes to 0
|
|
return even_series_codes_and_odd_bits ^ ODD_BITS;
|
|
}
|
|
};
|
|
|
|
} // namespace stage1
|
|
} // unnamed namespace
|
|
} // namespace haswell
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE1_JSON_ESCAPE_SCANNER_H
|
|
/* end file generic/stage1/json_escape_scanner.h for haswell */
|
|
/* including generic/stage1/json_string_scanner.h for haswell: #include <generic/stage1/json_string_scanner.h> */
|
|
/* begin file generic/stage1/json_string_scanner.h for haswell */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_JSON_STRING_SCANNER_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_JSON_STRING_SCANNER_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/json_escape_scanner.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace haswell {
|
|
namespace {
|
|
namespace stage1 {
|
|
|
|
struct json_string_block {
|
|
// We spell out the constructors in the hope of resolving inlining issues with Visual Studio 2017
|
|
simdjson_really_inline json_string_block(uint64_t escaped, uint64_t quote, uint64_t in_string) :
|
|
_escaped(escaped), _quote(quote), _in_string(in_string) {}
|
|
|
|
// Escaped characters (characters following an escape() character)
|
|
simdjson_really_inline uint64_t escaped() const { return _escaped; }
|
|
// Real (non-backslashed) quotes
|
|
simdjson_really_inline uint64_t quote() const { return _quote; }
|
|
// Only characters inside the string (not including the quotes)
|
|
simdjson_really_inline uint64_t string_content() const { return _in_string & ~_quote; }
|
|
// Return a mask of whether the given characters are inside a string (only works on non-quotes)
|
|
simdjson_really_inline uint64_t non_quote_inside_string(uint64_t mask) const { return mask & _in_string; }
|
|
// Return a mask of whether the given characters are inside a string (only works on non-quotes)
|
|
simdjson_really_inline uint64_t non_quote_outside_string(uint64_t mask) const { return mask & ~_in_string; }
|
|
// Tail of string (everything except the start quote)
|
|
simdjson_really_inline uint64_t string_tail() const { return _in_string ^ _quote; }
|
|
|
|
// escaped characters (backslashed--does not include the hex characters after \u)
|
|
uint64_t _escaped;
|
|
// real quotes (non-escaped ones)
|
|
uint64_t _quote;
|
|
// string characters (includes start quote but not end quote)
|
|
uint64_t _in_string;
|
|
};
|
|
|
|
// Scans blocks for string characters, storing the state necessary to do so
|
|
class json_string_scanner {
|
|
public:
|
|
simdjson_really_inline json_string_block next(const simd::simd8x64<uint8_t>& in);
|
|
// Returns either UNCLOSED_STRING or SUCCESS
|
|
simdjson_really_inline error_code finish();
|
|
|
|
private:
|
|
// Scans for escape characters
|
|
json_escape_scanner escape_scanner{};
|
|
// Whether the last iteration was still inside a string (all 1's = true, all 0's = false).
|
|
uint64_t prev_in_string = 0ULL;
|
|
};
|
|
|
|
//
|
|
// Return a mask of all string characters plus end quotes.
|
|
//
|
|
// prev_escaped is overflow saying whether the next character is escaped.
|
|
// prev_in_string is overflow saying whether we're still in a string.
|
|
//
|
|
// Backslash sequences outside of quotes will be detected in stage 2.
|
|
//
|
|
simdjson_really_inline json_string_block json_string_scanner::next(const simd::simd8x64<uint8_t>& in) {
|
|
const uint64_t backslash = in.eq('\\');
|
|
const uint64_t escaped = escape_scanner.next(backslash).escaped;
|
|
const uint64_t quote = in.eq('"') & ~escaped;
|
|
|
|
//
|
|
// prefix_xor flips on bits inside the string (and flips off the end quote).
|
|
//
|
|
// Then we xor with prev_in_string: if we were in a string already, its effect is flipped
|
|
// (characters inside strings are outside, and characters outside strings are inside).
|
|
//
|
|
const uint64_t in_string = prefix_xor(quote) ^ prev_in_string;
|
|
|
|
//
|
|
// Check if we're still in a string at the end of the box so the next block will know
|
|
//
|
|
prev_in_string = uint64_t(static_cast<int64_t>(in_string) >> 63);
|
|
|
|
// Use ^ to turn the beginning quote off, and the end quote on.
|
|
|
|
// We are returning a function-local object so either we get a move constructor
|
|
// or we get copy elision.
|
|
return json_string_block(escaped, quote, in_string);
|
|
}
|
|
|
|
// Reports UNCLOSED_STRING when the last scanned block ended inside a string, SUCCESS otherwise.
simdjson_really_inline error_code json_string_scanner::finish() {
  return prev_in_string ? UNCLOSED_STRING : SUCCESS;
}
|
|
|
|
} // namespace stage1
|
|
} // unnamed namespace
|
|
} // namespace haswell
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE1_JSON_STRING_SCANNER_H
|
|
/* end file generic/stage1/json_string_scanner.h for haswell */
|
|
/* including generic/stage1/utf8_lookup4_algorithm.h for haswell: #include <generic/stage1/utf8_lookup4_algorithm.h> */
|
|
/* begin file generic/stage1/utf8_lookup4_algorithm.h for haswell */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_UTF8_LOOKUP4_ALGORITHM_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_UTF8_LOOKUP4_ALGORITHM_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/dom_parser_implementation.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace haswell {
|
|
namespace {
|
|
namespace utf8_validation {
|
|
|
|
using namespace simd;
|
|
|
|
// Checks every (previous byte, current byte) pair for UTF-8 errors detectable from two
// consecutive bytes. Three 16-entry tables are looked up (high nibble of the previous byte,
// low nibble of the previous byte, high nibble of the current byte); each entry lists the
// error classes that nibble is compatible with. A bit that survives the AND of all three
// lookups identifies an error class for that pair.
simdjson_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
  // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
  // Bit 1 = Too Long (ASCII followed by continuation)
  // Bit 2 = Overlong 3-byte
  // Bit 3 = Too Large (second byte 1001____ or 101_____)
  // Bit 4 = Surrogate
  // Bit 5 = Overlong 2-byte
  // Bit 6 = Too Large (second byte 1000____), shared with Overlong 4-byte
  // Bit 7 = Two Continuations
  constexpr const uint8_t TOO_SHORT   = 1<<0; // 11______ 0_______
                                              // 11______ 11______
  constexpr const uint8_t TOO_LONG    = 1<<1; // 0_______ 10______
  constexpr const uint8_t OVERLONG_3  = 1<<2; // 11100000 100_____
  constexpr const uint8_t SURROGATE   = 1<<4; // 11101101 101_____
  constexpr const uint8_t OVERLONG_2  = 1<<5; // 1100000_ 10______
  constexpr const uint8_t TWO_CONTS   = 1<<7; // 10______ 10______
  constexpr const uint8_t TOO_LARGE   = 1<<3; // 11110100 1001____
                                              // 11110100 101_____
                                              // 11110101 1001____
                                              // 11110101 101_____
                                              // 1111011_ 1001____
                                              // 1111011_ 101_____
                                              // 11111___ 1001____
                                              // 11111___ 101_____
  constexpr const uint8_t TOO_LARGE_1000 = 1<<6;
                                              // 11110101 1000____
                                              // 1111011_ 1000____
                                              // 11111___ 1000____
  constexpr const uint8_t OVERLONG_4  = 1<<6; // 11110000 1000____

  // These all have ____ in byte 1, so the low-nibble table must let them through everywhere.
  constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS;
  // Low nibbles 0101 and above can belong to a too-large lead byte with any second byte.
  constexpr const uint8_t CARRY_TOO_LARGE_ANY = CARRY | TOO_LARGE | TOO_LARGE_1000;
  // Errors every continuation byte (10______) in byte 2 is compatible with.
  constexpr const uint8_t ANY_CONT = TOO_LONG | OVERLONG_2 | TWO_CONTS;

  const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
    // 0_______ ________ <ASCII in byte 1>
    TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
    TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
    // 10______ ________ <continuation in byte 1>
    TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
    // 1100____ ________ <two byte lead in byte 1>
    TOO_SHORT | OVERLONG_2,
    // 1101____ ________ <two byte lead in byte 1>
    TOO_SHORT,
    // 1110____ ________ <three byte lead in byte 1>
    TOO_SHORT | OVERLONG_3 | SURROGATE,
    // 1111____ ________ <four+ byte lead in byte 1>
    TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
  );

  const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
    // ____0000 ________
    CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
    // ____0001 ________
    CARRY | OVERLONG_2,
    // ____001_ ________
    CARRY,
    CARRY,

    // ____0100 ________
    CARRY | TOO_LARGE,
    // ____0101 ________
    CARRY_TOO_LARGE_ANY,
    // ____011_ ________
    CARRY_TOO_LARGE_ANY,
    CARRY_TOO_LARGE_ANY,

    // ____1___ ________
    CARRY_TOO_LARGE_ANY,
    CARRY_TOO_LARGE_ANY,
    CARRY_TOO_LARGE_ANY,
    CARRY_TOO_LARGE_ANY,
    CARRY_TOO_LARGE_ANY,
    // ____1101 ________
    CARRY_TOO_LARGE_ANY | SURROGATE,
    CARRY_TOO_LARGE_ANY,
    CARRY_TOO_LARGE_ANY
  );

  const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
    // ________ 0_______ <ASCII in byte 2>
    TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
    TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,

    // ________ 1000____
    ANY_CONT | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
    // ________ 1001____
    ANY_CONT | OVERLONG_3 | TOO_LARGE,
    // ________ 101_____
    ANY_CONT | SURROGATE | TOO_LARGE,
    ANY_CONT | SURROGATE | TOO_LARGE,

    // ________ 11______
    TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
  );

  return (byte_1_high & byte_1_low & byte_2_high);
}
|
|
// Combines the two-byte special-case result (sc) with the positions that must be the 2nd or
// 3rd continuation byte of a longer sequence. Only the 0x80 bit of the must-be-continuation
// mask is kept before the XOR with sc; any bit left set in the result is an error.
// NOTE(review): 0x80 is the bit check_special_cases calls TWO_CONTS - presumably the XOR
// cancels it where two continuations in a row are expected; confirm against
// must_be_2_3_continuation.
simdjson_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
    const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
  const simd8<uint8_t> prev2 = input.prev<2>(prev_input);
  const simd8<uint8_t> prev3 = input.prev<3>(prev_input);
  simd8<uint8_t> must_be_continuation = must_be_2_3_continuation(prev2, prev3);
  simd8<uint8_t> expected_cont_bit = must_be_continuation & uint8_t(0x80);
  return expected_cont_bit ^ sc;
}
|
|
|
|
//
|
|
// Return nonzero if there are incomplete multibyte characters at the end of the block:
|
|
// e.g. if there is a 4-byte character, but it's 3 bytes from the end.
|
|
//
|
|
simdjson_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input) {
|
|
// If the previous input's last 3 bytes match this, they're too short (they ended at EOF):
|
|
// ... 1111____ 111_____ 11______
|
|
#if SIMDJSON_IMPLEMENTATION_ICELAKE
|
|
static const uint8_t max_array[64] = {
|
|
255, 255, 255, 255, 255, 255, 255, 255,
|
|
255, 255, 255, 255, 255, 255, 255, 255,
|
|
255, 255, 255, 255, 255, 255, 255, 255,
|
|
255, 255, 255, 255, 255, 255, 255, 255,
|
|
255, 255, 255, 255, 255, 255, 255, 255,
|
|
255, 255, 255, 255, 255, 255, 255, 255,
|
|
255, 255, 255, 255, 255, 255, 255, 255,
|
|
255, 255, 255, 255, 255, 0xf0u-1, 0xe0u-1, 0xc0u-1
|
|
};
|
|
#else
|
|
static const uint8_t max_array[32] = {
|
|
255, 255, 255, 255, 255, 255, 255, 255,
|
|
255, 255, 255, 255, 255, 255, 255, 255,
|
|
255, 255, 255, 255, 255, 255, 255, 255,
|
|
255, 255, 255, 255, 255, 0xf0u-1, 0xe0u-1, 0xc0u-1
|
|
};
|
|
#endif
|
|
const simd8<uint8_t> max_value(&max_array[sizeof(max_array)-sizeof(simd8<uint8_t>)]);
|
|
return input.gt_bits(max_value);
|
|
}
|
|
|
|
struct utf8_checker {
|
|
// If this is nonzero, there has been a UTF-8 error.
|
|
simd8<uint8_t> error;
|
|
// The last input we received
|
|
simd8<uint8_t> prev_input_block;
|
|
// Whether the last input we received was incomplete (used for ASCII fast path)
|
|
simd8<uint8_t> prev_incomplete;
|
|
|
|
//
|
|
// Check whether the current bytes are valid UTF-8.
|
|
//
|
|
simdjson_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
|
|
// Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
|
|
// (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
|
|
simd8<uint8_t> prev1 = input.prev<1>(prev_input);
|
|
simd8<uint8_t> sc = check_special_cases(input, prev1);
|
|
this->error |= check_multibyte_lengths(input, prev_input, sc);
|
|
}
|
|
|
|
// The only problem that can happen at EOF is that a multibyte character is too short
|
|
// or a byte value too large in the last bytes: check_special_cases only checks for bytes
|
|
// too large in the first of two bytes.
|
|
simdjson_inline void check_eof() {
|
|
// If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
|
|
// possibly finish them.
|
|
this->error |= this->prev_incomplete;
|
|
}
|
|
|
|
simdjson_inline void check_next_input(const simd8x64<uint8_t>& input) {
|
|
if(simdjson_likely(is_ascii(input))) {
|
|
this->error |= this->prev_incomplete;
|
|
} else {
|
|
// you might think that a for-loop would work, but under Visual Studio, it is not good enough.
|
|
static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 1)
|
|
||(simd8x64<uint8_t>::NUM_CHUNKS == 2)
|
|
|| (simd8x64<uint8_t>::NUM_CHUNKS == 4),
|
|
"We support one, two or four chunks per 64-byte block.");
|
|
SIMDJSON_IF_CONSTEXPR (simd8x64<uint8_t>::NUM_CHUNKS == 1) {
|
|
this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
|
|
} else SIMDJSON_IF_CONSTEXPR (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
|
|
this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
|
|
this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
|
|
} else SIMDJSON_IF_CONSTEXPR (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
|
|
this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
|
|
this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
|
|
this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
|
|
this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
|
|
}
|
|
this->prev_incomplete = is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1]);
|
|
this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1];
|
|
}
|
|
}
|
|
// do not forget to call check_eof!
|
|
simdjson_inline error_code errors() {
|
|
return this->error.any_bits_set_anywhere() ? error_code::UTF8_ERROR : error_code::SUCCESS;
|
|
}
|
|
|
|
}; // struct utf8_checker
|
|
} // namespace utf8_validation
|
|
|
|
} // unnamed namespace
|
|
} // namespace haswell
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE1_UTF8_LOOKUP4_ALGORITHM_H
|
|
/* end file generic/stage1/utf8_lookup4_algorithm.h for haswell */
|
|
/* including generic/stage1/json_scanner.h for haswell: #include <generic/stage1/json_scanner.h> */
|
|
/* begin file generic/stage1/json_scanner.h for haswell */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_JSON_SCANNER_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_JSON_SCANNER_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/json_character_block.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/json_string_scanner.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace haswell {
|
|
namespace {
|
|
namespace stage1 {
|
|
|
|
/**
|
|
* A block of scanned json, with information on operators and scalars.
|
|
*
|
|
* We seek to identify pseudo-structural characters. Anything that is inside
|
|
* a string must be omitted (hence & ~_string.string_tail()).
|
|
* Otherwise, pseudo-structural characters come in two forms.
|
|
* 1. We have the structural characters ([,],{,},:, comma). The
|
|
* term 'structural character' is from the JSON RFC.
|
|
* 2. We have the 'scalar pseudo-structural characters'.
|
|
* Scalars are quotes, and any character except structural characters and white space.
|
|
*
|
|
* To identify the scalar pseudo-structural characters, we must look at what comes
|
|
* before them: it must be a space, a quote or a structural characters.
|
|
* Starting with simdjson v0.3, we identify them by
|
|
* negation: we identify everything that is followed by a non-quote scalar,
|
|
* and we negate that. Whatever remains must be a 'scalar pseudo-structural character'.
|
|
*/
|
|
struct json_block {
|
|
public:
|
|
// We spell out the constructors in the hope of resolving inlining issues with Visual Studio 2017
|
|
simdjson_inline json_block(json_string_block&& string, json_character_block characters, uint64_t follows_potential_nonquote_scalar) :
|
|
_string(std::move(string)), _characters(characters), _follows_potential_nonquote_scalar(follows_potential_nonquote_scalar) {}
|
|
simdjson_inline json_block(json_string_block string, json_character_block characters, uint64_t follows_potential_nonquote_scalar) :
|
|
_string(string), _characters(characters), _follows_potential_nonquote_scalar(follows_potential_nonquote_scalar) {}
|
|
|
|
/**
|
|
* The start of structurals.
|
|
* In simdjson prior to v0.3, these were called the pseudo-structural characters.
|
|
**/
|
|
simdjson_inline uint64_t structural_start() const noexcept { return potential_structural_start() & ~_string.string_tail(); }
|
|
/** All JSON whitespace (i.e. not in a string) */
|
|
simdjson_inline uint64_t whitespace() const noexcept { return non_quote_outside_string(_characters.whitespace()); }
|
|
|
|
// Helpers
|
|
|
|
/** Whether the given characters are inside a string (only works on non-quotes) */
|
|
simdjson_inline uint64_t non_quote_inside_string(uint64_t mask) const noexcept { return _string.non_quote_inside_string(mask); }
|
|
/** Whether the given characters are outside a string (only works on non-quotes) */
|
|
simdjson_inline uint64_t non_quote_outside_string(uint64_t mask) const noexcept { return _string.non_quote_outside_string(mask); }
|
|
|
|
// string and escape characters
|
|
json_string_block _string;
|
|
// whitespace, structural characters ('operators'), scalars
|
|
json_character_block _characters;
|
|
// whether the previous character was a scalar
|
|
uint64_t _follows_potential_nonquote_scalar;
|
|
private:
|
|
// Potential structurals (i.e. disregarding strings)
|
|
|
|
/**
|
|
* structural elements ([,],{,},:, comma) plus scalar starts like 123, true and "abc".
|
|
* They may reside inside a string.
|
|
**/
|
|
simdjson_inline uint64_t potential_structural_start() const noexcept { return _characters.op() | potential_scalar_start(); }
|
|
/**
|
|
* The start of non-operator runs, like 123, true and "abc".
|
|
* It main reside inside a string.
|
|
**/
|
|
simdjson_inline uint64_t potential_scalar_start() const noexcept {
|
|
// The term "scalar" refers to anything except structural characters and white space
|
|
// (so letters, numbers, quotes).
|
|
// Whenever it is preceded by something that is not a structural element ({,},[,],:, ") nor a white-space
|
|
// then we know that it is irrelevant structurally.
|
|
return _characters.scalar() & ~follows_potential_scalar();
|
|
}
|
|
/**
|
|
* Whether the given character is immediately after a non-operator like 123, true.
|
|
* The characters following a quote are not included.
|
|
*/
|
|
simdjson_inline uint64_t follows_potential_scalar() const noexcept {
|
|
// _follows_potential_nonquote_scalar: is defined as marking any character that follows a character
|
|
// that is not a structural element ({,},[,],:, comma) nor a quote (") and that is not a
|
|
// white space.
|
|
// It is understood that within quoted region, anything at all could be marked (irrelevant).
|
|
return _follows_potential_nonquote_scalar;
|
|
}
|
|
};
|
|
|
|
/**
|
|
* Scans JSON for important bits: structural characters or 'operators', strings, and scalars.
|
|
*
|
|
* The scanner starts by calculating two distinct things:
|
|
* - string characters (taking \" into account)
|
|
* - structural characters or 'operators' ([]{},:, comma)
|
|
* and scalars (runs of non-operators like 123, true and "abc")
|
|
*
|
|
* To minimize data dependency (a key component of the scanner's speed), it finds these in parallel:
|
|
* in particular, the operator/scalar bit will find plenty of things that are actually part of
|
|
* strings. When we're done, json_block will fuse the two together by masking out tokens that are
|
|
* part of a string.
|
|
*/
|
|
class json_scanner {
|
|
public:
|
|
json_scanner() = default;
|
|
simdjson_inline json_block next(const simd::simd8x64<uint8_t>& in);
|
|
// Returns either UNCLOSED_STRING or SUCCESS
|
|
simdjson_inline error_code finish();
|
|
|
|
private:
|
|
// Whether the last character of the previous iteration is part of a scalar token
|
|
// (anything except whitespace or a structural character/'operator').
|
|
uint64_t prev_scalar = 0ULL;
|
|
json_string_scanner string_scanner{};
|
|
};
|
|
|
|
|
|
//
|
|
// Check if the current character immediately follows a matching character.
|
|
//
|
|
// For example, this checks for quotes with backslashes in front of them:
|
|
//
|
|
// const uint64_t backslashed_quote = in.eq('"') & immediately_follows(in.eq('\'), prev_backslash);
|
|
//
|
|
simdjson_inline uint64_t follows(const uint64_t match, uint64_t &overflow) {
|
|
const uint64_t result = match << 1 | overflow;
|
|
overflow = match >> 63;
|
|
return result;
|
|
}
|
|
|
|
// Scans one block: string masks, character classes, and the positions that follow a
// non-quote scalar character. prev_scalar carries the last position over to the next block.
simdjson_inline json_block json_scanner::next(const simd::simd8x64<uint8_t>& in) {
  json_string_block strings = string_scanner.next(in);
  // identifies the white-space and the structural characters
  json_character_block characters = json_character_block::classify(in);
  // The term "scalar" refers to anything except structural characters and white space
  // (so letters, numbers, quotes).
  // We want follows_scalar to mark anything that follows a non-quote scalar (so letters and numbers).
  //
  // A terminal quote should either be followed by a structural character (comma, brace, bracket, colon)
  // or nothing. However, we still want ' "a string"true ' to mark the 't' of 'true' as a potential
  // pseudo-structural character just like we would if we had  ' "a string" true '; otherwise we
  // may need to add an extra check when parsing strings.
  //
  // Performance: there are many ways to skin this cat.
  const uint64_t scalar_not_quote = characters.scalar() & ~strings.quote();
  const uint64_t after_scalar_not_quote = follows(scalar_not_quote, prev_scalar);
  // strings and characters are function-local objects, so either they move or the copy is elided.
  return json_block(strings, characters, after_scalar_not_quote);
}
|
|
|
|
// Forwards to the string scanner, which is the only part that can report an end-of-input
// error here.
simdjson_inline error_code json_scanner::finish() {
  const error_code string_status = string_scanner.finish();
  return string_status;
}
|
|
|
|
} // namespace stage1
|
|
} // unnamed namespace
|
|
} // namespace haswell
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE1_JSON_SCANNER_H
|
|
/* end file generic/stage1/json_scanner.h for haswell */
|
|
|
|
// All other declarations
|
|
/* including generic/stage1/find_next_document_index.h for haswell: #include <generic/stage1/find_next_document_index.h> */
|
|
/* begin file generic/stage1/find_next_document_index.h for haswell */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_FIND_NEXT_DOCUMENT_INDEX_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_FIND_NEXT_DOCUMENT_INDEX_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/generic/dom_parser_implementation.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace haswell {
|
|
namespace {
|
|
namespace stage1 {
|
|
|
|
/**
|
|
* This algorithm is used to quickly identify the last structural position that
|
|
* makes up a complete document.
|
|
*
|
|
* It does this by going backwards and finding the last *document boundary* (a
|
|
* place where one value follows another without a comma between them). If the
|
|
* last document (the characters after the boundary) has an equal number of
|
|
* start and end brackets, it is considered complete.
|
|
*
|
|
* Simply put, we iterate over the structural characters, starting from
|
|
* the end. We consider that we found the end of a JSON document when the
|
|
* first element of the pair is NOT one of these characters: '{' '[' ':' ','
|
|
* and when the second element is NOT one of these characters: '}' ']' ':' ','.
|
|
*
|
|
* This simple comparison works most of the time, but it does not cover cases
|
|
* where the batch's structural indexes contain a perfect amount of documents.
|
|
* In such a case, we do not have access to the structural index which follows
|
|
* the last document, therefore, we do not have access to the second element in
|
|
* the pair, and that means we cannot identify the last document. To fix this
|
|
* issue, we keep a count of the open and closed curly/square braces we found
|
|
* while searching for the pair. When we find a pair AND the count of open and
|
|
* closed curly/square braces is the same, we know that we just passed a
|
|
* complete document, therefore the last json buffer location is the end of the
|
|
* batch.
|
|
*/
|
|
simdjson_inline uint32_t find_next_document_index(dom_parser_implementation &parser) {
|
|
// Variant: do not count separately, just figure out depth
|
|
if(parser.n_structural_indexes == 0) { return 0; }
|
|
auto arr_cnt = 0;
|
|
auto obj_cnt = 0;
|
|
for (auto i = parser.n_structural_indexes - 1; i > 0; i--) {
|
|
auto idxb = parser.structural_indexes[i];
|
|
switch (parser.buf[idxb]) {
|
|
case ':':
|
|
case ',':
|
|
continue;
|
|
case '}':
|
|
obj_cnt--;
|
|
continue;
|
|
case ']':
|
|
arr_cnt--;
|
|
continue;
|
|
case '{':
|
|
obj_cnt++;
|
|
break;
|
|
case '[':
|
|
arr_cnt++;
|
|
break;
|
|
}
|
|
auto idxa = parser.structural_indexes[i - 1];
|
|
switch (parser.buf[idxa]) {
|
|
case '{':
|
|
case '[':
|
|
case ':':
|
|
case ',':
|
|
continue;
|
|
}
|
|
// Last document is complete, so the next document will appear after!
|
|
if (!arr_cnt && !obj_cnt) {
|
|
return parser.n_structural_indexes;
|
|
}
|
|
// Last document is incomplete; mark the document at i + 1 as the next one
|
|
return i;
|
|
}
|
|
// If we made it to the end, we want to finish counting to see if we have a full document.
|
|
switch (parser.buf[parser.structural_indexes[0]]) {
|
|
case '}':
|
|
obj_cnt--;
|
|
break;
|
|
case ']':
|
|
arr_cnt--;
|
|
break;
|
|
case '{':
|
|
obj_cnt++;
|
|
break;
|
|
case '[':
|
|
arr_cnt++;
|
|
break;
|
|
}
|
|
if (!arr_cnt && !obj_cnt) {
|
|
// We have a complete document.
|
|
return parser.n_structural_indexes;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
} // namespace stage1
|
|
} // unnamed namespace
|
|
} // namespace haswell
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE1_FIND_NEXT_DOCUMENT_INDEX_H
|
|
/* end file generic/stage1/find_next_document_index.h for haswell */
|
|
/* including generic/stage1/json_minifier.h for haswell: #include <generic/stage1/json_minifier.h> */
|
|
/* begin file generic/stage1/json_minifier.h for haswell */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_JSON_MINIFIER_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_JSON_MINIFIER_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/json_scanner.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/buf_block_reader.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
// This file contains the common code every implementation uses in stage1
|
|
// It is intended to be included multiple times and compiled multiple times
|
|
// We assume the file in which it is included already includes
// "simdjson/stage1.h" (this simplifies amalgamation)
|
|
|
|
namespace simdjson {
|
|
namespace haswell {
|
|
namespace {
|
|
namespace stage1 {
|
|
|
|
class json_minifier {
|
|
public:
|
|
template<size_t STEP_SIZE>
|
|
static error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) noexcept;
|
|
|
|
private:
|
|
simdjson_inline json_minifier(uint8_t *_dst)
|
|
: dst{_dst}
|
|
{}
|
|
template<size_t STEP_SIZE>
|
|
simdjson_inline void step(const uint8_t *block_buf, buf_block_reader<STEP_SIZE> &reader) noexcept;
|
|
simdjson_inline void next(const simd::simd8x64<uint8_t>& in, const json_block& block);
|
|
simdjson_inline error_code finish(uint8_t *dst_start, size_t &dst_len);
|
|
json_scanner scanner{};
|
|
uint8_t *dst;
|
|
};
|
|
|
|
// Emits one 64-byte input: compress() is given the block's whitespace mask and
// the current output position, and dst moves forward by the count it returns.
simdjson_inline void json_minifier::next(const simd::simd8x64<uint8_t>& in, const json_block& block) {
  const uint64_t whitespace_mask = block.whitespace();
  const auto written = in.compress(whitespace_mask, dst);
  dst += written;
}
|
|
|
|
// Finalizes the scan. On success dst_len receives the distance between
// dst_start and the current write cursor; on a scanner error dst_len is
// zeroed and that error is returned.
simdjson_inline error_code json_minifier::finish(uint8_t *dst_start, size_t &dst_len) {
  const error_code status = scanner.finish();
  if (!status) {
    dst_len = dst - dst_start;
    return SUCCESS;
  }
  dst_len = 0;
  return status;
}
|
|
|
|
template<>
|
|
simdjson_inline void json_minifier::step<128>(const uint8_t *block_buf, buf_block_reader<128> &reader) noexcept {
|
|
simd::simd8x64<uint8_t> in_1(block_buf);
|
|
simd::simd8x64<uint8_t> in_2(block_buf+64);
|
|
json_block block_1 = scanner.next(in_1);
|
|
json_block block_2 = scanner.next(in_2);
|
|
this->next(in_1, block_1);
|
|
this->next(in_2, block_2);
|
|
reader.advance();
|
|
}
|
|
|
|
template<>
|
|
simdjson_inline void json_minifier::step<64>(const uint8_t *block_buf, buf_block_reader<64> &reader) noexcept {
|
|
simd::simd8x64<uint8_t> in_1(block_buf);
|
|
json_block block_1 = scanner.next(in_1);
|
|
this->next(block_buf, block_1);
|
|
reader.advance();
|
|
}
|
|
|
|
template<size_t STEP_SIZE>
|
|
error_code json_minifier::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) noexcept {
|
|
buf_block_reader<STEP_SIZE> reader(buf, len);
|
|
json_minifier minifier(dst);
|
|
|
|
// Index the first n-1 blocks
|
|
while (reader.has_full_block()) {
|
|
minifier.step<STEP_SIZE>(reader.full_block(), reader);
|
|
}
|
|
|
|
// Index the last (remainder) block, padded with spaces
|
|
uint8_t block[STEP_SIZE];
|
|
size_t remaining_bytes = reader.get_remainder(block);
|
|
if (remaining_bytes > 0) {
|
|
// We do not want to write directly to the output stream. Rather, we write
|
|
// to a local buffer (for safety).
|
|
uint8_t out_block[STEP_SIZE];
|
|
uint8_t * const guarded_dst{minifier.dst};
|
|
minifier.dst = out_block;
|
|
minifier.step<STEP_SIZE>(block, reader);
|
|
size_t to_write = minifier.dst - out_block;
|
|
// In some cases, we could be enticed to consider the padded spaces
|
|
// as part of the string. This is fine as long as we do not write more
|
|
// than we consumed.
|
|
if(to_write > remaining_bytes) { to_write = remaining_bytes; }
|
|
memcpy(guarded_dst, out_block, to_write);
|
|
minifier.dst = guarded_dst + to_write;
|
|
}
|
|
return minifier.finish(dst, dst_len);
|
|
}
|
|
|
|
} // namespace stage1
|
|
} // unnamed namespace
|
|
} // namespace haswell
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE1_JSON_MINIFIER_H
|
|
/* end file generic/stage1/json_minifier.h for haswell */
|
|
/* including generic/stage1/json_structural_indexer.h for haswell: #include <generic/stage1/json_structural_indexer.h> */
|
|
/* begin file generic/stage1/json_structural_indexer.h for haswell */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_JSON_STRUCTURAL_INDEXER_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_JSON_STRUCTURAL_INDEXER_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/utf8_lookup4_algorithm.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/buf_block_reader.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/json_string_scanner.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/json_scanner.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/json_minifier.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/find_next_document_index.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
// This file contains the common code every implementation uses in stage1
|
|
// It is intended to be included multiple times and compiled multiple times
|
|
// We assume the file in which it is included already includes
// "simdjson/stage1.h" (this simplifies amalgamation)
|
|
|
|
namespace simdjson {
|
|
namespace haswell {
|
|
namespace {
|
|
namespace stage1 {
|
|
|
|
class bit_indexer {
|
|
public:
|
|
uint32_t *tail;
|
|
|
|
simdjson_inline bit_indexer(uint32_t *index_buf) : tail(index_buf) {}
|
|
|
|
#if SIMDJSON_PREFER_REVERSE_BITS
|
|
/**
|
|
* ARM lacks a fast trailing zero instruction, but it has a fast
|
|
* bit reversal instruction and a fast leading zero instruction.
|
|
* Thus it may be profitable to reverse the bits (once) and then
|
|
* to rely on a sequence of instructions that call the leading
|
|
* zero instruction.
|
|
*
|
|
* Performance notes:
|
|
* The chosen routine is not optimal in terms of data dependency
|
|
* since zero_leading_bit might require two instructions. However,
|
|
* it tends to minimize the total number of instructions which is
|
|
* beneficial.
|
|
*/
|
|
simdjson_inline void write_index(uint32_t idx, uint64_t& rev_bits, int i) {
|
|
int lz = leading_zeroes(rev_bits);
|
|
this->tail[i] = static_cast<uint32_t>(idx) + lz;
|
|
rev_bits = zero_leading_bit(rev_bits, lz);
|
|
}
|
|
#else
|
|
/**
|
|
* Under recent x64 systems, we often have both a fast trailing zero
|
|
* instruction and a fast 'clear-lower-bit' instruction so the following
|
|
* algorithm can be competitive.
|
|
*/
|
|
|
|
simdjson_inline void write_index(uint32_t idx, uint64_t& bits, int i) {
|
|
this->tail[i] = idx + trailing_zeroes(bits);
|
|
bits = clear_lowest_bit(bits);
|
|
}
|
|
#endif // SIMDJSON_PREFER_REVERSE_BITS
|
|
|
|
template <int START, int N>
|
|
simdjson_inline int write_indexes(uint32_t idx, uint64_t& bits) {
|
|
write_index(idx, bits, START);
|
|
SIMDJSON_IF_CONSTEXPR (N > 1) {
|
|
write_indexes<(N-1>0?START+1:START), (N-1>=0?N-1:1)>(idx, bits);
|
|
}
|
|
return START+N;
|
|
}
|
|
|
|
template <int START, int END, int STEP>
|
|
simdjson_inline int write_indexes_stepped(uint32_t idx, uint64_t& bits, int cnt) {
|
|
write_indexes<START, STEP>(idx, bits);
|
|
SIMDJSON_IF_CONSTEXPR ((START+STEP) < END) {
|
|
if (simdjson_unlikely((START+STEP) < cnt)) {
|
|
write_indexes_stepped<(START+STEP<END?START+STEP:END), END, STEP>(idx, bits, cnt);
|
|
}
|
|
}
|
|
return ((END-START) % STEP) == 0 ? END : (END-START) - ((END-START) % STEP) + STEP;
|
|
}
|
|
|
|
// flatten out values in 'bits' assuming that they are are to have values of idx
|
|
// plus their position in the bitvector, and store these indexes at
|
|
// base_ptr[base] incrementing base as we go
|
|
// will potentially store extra values beyond end of valid bits, so base_ptr
|
|
// needs to be large enough to handle this
|
|
//
|
|
// If the kernel sets SIMDJSON_GENERIC_JSON_STRUCTURAL_INDEXER_CUSTOM_BIT_INDEXER, then it
|
|
// will provide its own version of the code.
|
|
#ifdef SIMDJSON_GENERIC_JSON_STRUCTURAL_INDEXER_CUSTOM_BIT_INDEXER
|
|
simdjson_inline void write(uint32_t idx, uint64_t bits);
|
|
#else
|
|
simdjson_inline void write(uint32_t idx, uint64_t bits) {
|
|
// In some instances, the next branch is expensive because it is mispredicted.
|
|
// Unfortunately, in other cases,
|
|
// it helps tremendously.
|
|
if (bits == 0)
|
|
return;
|
|
|
|
int cnt = static_cast<int>(count_ones(bits));
|
|
|
|
#if SIMDJSON_PREFER_REVERSE_BITS
|
|
bits = reverse_bits(bits);
|
|
#endif
|
|
#ifdef SIMDJSON_STRUCTURAL_INDEXER_STEP
|
|
static constexpr const int STEP = SIMDJSON_STRUCTURAL_INDEXER_STEP;
|
|
#else
|
|
static constexpr const int STEP = 4;
|
|
#endif
|
|
static constexpr const int STEP_UNTIL = 24;
|
|
|
|
write_indexes_stepped<0, STEP_UNTIL, STEP>(idx, bits, cnt);
|
|
SIMDJSON_IF_CONSTEXPR (STEP_UNTIL < 64) {
|
|
if (simdjson_unlikely(STEP_UNTIL < cnt)) {
|
|
for (int i=STEP_UNTIL; i<cnt; i++) {
|
|
write_index(idx, bits, i);
|
|
}
|
|
}
|
|
}
|
|
|
|
this->tail += cnt;
|
|
}
|
|
#endif // SIMDJSON_GENERIC_JSON_STRUCTURAL_INDEXER_CUSTOM_BIT_INDEXER
|
|
|
|
};
|
|
|
|
class json_structural_indexer {
|
|
public:
|
|
/**
|
|
* Find the important bits of JSON in a 128-byte chunk, and add them to structural_indexes.
|
|
*
|
|
* @param partial Setting the partial parameter to true allows the find_structural_bits to
|
|
* tolerate unclosed strings. The caller should still ensure that the input is valid UTF-8. If
|
|
* you are processing substrings, you may want to call on a function like trimmed_length_safe_utf8.
|
|
*/
|
|
template<size_t STEP_SIZE>
|
|
static error_code index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, stage1_mode partial) noexcept;
|
|
|
|
private:
|
|
simdjson_inline json_structural_indexer(uint32_t *structural_indexes);
|
|
template<size_t STEP_SIZE>
|
|
simdjson_inline void step(const uint8_t *block, buf_block_reader<STEP_SIZE> &reader) noexcept;
|
|
simdjson_inline void next(const simd::simd8x64<uint8_t>& in, const json_block& block, size_t idx);
|
|
simdjson_inline error_code finish(dom_parser_implementation &parser, size_t idx, size_t len, stage1_mode partial);
|
|
|
|
json_scanner scanner{};
|
|
utf8_checker checker{};
|
|
bit_indexer indexer;
|
|
uint64_t prev_structurals = 0;
|
|
uint64_t unescaped_chars_error = 0;
|
|
};
|
|
|
|
// Points the embedded bit_indexer at the start of the output index array; all
// other members keep their in-class defaults.
simdjson_inline json_structural_indexer::json_structural_indexer(uint32_t *structural_indexes)
  : indexer(structural_indexes) {}
|
|
|
|
// Returns the length of buf[0..len) after dropping a trailing UTF-8 character
// that has been cut short, or len unchanged if the tail looks complete.
//
// Only the last three bytes (fewer if len < 3) are inspected, from the end
// backwards. A byte `back` positions from the end starts a truncated sequence
// when it is a lead byte that needs more than `back` bytes:
//   back == 1: >= 0xc0  (2-, 3- and 4-byte characters with only 1 byte present)
//   back == 2: >= 0xe0  (3- and 4-byte characters with only 2 bytes present)
//   back == 3: >= 0xf0  (4-byte characters with only 3 bytes present)
// The first match wins and the result is the offset of that lead byte.
// This is not a UTF-8 validator: bytes before the tail are never examined.
simdjson_inline size_t trim_partial_utf8(const uint8_t *buf, size_t len) {
  const uint8_t min_lead[3] = {0xc0, 0xe0, 0xf0};
  // Never look further back than the buffer goes; len == 0 skips the loop.
  const size_t lookback = (len < 3) ? len : 3;
  for (size_t back = 1; back <= lookback; back++) {
    if (buf[len - back] >= min_lead[back - 1]) { return len - back; }
  }
  return len;
}
|
|
|
|
//
|
|
// PERF NOTES:
|
|
// We pipe 2 inputs through these stages:
|
|
// 1. Load JSON into registers. This takes a long time and is highly parallelizable, so we load
|
|
// 2 inputs' worth at once so that by the time step 2 is looking for the input, it's available.
|
|
// 2. Scan the JSON for critical data: strings, scalars and operators. This is the critical path.
|
|
// The output of step 1 depends entirely on this information. These functions don't quite use
|
|
// up enough CPU: the second half of the functions is highly serial, only using 1 execution core
|
|
// at a time. The second input's scans has some dependency on the first ones finishing it, but
|
|
// they can make a lot of progress before they need that information.
|
|
// 3. Step 1 does not use enough capacity, so we run some extra stuff while we're waiting for that
|
|
// to finish: utf-8 checks and generating the output from the last iteration.
|
|
//
|
|
// The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all
|
|
// available capacity with just one input. Running 2 at a time seems to give the CPU a good enough
|
|
// workout.
|
|
//
|
|
template<size_t STEP_SIZE>
|
|
error_code json_structural_indexer::index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, stage1_mode partial) noexcept {
|
|
if (simdjson_unlikely(len > parser.capacity())) { return CAPACITY; }
|
|
// We guard the rest of the code so that we can assume that len > 0 throughout.
|
|
if (len == 0) { return EMPTY; }
|
|
if (is_streaming(partial)) {
|
|
len = trim_partial_utf8(buf, len);
|
|
// If you end up with an empty window after trimming
|
|
// the partial UTF-8 bytes, then chances are good that you
|
|
// have an UTF-8 formatting error.
|
|
if(len == 0) { return UTF8_ERROR; }
|
|
}
|
|
buf_block_reader<STEP_SIZE> reader(buf, len);
|
|
json_structural_indexer indexer(parser.structural_indexes.get());
|
|
|
|
// Read all but the last block
|
|
while (reader.has_full_block()) {
|
|
indexer.step<STEP_SIZE>(reader.full_block(), reader);
|
|
}
|
|
// Take care of the last block (will always be there unless file is empty which is
|
|
// not supposed to happen.)
|
|
uint8_t block[STEP_SIZE];
|
|
if (simdjson_unlikely(reader.get_remainder(block) == 0)) { return UNEXPECTED_ERROR; }
|
|
indexer.step<STEP_SIZE>(block, reader);
|
|
return indexer.finish(parser, reader.block_index(), len, partial);
|
|
}
|
|
|
|
template<>
|
|
simdjson_inline void json_structural_indexer::step<128>(const uint8_t *block, buf_block_reader<128> &reader) noexcept {
|
|
simd::simd8x64<uint8_t> in_1(block);
|
|
simd::simd8x64<uint8_t> in_2(block+64);
|
|
json_block block_1 = scanner.next(in_1);
|
|
json_block block_2 = scanner.next(in_2);
|
|
this->next(in_1, block_1, reader.block_index());
|
|
this->next(in_2, block_2, reader.block_index()+64);
|
|
reader.advance();
|
|
}
|
|
|
|
template<>
|
|
simdjson_inline void json_structural_indexer::step<64>(const uint8_t *block, buf_block_reader<64> &reader) noexcept {
|
|
simd::simd8x64<uint8_t> in_1(block);
|
|
json_block block_1 = scanner.next(in_1);
|
|
this->next(in_1, block_1, reader.block_index());
|
|
reader.advance();
|
|
}
|
|
|
|
// Processes one 64-byte input located at offset idx. Output runs one input
// behind: the structurals saved from the previous call are written now (hence
// idx-64), and this input's structurals are saved for the next call.
simdjson_inline void json_structural_indexer::next(const simd::simd8x64<uint8_t>& in, const json_block& block, size_t idx) {
  // Mask of bytes <= 0x1F in this input.
  const uint64_t control_chars = in.lteq(0x1F);
#if SIMDJSON_UTF8VALIDATION
  checker.check_next_input(in);
#endif
  // Output *last* iteration's structurals to the parser.
  const uint32_t previous_offset = uint32_t(idx-64);
  indexer.write(previous_offset, prev_structurals);
  prev_structurals = block.structural_start();
  unescaped_chars_error |= block.non_quote_inside_string(control_chars);
}
|
|
|
|
// Completes stage 1: writes the last pending structurals, checks the scanner
// result, publishes n_structural_indexes plus three trailing entries, applies
// the streaming-specific truncation, and finally returns the UTF-8 checker's
// verdict.
//
// idx is the reader offset after the last block, len the (possibly trimmed)
// input length, partial the stage 1 mode.
simdjson_inline error_code json_structural_indexer::finish(dom_parser_implementation &parser, size_t idx, size_t len, stage1_mode partial) {
  // Write out the final iteration's structurals.
  indexer.write(uint32_t(idx-64), prev_structurals);

  const error_code scan_status = scanner.finish();
  const bool streaming = is_streaming(partial);
  const bool have_unclosed_string = (scan_status == UNCLOSED_STRING);
  // Outside streaming only SUCCESS is acceptable; when streaming, an
  // UNCLOSED_STRING is tolerated as well.
  const bool acceptable = (scan_status == SUCCESS) || (streaming && have_unclosed_string);
  if (simdjson_unlikely(!acceptable)) { return scan_status; }

  if (unescaped_chars_error) {
    return UNESCAPED_CHARS;
  }

  uint32_t * const slots = parser.structural_indexes.get();
  const uint32_t end_of_input = uint32_t(len);
  parser.n_structural_indexes = uint32_t(indexer.tail - slots);

  /***
   * The On Demand API requires special padding
   * (see https://github.com/simdjson/simdjson/issues/906).
   *
   * If parsing runs past the last (valid) structural character it must stop
   * quickly. Only three structural characters can be repeated without
   * triggering an error in JSON: [,] and }. The padding character (at 'len')
   * is repeated; it is unknown, but for parsing to continue it must be [,] or }.
   * Suppose it is ] or }: backtracking to the first character (the trailing 0
   * entry), nothing there avoids an error, because a document cannot start
   * with ] or }, nor can a comma, a colon or a simple value follow. So the
   * repeated character would have to be [, which means the document starts
   * with [ and therefore should end with ]; enforcing that rule yields ][[,
   * which is invalid.
   *
   * This is illustrated with the test array_iterate_unclosed_error() on the
   * following input: R"({ "a": [,,)"
   **/
  slots[parser.n_structural_indexes] = end_of_input; // used later in partial == stage1_mode::streaming_final
  slots[parser.n_structural_indexes + 1] = end_of_input;
  slots[parser.n_structural_indexes + 2] = 0;
  parser.next_structural_index = 0;

  // A valid JSON file cannot have zero structural indexes - something should
  // have been found.
  if (simdjson_unlikely(parser.n_structural_indexes == 0u)) {
    return EMPTY;
  }
  if (simdjson_unlikely(slots[parser.n_structural_indexes - 1] > len)) {
    return UNEXPECTED_ERROR;
  }

  if (partial == stage1_mode::streaming_partial) {
    // With an unclosed string the last structural is its opening quote, which
    // must be omitted.
    if (have_unclosed_string) {
      parser.n_structural_indexes--;
      if (simdjson_unlikely(parser.n_structural_indexes == 0u)) { return CAPACITY; }
    }
    // Truncate the input to the end of the last complete document (or zero).
    const auto next_document = find_next_document_index(parser);
    if (next_document == 0 && parser.n_structural_indexes > 0) {
      if (slots[0] == 0) {
        // The buffer is partial and started at index 0, yet the document is
        // incomplete: it is too big to parse.
        return CAPACITY;
      }
      // The document may still be parseable; there was just a lot of white
      // space in front of it.
      parser.n_structural_indexes = 0;
      return EMPTY;
    }
    parser.n_structural_indexes = next_document;
  } else if (partial == stage1_mode::streaming_final) {
    if (have_unclosed_string) { parser.n_structural_indexes--; }
    // Truncate the input to the end of the last complete document (or zero).
    // In this mode trailing garbage may be silently ignored. That is
    // deliberate: many people who have streams of JSON documents truncate them
    // for processing (e.g. data being uncompressed, or received in chunks from
    // the network) and may not know exactly where the last document ends.
    // document_stream instances let them find out which JSON documents were
    // parsed (see the iterator.source() method).
    const auto kept = find_next_document_index(parser);
    parser.n_structural_indexes = kept;
    // Preserve the entry at the truncation point so that the client can see
    // whether truncation happened: if nothing was truncated this copies `len`,
    // otherwise it copies some prior index.
    slots[kept + 1] = slots[kept];
    // This next line is critical, do not change it unless you understand what
    // you are doing.
    slots[kept] = end_of_input;
    if (simdjson_unlikely(kept == 0u)) {
      // An unclosed string at the very end of the stream is tolerated: users
      // often load their data in bulk without being careful and want the
      // trailing garbage ignored.
      return EMPTY;
    }
  }

  checker.check_eof();
  return checker.errors();
}
|
|
|
|
} // namespace stage1
|
|
} // unnamed namespace
|
|
} // namespace haswell
|
|
} // namespace simdjson
|
|
|
|
// Clear CUSTOM_BIT_INDEXER so other implementations can set it if they need to.
|
|
#undef SIMDJSON_GENERIC_JSON_STRUCTURAL_INDEXER_CUSTOM_BIT_INDEXER
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE1_JSON_STRUCTURAL_INDEXER_H
|
|
/* end file generic/stage1/json_structural_indexer.h for haswell */
|
|
/* including generic/stage1/utf8_validator.h for haswell: #include <generic/stage1/utf8_validator.h> */
|
|
/* begin file generic/stage1/utf8_validator.h for haswell */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_UTF8_VALIDATOR_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_UTF8_VALIDATOR_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/buf_block_reader.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/utf8_lookup4_algorithm.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace haswell {
|
|
namespace {
|
|
namespace stage1 {
|
|
|
|
/**
|
|
* Validates that the string is actual UTF-8.
|
|
*/
|
|
template<class checker>
|
|
bool generic_validate_utf8(const uint8_t * input, size_t length) {
|
|
checker c{};
|
|
buf_block_reader<64> reader(input, length);
|
|
while (reader.has_full_block()) {
|
|
simd::simd8x64<uint8_t> in(reader.full_block());
|
|
c.check_next_input(in);
|
|
reader.advance();
|
|
}
|
|
uint8_t block[64]{};
|
|
reader.get_remainder(block);
|
|
simd::simd8x64<uint8_t> in(block);
|
|
c.check_next_input(in);
|
|
reader.advance();
|
|
c.check_eof();
|
|
return c.errors() == error_code::SUCCESS;
|
|
}
|
|
|
|
bool generic_validate_utf8(const char * input, size_t length) {
|
|
return generic_validate_utf8<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
|
|
}
|
|
|
|
} // namespace stage1
|
|
} // unnamed namespace
|
|
} // namespace haswell
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE1_UTF8_VALIDATOR_H
|
|
/* end file generic/stage1/utf8_validator.h for haswell */
|
|
/* end file generic/stage1/amalgamated.h for haswell */
|
|
/* including generic/stage2/amalgamated.h for haswell: #include <generic/stage2/amalgamated.h> */
|
|
/* begin file generic/stage2/amalgamated.h for haswell */
|
|
// Stuff other things depend on
|
|
/* including generic/stage2/base.h for haswell: #include <generic/stage2/base.h> */
|
|
/* begin file generic/stage2/base.h for haswell */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE2_BASE_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE2_BASE_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/base.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace haswell {
|
|
namespace {
|
|
namespace stage2 {
|
|
|
|
class json_iterator;
|
|
class structural_iterator;
|
|
struct tape_builder;
|
|
struct tape_writer;
|
|
|
|
} // namespace stage2
|
|
} // unnamed namespace
|
|
} // namespace haswell
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE2_BASE_H
|
|
/* end file generic/stage2/base.h for haswell */
|
|
/* including generic/stage2/tape_writer.h for haswell: #include <generic/stage2/tape_writer.h> */
|
|
/* begin file generic/stage2/tape_writer.h for haswell */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE2_TAPE_WRITER_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE2_TAPE_WRITER_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/internal/tape_type.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
#include <cstring>
|
|
|
|
namespace simdjson {
|
|
namespace haswell {
|
|
namespace {
|
|
namespace stage2 {
|
|
|
|
struct tape_writer {
|
|
/** The next place to write to tape */
|
|
uint64_t *next_tape_loc;
|
|
|
|
/** Write a signed 64-bit value to tape. */
|
|
simdjson_inline void append_s64(int64_t value) noexcept;
|
|
|
|
/** Write an unsigned 64-bit value to tape. */
|
|
simdjson_inline void append_u64(uint64_t value) noexcept;
|
|
|
|
/** Write a double value to tape. */
|
|
simdjson_inline void append_double(double value) noexcept;
|
|
|
|
/**
|
|
* Append a tape entry (an 8-bit type,and 56 bits worth of value).
|
|
*/
|
|
simdjson_inline void append(uint64_t val, internal::tape_type t) noexcept;
|
|
|
|
/**
|
|
* Skip the current tape entry without writing.
|
|
*
|
|
* Used to skip the start of the container, since we'll come back later to fill it in when the
|
|
* container ends.
|
|
*/
|
|
simdjson_inline void skip() noexcept;
|
|
|
|
/**
|
|
* Skip the number of tape entries necessary to write a large u64 or i64.
|
|
*/
|
|
simdjson_inline void skip_large_integer() noexcept;
|
|
|
|
/**
|
|
* Skip the number of tape entries necessary to write a double.
|
|
*/
|
|
simdjson_inline void skip_double() noexcept;
|
|
|
|
/**
|
|
* Write a value to a known location on tape.
|
|
*
|
|
* Used to go back and write out the start of a container after the container ends.
|
|
*/
|
|
simdjson_inline static void write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept;
|
|
|
|
private:
|
|
/**
|
|
* Append both the tape entry, and a supplementary value following it. Used for types that need
|
|
* all 64 bits, such as double and uint64_t.
|
|
*/
|
|
template<typename T>
|
|
simdjson_inline void append2(uint64_t val, T val2, internal::tape_type t) noexcept;
|
|
}; // struct tape_writer
|
|
|
|
// Two words: an INT64 entry with an empty value field, then the raw integer.
simdjson_inline void tape_writer::append_s64(int64_t value) noexcept {
  const uint64_t no_payload = 0;
  append2(no_payload, value, internal::tape_type::INT64);
}
|
|
|
|
// Two words: a UINT64 entry with an empty value field, then the raw integer
// (append2 writes the entry and copies the 64-bit payload after it).
simdjson_inline void tape_writer::append_u64(uint64_t value) noexcept {
  const uint64_t no_payload = 0;
  append2(no_payload, value, internal::tape_type::UINT64);
}
|
|
|
|
/** Write a double value to tape. */
|
|
simdjson_inline void tape_writer::append_double(double value) noexcept {
|
|
append2(0, value, internal::tape_type::DOUBLE);
|
|
}
|
|
|
|
// Leaves one tape word unwritten and moves on.
simdjson_inline void tape_writer::skip() noexcept {
  next_tape_loc += 1;
}
|
|
|
|
// A large integer occupies two tape words (entry + payload): move past both.
simdjson_inline void tape_writer::skip_large_integer() noexcept {
  next_tape_loc = next_tape_loc + 2;
}
|
|
|
|
// A double occupies two tape words (entry + payload): move past both.
simdjson_inline void tape_writer::skip_double() noexcept {
  next_tape_loc = next_tape_loc + 2;
}
|
|
|
|
// Writes one entry (type tag in the top byte, val below it) at the current
// position, then moves to the next word.
simdjson_inline void tape_writer::append(uint64_t val, internal::tape_type t) noexcept {
  const uint64_t tag = uint64_t(char(t)) << 56;
  *next_tape_loc = val | tag;
  ++next_tape_loc;
}
|
|
|
|
template<typename T>
|
|
simdjson_inline void tape_writer::append2(uint64_t val, T val2, internal::tape_type t) noexcept {
|
|
append(val, t);
|
|
static_assert(sizeof(val2) == sizeof(*next_tape_loc), "Type is not 64 bits!");
|
|
memcpy(next_tape_loc, &val2, sizeof(val2));
|
|
next_tape_loc++;
|
|
}
|
|
|
|
// Overwrites tape_loc with an entry: the type tag goes in the top byte and is
// OR-ed with val.
simdjson_inline void tape_writer::write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept {
  const uint64_t tag = uint64_t(char(t)) << 56;
  tape_loc = val | tag;
}
|
|
|
|
} // namespace stage2
|
|
} // unnamed namespace
|
|
} // namespace haswell
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE2_TAPE_WRITER_H
|
|
/* end file generic/stage2/tape_writer.h for haswell */
|
|
/* including generic/stage2/logger.h for haswell: #include <generic/stage2/logger.h> */
|
|
/* begin file generic/stage2/logger.h for haswell */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE2_LOGGER_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE2_LOGGER_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/base.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
#include <cstring>
|
|
|
|
|
|
// This is for an internal-only stage 2 specific logger.
|
|
// Define SIMDJSON_VERBOSE_LOGGING to a nonzero value (which sets LOG_ENABLED = true) to log what stage 2 is doing!
|
|
namespace simdjson {
|
|
namespace haswell {
|
|
namespace {
|
|
namespace logger {
|
|
|
|
static constexpr const char * DASHES = "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------";
|
|
|
|
// Logging is compiled in only when SIMDJSON_VERBOSE_LOGGING is defined to a nonzero value.
#if SIMDJSON_VERBOSE_LOGGING
static constexpr const bool LOG_ENABLED = true;
#else
static constexpr const bool LOG_ENABLED = false;
#endif

// Column widths (in characters) used by log_start() and log_line().
static constexpr const int LOG_EVENT_LEN = 20;
static constexpr const int LOG_BUFFER_LEN = 30;
static constexpr const int LOG_SMALL_BUFFER_LEN = 10;
static constexpr const int LOG_INDEX_LEN = 5;

// Current nesting level; controls the indentation of the event column.
static int log_depth; // Not threadsafe. Log only.
|
|
|
|
// Helper to turn unprintable or newline characters into spaces
|
|
static simdjson_inline char printable_char(char c) {
|
|
if (c >= 0x20) {
|
|
return c;
|
|
} else {
|
|
return ' ';
|
|
}
|
|
}
|
|
|
|
// Print the header and set up log_start
|
|
static simdjson_inline void log_start() {
|
|
if (LOG_ENABLED) {
|
|
log_depth = 0;
|
|
printf("\n");
|
|
printf("| %-*s | %-*s | %-*s | %-*s | Detail |\n", LOG_EVENT_LEN, "Event", LOG_BUFFER_LEN, "Buffer", LOG_SMALL_BUFFER_LEN, "Next", 5, "Next#");
|
|
printf("|%.*s|%.*s|%.*s|%.*s|--------|\n", LOG_EVENT_LEN+2, DASHES, LOG_BUFFER_LEN+2, DASHES, LOG_SMALL_BUFFER_LEN+2, DASHES, 5+2, DASHES);
|
|
}
|
|
}
|
|
|
|
// Print a free-form message on its own line. Does nothing unless LOG_ENABLED is true.
simdjson_unused static simdjson_inline void log_string(const char *message) {
  if (LOG_ENABLED) {
    printf("%s\n", message);
  }
}
|
|
|
|
// Logs a single line from the stage 2 DOM parser
|
|
template<typename S>
|
|
static simdjson_inline void log_line(S &structurals, const char *title_prefix, const char *title, const char *detail) {
|
|
if (LOG_ENABLED) {
|
|
printf("| %*s%s%-*s ", log_depth*2, "", title_prefix, LOG_EVENT_LEN - log_depth*2 - int(strlen(title_prefix)), title);
|
|
auto current_index = structurals.at_beginning() ? nullptr : structurals.next_structural-1;
|
|
auto next_index = structurals.next_structural;
|
|
auto current = current_index ? &structurals.buf[*current_index] : reinterpret_cast<const uint8_t*>(" ");
|
|
auto next = &structurals.buf[*next_index];
|
|
{
|
|
// Print the next N characters in the buffer.
|
|
printf("| ");
|
|
// Otherwise, print the characters starting from the buffer position.
|
|
// Print spaces for unprintable or newline characters.
|
|
for (int i=0;i<LOG_BUFFER_LEN;i++) {
|
|
printf("%c", printable_char(current[i]));
|
|
}
|
|
printf(" ");
|
|
// Print the next N characters in the buffer.
|
|
printf("| ");
|
|
// Otherwise, print the characters starting from the buffer position.
|
|
// Print spaces for unprintable or newline characters.
|
|
for (int i=0;i<LOG_SMALL_BUFFER_LEN;i++) {
|
|
printf("%c", printable_char(next[i]));
|
|
}
|
|
printf(" ");
|
|
}
|
|
if (current_index) {
|
|
printf("| %*u ", LOG_INDEX_LEN, *current_index);
|
|
} else {
|
|
printf("| %-*s ", LOG_INDEX_LEN, "");
|
|
}
|
|
// printf("| %*u ", LOG_INDEX_LEN, structurals.next_tape_index());
|
|
printf("| %-s ", detail);
|
|
printf("|\n");
|
|
}
|
|
}
|
|
|
|
} // namespace logger
|
|
} // unnamed namespace
|
|
} // namespace haswell
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE2_LOGGER_H
|
|
/* end file generic/stage2/logger.h for haswell */
|
|
|
|
// All other declarations
|
|
/* including generic/stage2/json_iterator.h for haswell: #include <generic/stage2/json_iterator.h> */
|
|
/* begin file generic/stage2/json_iterator.h for haswell */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE2_JSON_ITERATOR_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE2_JSON_ITERATOR_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/logger.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/generic/dom_parser_implementation.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace haswell {
|
|
namespace {
|
|
namespace stage2 {
|
|
|
|
class json_iterator {
|
|
public:
|
|
const uint8_t* const buf;
|
|
uint32_t *next_structural;
|
|
dom_parser_implementation &dom_parser;
|
|
uint32_t depth{0};
|
|
|
|
/**
|
|
* Walk the JSON document.
|
|
*
|
|
* The visitor receives callbacks when values are encountered. All callbacks pass the iterator as
|
|
* the first parameter; some callbacks have other parameters as well:
|
|
*
|
|
* - visit_document_start() - at the beginning.
|
|
* - visit_document_end() - at the end (if things were successful).
|
|
*
|
|
* - visit_array_start() - at the start `[` of a non-empty array.
|
|
* - visit_array_end() - at the end `]` of a non-empty array.
|
|
* - visit_empty_array() - when an empty array is encountered.
|
|
*
|
|
* - visit_object_end() - at the start `]` of a non-empty object.
|
|
* - visit_object_start() - at the end `]` of a non-empty object.
|
|
* - visit_empty_object() - when an empty object is encountered.
|
|
* - visit_key(const uint8_t *key) - when a key in an object field is encountered. key is
|
|
* guaranteed to point at the first quote of the string (`"key"`).
|
|
* - visit_primitive(const uint8_t *value) - when a value is a string, number, boolean or null.
|
|
* - visit_root_primitive(iter, uint8_t *value) - when the top-level value is a string, number, boolean or null.
|
|
*
|
|
* - increment_count(iter) - each time a value is found in an array or object.
|
|
*/
|
|
template<bool STREAMING, typename V>
|
|
simdjson_warn_unused simdjson_inline error_code walk_document(V &visitor) noexcept;
|
|
|
|
/**
|
|
* Create an iterator capable of walking a JSON document.
|
|
*
|
|
* The document must have already passed through stage 1.
|
|
*/
|
|
simdjson_inline json_iterator(dom_parser_implementation &_dom_parser, size_t start_structural_index);
|
|
|
|
/**
|
|
* Look at the next token.
|
|
*
|
|
* Tokens can be strings, numbers, booleans, null, or operators (`[{]},:`)).
|
|
*
|
|
* They may include invalid JSON as well (such as `1.2.3` or `ture`).
|
|
*/
|
|
simdjson_inline const uint8_t *peek() const noexcept;
|
|
/**
|
|
* Advance to the next token.
|
|
*
|
|
* Tokens can be strings, numbers, booleans, null, or operators (`[{]},:`)).
|
|
*
|
|
* They may include invalid JSON as well (such as `1.2.3` or `ture`).
|
|
*/
|
|
simdjson_inline const uint8_t *advance() noexcept;
|
|
/**
|
|
* Get the remaining length of the document, from the start of the current token.
|
|
*/
|
|
simdjson_inline size_t remaining_len() const noexcept;
|
|
/**
|
|
* Check if we are at the end of the document.
|
|
*
|
|
* If this is true, there are no more tokens.
|
|
*/
|
|
simdjson_inline bool at_eof() const noexcept;
|
|
/**
|
|
* Check if we are at the beginning of the document.
|
|
*/
|
|
simdjson_inline bool at_beginning() const noexcept;
|
|
simdjson_inline uint8_t last_structural() const noexcept;
|
|
|
|
/**
|
|
* Log that a value has been found.
|
|
*
|
|
* Set LOG_ENABLED=true in logger.h to see logging.
|
|
*/
|
|
simdjson_inline void log_value(const char *type) const noexcept;
|
|
/**
|
|
* Log the start of a multipart value.
|
|
*
|
|
* Set LOG_ENABLED=true in logger.h to see logging.
|
|
*/
|
|
simdjson_inline void log_start_value(const char *type) const noexcept;
|
|
/**
|
|
* Log the end of a multipart value.
|
|
*
|
|
* Set LOG_ENABLED=true in logger.h to see logging.
|
|
*/
|
|
simdjson_inline void log_end_value(const char *type) const noexcept;
|
|
/**
|
|
* Log an error.
|
|
*
|
|
* Set LOG_ENABLED=true in logger.h to see logging.
|
|
*/
|
|
simdjson_inline void log_error(const char *error) const noexcept;
|
|
|
|
template<typename V>
|
|
simdjson_warn_unused simdjson_inline error_code visit_root_primitive(V &visitor, const uint8_t *value) noexcept;
|
|
template<typename V>
|
|
simdjson_warn_unused simdjson_inline error_code visit_primitive(V &visitor, const uint8_t *value) noexcept;
|
|
};
|
|
|
|
template<bool STREAMING, typename V>
|
|
simdjson_warn_unused simdjson_inline error_code json_iterator::walk_document(V &visitor) noexcept {
|
|
logger::log_start();
|
|
|
|
//
|
|
// Start the document
|
|
//
|
|
if (at_eof()) { return EMPTY; }
|
|
log_start_value("document");
|
|
SIMDJSON_TRY( visitor.visit_document_start(*this) );
|
|
|
|
//
|
|
// Read first value
|
|
//
|
|
{
|
|
auto value = advance();
|
|
|
|
// Make sure the outer object or array is closed before continuing; otherwise, there are ways we
|
|
// could get into memory corruption. See https://github.com/simdjson/simdjson/issues/906
|
|
if (!STREAMING) {
|
|
switch (*value) {
|
|
case '{': if (last_structural() != '}') { log_value("starting brace unmatched"); return TAPE_ERROR; }; break;
|
|
case '[': if (last_structural() != ']') { log_value("starting bracket unmatched"); return TAPE_ERROR; }; break;
|
|
}
|
|
}
|
|
|
|
switch (*value) {
|
|
case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin;
|
|
case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin;
|
|
default: SIMDJSON_TRY( visitor.visit_root_primitive(*this, value) ); break;
|
|
}
|
|
}
|
|
goto document_end;
|
|
|
|
//
|
|
// Object parser states
|
|
//
|
|
object_begin:
|
|
log_start_value("object");
|
|
depth++;
|
|
if (depth >= dom_parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; }
|
|
dom_parser.is_array[depth] = false;
|
|
SIMDJSON_TRY( visitor.visit_object_start(*this) );
|
|
|
|
{
|
|
auto key = advance();
|
|
if (*key != '"') { log_error("Object does not start with a key"); return TAPE_ERROR; }
|
|
SIMDJSON_TRY( visitor.increment_count(*this) );
|
|
SIMDJSON_TRY( visitor.visit_key(*this, key) );
|
|
}
|
|
|
|
object_field:
|
|
if (simdjson_unlikely( *advance() != ':' )) { log_error("Missing colon after key in object"); return TAPE_ERROR; }
|
|
{
|
|
auto value = advance();
|
|
switch (*value) {
|
|
case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin;
|
|
case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin;
|
|
default: SIMDJSON_TRY( visitor.visit_primitive(*this, value) ); break;
|
|
}
|
|
}
|
|
|
|
object_continue:
|
|
switch (*advance()) {
|
|
case ',':
|
|
SIMDJSON_TRY( visitor.increment_count(*this) );
|
|
{
|
|
auto key = advance();
|
|
if (simdjson_unlikely( *key != '"' )) { log_error("Key string missing at beginning of field in object"); return TAPE_ERROR; }
|
|
SIMDJSON_TRY( visitor.visit_key(*this, key) );
|
|
}
|
|
goto object_field;
|
|
case '}': log_end_value("object"); SIMDJSON_TRY( visitor.visit_object_end(*this) ); goto scope_end;
|
|
default: log_error("No comma between object fields"); return TAPE_ERROR;
|
|
}
|
|
|
|
scope_end:
|
|
depth--;
|
|
if (depth == 0) { goto document_end; }
|
|
if (dom_parser.is_array[depth]) { goto array_continue; }
|
|
goto object_continue;
|
|
|
|
//
|
|
// Array parser states
|
|
//
|
|
array_begin:
|
|
log_start_value("array");
|
|
depth++;
|
|
if (depth >= dom_parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; }
|
|
dom_parser.is_array[depth] = true;
|
|
SIMDJSON_TRY( visitor.visit_array_start(*this) );
|
|
SIMDJSON_TRY( visitor.increment_count(*this) );
|
|
|
|
array_value:
|
|
{
|
|
auto value = advance();
|
|
switch (*value) {
|
|
case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin;
|
|
case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin;
|
|
default: SIMDJSON_TRY( visitor.visit_primitive(*this, value) ); break;
|
|
}
|
|
}
|
|
|
|
array_continue:
|
|
switch (*advance()) {
|
|
case ',': SIMDJSON_TRY( visitor.increment_count(*this) ); goto array_value;
|
|
case ']': log_end_value("array"); SIMDJSON_TRY( visitor.visit_array_end(*this) ); goto scope_end;
|
|
default: log_error("Missing comma between array values"); return TAPE_ERROR;
|
|
}
|
|
|
|
document_end:
|
|
log_end_value("document");
|
|
SIMDJSON_TRY( visitor.visit_document_end(*this) );
|
|
|
|
dom_parser.next_structural_index = uint32_t(next_structural - &dom_parser.structural_indexes[0]);
|
|
|
|
// If we didn't make it to the end, it's an error
|
|
if ( !STREAMING && dom_parser.next_structural_index != dom_parser.n_structural_indexes ) {
|
|
log_error("More than one JSON value at the root of the document, or extra characters at the end of the JSON!");
|
|
return TAPE_ERROR;
|
|
}
|
|
|
|
return SUCCESS;
|
|
|
|
} // walk_document()
|
|
|
|
simdjson_inline json_iterator::json_iterator(dom_parser_implementation &_dom_parser, size_t start_structural_index)
|
|
: buf{_dom_parser.buf},
|
|
next_structural{&_dom_parser.structural_indexes[start_structural_index]},
|
|
dom_parser{_dom_parser} {
|
|
}
|
|
|
|
/** Return a pointer to the next token in buf without consuming it. */
simdjson_inline const uint8_t *json_iterator::peek() const noexcept {
  return &buf[*(next_structural)];
}
|
|
/** Return a pointer to the next token in buf and move on to the following structural index. */
simdjson_inline const uint8_t *json_iterator::advance() noexcept {
  return &buf[*(next_structural++)];
}
|
|
/**
 * Number of bytes from the most recently consumed token to the end of the document
 * (dom_parser.len minus that token's offset). Reads next_structural[-1], so at least one
 * token must already have been consumed.
 */
simdjson_inline size_t json_iterator::remaining_len() const noexcept {
  return dom_parser.len - *(next_structural-1);
}
|
|
|
|
/** True when next_structural is one past the last structural index, i.e. no tokens remain. */
simdjson_inline bool json_iterator::at_eof() const noexcept {
  return next_structural == &dom_parser.structural_indexes[dom_parser.n_structural_indexes];
}
|
|
/** True when next_structural still points at the first entry of structural_indexes. */
simdjson_inline bool json_iterator::at_beginning() const noexcept {
  return next_structural == dom_parser.structural_indexes.get();
}
|
|
/**
 * Return the character in buf at the last recorded structural index.
 * Indexes n_structural_indexes - 1, so there must be at least one structural index.
 */
simdjson_inline uint8_t json_iterator::last_structural() const noexcept {
  return buf[dom_parser.structural_indexes[dom_parser.n_structural_indexes - 1]];
}
|
|
|
|
/** Log a single-line event named `type` at the current depth. */
simdjson_inline void json_iterator::log_value(const char *type) const noexcept {
  logger::log_line(*this, "", type, "");
}
|
|
|
|
/** Log the opening of `type` with a "+" prefix, then indent subsequent log lines one level. */
simdjson_inline void json_iterator::log_start_value(const char *type) const noexcept {
  logger::log_line(*this, "+", type, "");
  if (logger::LOG_ENABLED) { logger::log_depth++; }
}
|
|
|
|
/**
 * Outdent one level, then log the closing of `type` with a "-" prefix, so the closing line
 * lines up with the matching "+" line.
 */
simdjson_inline void json_iterator::log_end_value(const char *type) const noexcept {
  if (logger::LOG_ENABLED) { logger::log_depth--; }
  logger::log_line(*this, "-", type, "");
}
|
|
|
|
/** Log an "ERROR" event with `error` in the detail column. */
simdjson_inline void json_iterator::log_error(const char *error) const noexcept {
  logger::log_line(*this, "", "ERROR", error);
}
|
|
|
|
template<typename V>
|
|
simdjson_warn_unused simdjson_inline error_code json_iterator::visit_root_primitive(V &visitor, const uint8_t *value) noexcept {
|
|
switch (*value) {
|
|
case '"': return visitor.visit_root_string(*this, value);
|
|
case 't': return visitor.visit_root_true_atom(*this, value);
|
|
case 'f': return visitor.visit_root_false_atom(*this, value);
|
|
case 'n': return visitor.visit_root_null_atom(*this, value);
|
|
case '-':
|
|
case '0': case '1': case '2': case '3': case '4':
|
|
case '5': case '6': case '7': case '8': case '9':
|
|
return visitor.visit_root_number(*this, value);
|
|
default:
|
|
log_error("Document starts with a non-value character");
|
|
return TAPE_ERROR;
|
|
}
|
|
}
|
|
template<typename V>
|
|
simdjson_warn_unused simdjson_inline error_code json_iterator::visit_primitive(V &visitor, const uint8_t *value) noexcept {
|
|
// Use the fact that most scalars are going to be either strings or numbers.
|
|
if(*value == '"') {
|
|
return visitor.visit_string(*this, value);
|
|
} else if (((*value - '0') < 10) || (*value == '-')) {
|
|
return visitor.visit_number(*this, value);
|
|
}
|
|
// true, false, null are uncommon.
|
|
switch (*value) {
|
|
case 't': return visitor.visit_true_atom(*this, value);
|
|
case 'f': return visitor.visit_false_atom(*this, value);
|
|
case 'n': return visitor.visit_null_atom(*this, value);
|
|
default:
|
|
log_error("Non-value found when value was expected!");
|
|
return TAPE_ERROR;
|
|
}
|
|
}
|
|
|
|
} // namespace stage2
|
|
} // unnamed namespace
|
|
} // namespace haswell
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE2_JSON_ITERATOR_H
|
|
/* end file generic/stage2/json_iterator.h for haswell */
|
|
/* including generic/stage2/stringparsing.h for haswell: #include <generic/stage2/stringparsing.h> */
|
|
/* begin file generic/stage2/stringparsing.h for haswell */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE2_STRINGPARSING_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE2_STRINGPARSING_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/generic/jsoncharutils.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
// This file contains the common code every implementation uses
|
|
// It is intended to be included multiple times and compiled multiple times
|
|
|
|
namespace simdjson {
|
|
namespace haswell {
|
|
namespace {
|
|
/// @private
|
|
namespace stringparsing {
|
|
|
|
// begin copypasta
// Lookup table indexed by the character following a backslash; the entry is the byte the
// escape sequence produces, or 0 when the escape is not valid.
// These chars yield themselves: " \ /
// b -> backspace, f -> formfeed, n -> newline, r -> cr, t -> horizontal tab
// u not handled in this table as it's complex
static const uint8_t escape_map[256] = {
    0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0,    // 0x0.
    0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0,    // 0x1.
    0, 0, 0x22, 0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0x2f, // 0x2.  "  /
    0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0,    // 0x3.

    0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0,    // 0x4.
    0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0x5c, 0, 0,    0,    // 0x5.  backslash
    0, 0, 0x08, 0, 0,    0, 0x0c, 0, 0, 0, 0, 0, 0,    0, 0x0a, 0,    // 0x6.  b  f  n
    0, 0, 0x0d, 0, 0x09, 0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0,    // 0x7.  r  t

    0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0,    // 0x8.
    0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0,    // 0x9.
    0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0,    // 0xa.
    0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0,    // 0xb.

    0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0,    // 0xc.
    0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0,    // 0xd.
    0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0,    // 0xe.
    0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0,    // 0xf.
};
|
|
|
|
// handle a unicode codepoint
|
|
// write appropriate values into dest
|
|
// src will advance 6 bytes or 12 bytes
|
|
// dest will advance a variable amount (return via pointer)
|
|
// return true if the unicode codepoint was valid
|
|
// We work in little-endian then swap at write time
|
|
simdjson_warn_unused
|
|
simdjson_inline bool handle_unicode_codepoint(const uint8_t **src_ptr,
|
|
uint8_t **dst_ptr, bool allow_replacement) {
|
|
// Use the default Unicode Character 'REPLACEMENT CHARACTER' (U+FFFD)
|
|
constexpr uint32_t substitution_code_point = 0xfffd;
|
|
// jsoncharutils::hex_to_u32_nocheck fills high 16 bits of the return value with 1s if the
|
|
// conversion is not valid; we defer the check for this to inside the
|
|
// multilingual plane check.
|
|
uint32_t code_point = jsoncharutils::hex_to_u32_nocheck(*src_ptr + 2);
|
|
*src_ptr += 6;
|
|
|
|
// If we found a high surrogate, we must
|
|
// check for low surrogate for characters
|
|
// outside the Basic
|
|
// Multilingual Plane.
|
|
if (code_point >= 0xd800 && code_point < 0xdc00) {
|
|
const uint8_t *src_data = *src_ptr;
|
|
/* Compiler optimizations convert this to a single 16-bit load and compare on most platforms */
|
|
if (((src_data[0] << 8) | src_data[1]) != ((static_cast<uint8_t> ('\\') << 8) | static_cast<uint8_t> ('u'))) {
|
|
if(!allow_replacement) { return false; }
|
|
code_point = substitution_code_point;
|
|
} else {
|
|
uint32_t code_point_2 = jsoncharutils::hex_to_u32_nocheck(src_data + 2);
|
|
|
|
// We have already checked that the high surrogate is valid and
|
|
// (code_point - 0xd800) < 1024.
|
|
//
|
|
// Check that code_point_2 is in the range 0xdc00..0xdfff
|
|
// and that code_point_2 was parsed from valid hex.
|
|
uint32_t low_bit = code_point_2 - 0xdc00;
|
|
if (low_bit >> 10) {
|
|
if(!allow_replacement) { return false; }
|
|
code_point = substitution_code_point;
|
|
} else {
|
|
code_point = (((code_point - 0xd800) << 10) | low_bit) + 0x10000;
|
|
*src_ptr += 6;
|
|
}
|
|
|
|
}
|
|
} else if (code_point >= 0xdc00 && code_point <= 0xdfff) {
|
|
// If we encounter a low surrogate (not preceded by a high surrogate)
|
|
// then we have an error.
|
|
if(!allow_replacement) { return false; }
|
|
code_point = substitution_code_point;
|
|
}
|
|
size_t offset = jsoncharutils::codepoint_to_utf8(code_point, *dst_ptr);
|
|
*dst_ptr += offset;
|
|
return offset > 0;
|
|
}
|
|
|
|
|
|
// handle a unicode codepoint using the wobbly convention
|
|
// https://simonsapin.github.io/wtf-8/
|
|
// write appropriate values into dest
|
|
// src will advance 6 bytes or 12 bytes
|
|
// dest will advance a variable amount (return via pointer)
|
|
// return true if the unicode codepoint was valid
|
|
// We work in little-endian then swap at write time
|
|
simdjson_warn_unused
|
|
simdjson_inline bool handle_unicode_codepoint_wobbly(const uint8_t **src_ptr,
|
|
uint8_t **dst_ptr) {
|
|
// It is not ideal that this function is nearly identical to handle_unicode_codepoint.
|
|
//
|
|
// jsoncharutils::hex_to_u32_nocheck fills high 16 bits of the return value with 1s if the
|
|
// conversion is not valid; we defer the check for this to inside the
|
|
// multilingual plane check.
|
|
uint32_t code_point = jsoncharutils::hex_to_u32_nocheck(*src_ptr + 2);
|
|
*src_ptr += 6;
|
|
// If we found a high surrogate, we must
|
|
// check for low surrogate for characters
|
|
// outside the Basic
|
|
// Multilingual Plane.
|
|
if (code_point >= 0xd800 && code_point < 0xdc00) {
|
|
const uint8_t *src_data = *src_ptr;
|
|
/* Compiler optimizations convert this to a single 16-bit load and compare on most platforms */
|
|
if (((src_data[0] << 8) | src_data[1]) == ((static_cast<uint8_t> ('\\') << 8) | static_cast<uint8_t> ('u'))) {
|
|
uint32_t code_point_2 = jsoncharutils::hex_to_u32_nocheck(src_data + 2);
|
|
uint32_t low_bit = code_point_2 - 0xdc00;
|
|
if ((low_bit >> 10) == 0) {
|
|
code_point =
|
|
(((code_point - 0xd800) << 10) | low_bit) + 0x10000;
|
|
*src_ptr += 6;
|
|
}
|
|
}
|
|
}
|
|
|
|
size_t offset = jsoncharutils::codepoint_to_utf8(code_point, *dst_ptr);
|
|
*dst_ptr += offset;
|
|
return offset > 0;
|
|
}
|
|
|
|
|
|
/**
|
|
* Unescape a valid UTF-8 string from src to dst, stopping at a final unescaped quote. There
|
|
* must be an unescaped quote terminating the string. It returns the final output
|
|
* position as pointer. In case of error (e.g., the string has bad escaped codes),
|
|
* then null_nullptrptr is returned. It is assumed that the output buffer is large
|
|
* enough. E.g., if src points at 'joe"', then dst needs to have four free bytes +
|
|
* SIMDJSON_PADDING bytes.
|
|
*/
|
|
simdjson_warn_unused simdjson_inline uint8_t *parse_string(const uint8_t *src, uint8_t *dst, bool allow_replacement) {
|
|
while (1) {
|
|
// Copy the next n bytes, and find the backslash and quote in them.
|
|
auto bs_quote = backslash_and_quote::copy_and_find(src, dst);
|
|
// If the next thing is the end quote, copy and return
|
|
if (bs_quote.has_quote_first()) {
|
|
// we encountered quotes first. Move dst to point to quotes and exit
|
|
return dst + bs_quote.quote_index();
|
|
}
|
|
if (bs_quote.has_backslash()) {
|
|
/* find out where the backspace is */
|
|
auto bs_dist = bs_quote.backslash_index();
|
|
uint8_t escape_char = src[bs_dist + 1];
|
|
/* we encountered backslash first. Handle backslash */
|
|
if (escape_char == 'u') {
|
|
/* move src/dst up to the start; they will be further adjusted
|
|
within the unicode codepoint handling code. */
|
|
src += bs_dist;
|
|
dst += bs_dist;
|
|
if (!handle_unicode_codepoint(&src, &dst, allow_replacement)) {
|
|
return nullptr;
|
|
}
|
|
} else {
|
|
/* simple 1:1 conversion. Will eat bs_dist+2 characters in input and
|
|
* write bs_dist+1 characters to output
|
|
* note this may reach beyond the part of the buffer we've actually
|
|
* seen. I think this is ok */
|
|
uint8_t escape_result = escape_map[escape_char];
|
|
if (escape_result == 0u) {
|
|
return nullptr; /* bogus escape value is an error */
|
|
}
|
|
dst[bs_dist] = escape_result;
|
|
src += bs_dist + 2;
|
|
dst += bs_dist + 1;
|
|
}
|
|
} else {
|
|
/* they are the same. Since they can't co-occur, it means we
|
|
* encountered neither. */
|
|
src += backslash_and_quote::BYTES_PROCESSED;
|
|
dst += backslash_and_quote::BYTES_PROCESSED;
|
|
}
|
|
}
|
|
}
|
|
|
|
/**
 * Same as parse_string(), except that \u escapes are decoded with
 * handle_unicode_codepoint_wobbly(). Returns the final output position, or nullptr on a bad
 * escape. The string at src must be terminated by an unescaped quote.
 */
simdjson_warn_unused simdjson_inline uint8_t *parse_wobbly_string(const uint8_t *src, uint8_t *dst) {
  // It is not ideal that this function is nearly identical to parse_string.
  while (1) {
    // Copy the next n bytes, and find the backslash and quote in them.
    auto bs_quote = backslash_and_quote::copy_and_find(src, dst);
    // If the next thing is the end quote, copy and return
    if (bs_quote.has_quote_first()) {
      // we encountered quotes first. Move dst to point to quotes and exit
      return dst + bs_quote.quote_index();
    }
    if (bs_quote.has_backslash()) {
      /* find out where the backslash is */
      auto bs_dist = bs_quote.backslash_index();
      uint8_t escape_char = src[bs_dist + 1];
      /* we encountered backslash first. Handle backslash */
      if (escape_char == 'u') {
        /* move src/dst up to the start; they will be further adjusted
           within the unicode codepoint handling code. */
        src += bs_dist;
        dst += bs_dist;
        if (!handle_unicode_codepoint_wobbly(&src, &dst)) {
          return nullptr;
        }
      } else {
        /* simple 1:1 conversion. Will eat bs_dist+2 characters in input and
         * write bs_dist+1 characters to output
         * note this may reach beyond the part of the buffer we've actually
         * seen. I think this is ok */
        uint8_t escape_result = escape_map[escape_char];
        if (escape_result == 0u) {
          return nullptr; /* bogus escape value is an error */
        }
        dst[bs_dist] = escape_result;
        src += bs_dist + 2;
        dst += bs_dist + 1;
      }
    } else {
      /* they are the same. Since they can't co-occur, it means we
       * encountered neither. */
      src += backslash_and_quote::BYTES_PROCESSED;
      dst += backslash_and_quote::BYTES_PROCESSED;
    }
  }
}
|
|
|
|
} // namespace stringparsing
|
|
} // unnamed namespace
|
|
} // namespace haswell
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE2_STRINGPARSING_H
|
|
/* end file generic/stage2/stringparsing.h for haswell */
|
|
/* including generic/stage2/structural_iterator.h for haswell: #include <generic/stage2/structural_iterator.h> */
|
|
/* begin file generic/stage2/structural_iterator.h for haswell */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE2_STRUCTURAL_ITERATOR_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE2_STRUCTURAL_ITERATOR_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/generic/dom_parser_implementation.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace haswell {
|
|
namespace {
|
|
namespace stage2 {
|
|
|
|
class structural_iterator {
|
|
public:
|
|
const uint8_t* const buf;
|
|
uint32_t *next_structural;
|
|
dom_parser_implementation &dom_parser;
|
|
|
|
// Start a structural
|
|
simdjson_inline structural_iterator(dom_parser_implementation &_dom_parser, size_t start_structural_index)
|
|
: buf{_dom_parser.buf},
|
|
next_structural{&_dom_parser.structural_indexes[start_structural_index]},
|
|
dom_parser{_dom_parser} {
|
|
}
|
|
// Get the buffer position of the current structural character
|
|
simdjson_inline const uint8_t* current() {
|
|
return &buf[*(next_structural-1)];
|
|
}
|
|
// Get the current structural character
|
|
simdjson_inline char current_char() {
|
|
return buf[*(next_structural-1)];
|
|
}
|
|
// Get the next structural character without advancing
|
|
simdjson_inline char peek_next_char() {
|
|
return buf[*next_structural];
|
|
}
|
|
simdjson_inline const uint8_t* peek() {
|
|
return &buf[*next_structural];
|
|
}
|
|
simdjson_inline const uint8_t* advance() {
|
|
return &buf[*(next_structural++)];
|
|
}
|
|
simdjson_inline char advance_char() {
|
|
return buf[*(next_structural++)];
|
|
}
|
|
simdjson_inline size_t remaining_len() {
|
|
return dom_parser.len - *(next_structural-1);
|
|
}
|
|
|
|
simdjson_inline bool at_end() {
|
|
return next_structural == &dom_parser.structural_indexes[dom_parser.n_structural_indexes];
|
|
}
|
|
simdjson_inline bool at_beginning() {
|
|
return next_structural == dom_parser.structural_indexes.get();
|
|
}
|
|
};
|
|
|
|
} // namespace stage2
|
|
} // unnamed namespace
|
|
} // namespace haswell
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE2_STRUCTURAL_ITERATOR_H
|
|
/* end file generic/stage2/structural_iterator.h for haswell */
|
|
/* including generic/stage2/tape_builder.h for haswell: #include <generic/stage2/tape_builder.h> */
|
|
/* begin file generic/stage2/tape_builder.h for haswell */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE2_TAPE_BUILDER_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE2_TAPE_BUILDER_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/json_iterator.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/stringparsing.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/tape_writer.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/dom/document.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/generic/atomparsing.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/generic/dom_parser_implementation.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/generic/numberparsing.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
|
|
namespace simdjson {
|
|
namespace haswell {
|
|
namespace {
|
|
namespace stage2 {
|
|
|
|
struct tape_builder {
|
|
template<bool STREAMING>
|
|
simdjson_warn_unused static simdjson_inline error_code parse_document(
|
|
dom_parser_implementation &dom_parser,
|
|
dom::document &doc) noexcept;
|
|
|
|
/** Called when a non-empty document starts. */
|
|
simdjson_warn_unused simdjson_inline error_code visit_document_start(json_iterator &iter) noexcept;
|
|
/** Called when a non-empty document ends without error. */
|
|
simdjson_warn_unused simdjson_inline error_code visit_document_end(json_iterator &iter) noexcept;
|
|
|
|
/** Called when a non-empty array starts. */
|
|
simdjson_warn_unused simdjson_inline error_code visit_array_start(json_iterator &iter) noexcept;
|
|
/** Called when a non-empty array ends. */
|
|
simdjson_warn_unused simdjson_inline error_code visit_array_end(json_iterator &iter) noexcept;
|
|
/** Called when an empty array is found. */
|
|
simdjson_warn_unused simdjson_inline error_code visit_empty_array(json_iterator &iter) noexcept;
|
|
|
|
/** Called when a non-empty object starts. */
|
|
simdjson_warn_unused simdjson_inline error_code visit_object_start(json_iterator &iter) noexcept;
|
|
/**
|
|
* Called when a key in a field is encountered.
|
|
*
|
|
* primitive, visit_object_start, visit_empty_object, visit_array_start, or visit_empty_array
|
|
* will be called after this with the field value.
|
|
*/
|
|
simdjson_warn_unused simdjson_inline error_code visit_key(json_iterator &iter, const uint8_t *key) noexcept;
|
|
/** Called when a non-empty object ends. */
|
|
simdjson_warn_unused simdjson_inline error_code visit_object_end(json_iterator &iter) noexcept;
|
|
/** Called when an empty object is found. */
|
|
simdjson_warn_unused simdjson_inline error_code visit_empty_object(json_iterator &iter) noexcept;
|
|
|
|
/**
|
|
* Called when a string, number, boolean or null is found.
|
|
*/
|
|
simdjson_warn_unused simdjson_inline error_code visit_primitive(json_iterator &iter, const uint8_t *value) noexcept;
|
|
/**
|
|
* Called when a string, number, boolean or null is found at the top level of a document (i.e.
|
|
* when there is no array or object and the entire document is a single string, number, boolean or
|
|
* null.
|
|
*
|
|
* This is separate from primitive() because simdjson's normal primitive parsing routines assume
|
|
* there is at least one more token after the value, which is only true in an array or object.
|
|
*/
|
|
simdjson_warn_unused simdjson_inline error_code visit_root_primitive(json_iterator &iter, const uint8_t *value) noexcept;
|
|
|
|
simdjson_warn_unused simdjson_inline error_code visit_string(json_iterator &iter, const uint8_t *value, bool key = false) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code visit_number(json_iterator &iter, const uint8_t *value) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code visit_true_atom(json_iterator &iter, const uint8_t *value) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code visit_false_atom(json_iterator &iter, const uint8_t *value) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code visit_null_atom(json_iterator &iter, const uint8_t *value) noexcept;
|
|
|
|
simdjson_warn_unused simdjson_inline error_code visit_root_string(json_iterator &iter, const uint8_t *value) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code visit_root_number(json_iterator &iter, const uint8_t *value) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code visit_root_true_atom(json_iterator &iter, const uint8_t *value) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code visit_root_false_atom(json_iterator &iter, const uint8_t *value) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code visit_root_null_atom(json_iterator &iter, const uint8_t *value) noexcept;
|
|
|
|
/** Called each time a new field or element in an array or object is found. */
|
|
simdjson_warn_unused simdjson_inline error_code increment_count(json_iterator &iter) noexcept;
|
|
|
|
/** Next location to write to tape */
|
|
tape_writer tape;
|
|
private:
|
|
/** Next write location in the string buf for stage 2 parsing */
|
|
uint8_t *current_string_buf_loc;
|
|
|
|
simdjson_inline tape_builder(dom::document &doc) noexcept;
|
|
|
|
simdjson_inline uint32_t next_tape_index(json_iterator &iter) const noexcept;
|
|
simdjson_inline void start_container(json_iterator &iter) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code end_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code empty_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept;
|
|
simdjson_inline uint8_t *on_start_string(json_iterator &iter) noexcept;
|
|
simdjson_inline void on_end_string(uint8_t *dst) noexcept;
|
|
}; // struct tape_builder
|
|
|
|
template<bool STREAMING>
|
|
simdjson_warn_unused simdjson_inline error_code tape_builder::parse_document(
|
|
dom_parser_implementation &dom_parser,
|
|
dom::document &doc) noexcept {
|
|
dom_parser.doc = &doc;
|
|
json_iterator iter(dom_parser, STREAMING ? dom_parser.next_structural_index : 0);
|
|
tape_builder builder(doc);
|
|
return iter.walk_document<STREAMING>(builder);
|
|
}
|
|
|
|
/** Hands a document-level primitive back to the iterator, which dispatches on it with this builder as the visitor. */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_primitive(json_iterator &iter, const uint8_t *value) noexcept {
  tape_builder &self = *this;
  return iter.visit_root_primitive(self, value);
}
|
|
/** Hands a primitive back to the iterator, which dispatches on it with this builder as the visitor. */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_primitive(json_iterator &iter, const uint8_t *value) noexcept {
  tape_builder &self = *this;
  return iter.visit_primitive(self, value);
}
|
|
/** Writes the START_OBJECT / END_OBJECT tape pair for an empty object. */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_empty_object(json_iterator &iter) noexcept {
  constexpr internal::tape_type open_tag = internal::tape_type::START_OBJECT;
  constexpr internal::tape_type close_tag = internal::tape_type::END_OBJECT;
  return empty_container(iter, open_tag, close_tag);
}
|
|
/** Writes the START_ARRAY / END_ARRAY tape pair for an empty array. */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_empty_array(json_iterator &iter) noexcept {
  constexpr internal::tape_type open_tag = internal::tape_type::START_ARRAY;
  constexpr internal::tape_type close_tag = internal::tape_type::END_ARRAY;
  return empty_container(iter, open_tag, close_tag);
}
|
|
|
|
/** Opens the document-level container via start_container(); always reports SUCCESS. */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_document_start(json_iterator &iter) noexcept {
  // The document is tracked exactly like an array or object.
  start_container(iter);
  return SUCCESS;
}
|
|
/** Opens an object container via start_container(); always reports SUCCESS. */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_object_start(json_iterator &iter) noexcept {
  // The start element itself is filled in when the object is closed.
  start_container(iter);
  return SUCCESS;
}
|
|
/** Opens an array container via start_container(); always reports SUCCESS. */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_array_start(json_iterator &iter) noexcept {
  // The start element itself is filled in when the array is closed.
  start_container(iter);
  return SUCCESS;
}
|
|
|
|
/** Closes the object open at the iterator's current depth, tagging it START_OBJECT / END_OBJECT. */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_object_end(json_iterator &iter) noexcept {
  constexpr internal::tape_type open_tag = internal::tape_type::START_OBJECT;
  constexpr internal::tape_type close_tag = internal::tape_type::END_OBJECT;
  return end_container(iter, open_tag, close_tag);
}
|
|
/** Closes the array open at the iterator's current depth, tagging it START_ARRAY / END_ARRAY. */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_array_end(json_iterator &iter) noexcept {
  constexpr internal::tape_type open_tag = internal::tape_type::START_ARRAY;
  constexpr internal::tape_type close_tag = internal::tape_type::END_ARRAY;
  return end_container(iter, open_tag, close_tag);
}
|
|
/**
 * Finishes the tape: appends a closing ROOT element whose payload is tape
 * index 0, then rewrites tape slot 0 as a ROOT element whose payload is the
 * index just past that closing element.
 */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_document_end(json_iterator &iter) noexcept {
  constexpr uint32_t root_slot = 0;
  constexpr internal::tape_type root_tag = internal::tape_type::ROOT;
  tape.append(root_slot, root_tag);
  // Must be read after the append so it accounts for the closing element.
  const uint32_t end_index = next_tape_index(iter);
  tape_writer::write(iter.dom_parser.doc->tape[root_slot], end_index, root_tag);
  return SUCCESS;
}
|
|
/** A key is stored like any other string; the flag passed to visit_string() only changes the log label. */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_key(json_iterator &iter, const uint8_t *key) noexcept {
  constexpr bool is_key = true;
  return visit_string(iter, key, is_key);
}
|
|
|
|
/** Adds one to the element/field counter of the container open at iter.depth; always reports SUCCESS. */
simdjson_warn_unused simdjson_inline error_code tape_builder::increment_count(json_iterator &iter) noexcept {
  auto &container = iter.dom_parser.open_containers[iter.depth];
  container.count++;
  return SUCCESS;
}
|
|
|
|
/** Starts writing at the beginning of the document's tape and of its string buffer. */
simdjson_inline tape_builder::tape_builder(dom::document &doc) noexcept
  : tape{doc.tape.get()},
    current_string_buf_loc{doc.string_buf.get()} {
}
|
|
|
|
/**
 * Unescapes the string starting at value (which points at the opening quote)
 * into the string buffer and records it on the tape.
 *
 * @param key only selects the label used for logging ("key" vs "string").
 * @return STRING_ERROR when parse_string() rejects the string, SUCCESS otherwise.
 */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_string(json_iterator &iter, const uint8_t *value, bool key) noexcept {
  iter.log_value(key ? "key" : "string");
  uint8_t *const write_begin = on_start_string(iter);
  // Skip the opening quote. Invalid escapes are rejected, not replaced.
  uint8_t *const write_end = stringparsing::parse_string(value + 1, write_begin, false);
  if (write_end != nullptr) {
    on_end_string(write_end);
    return SUCCESS;
  }
  iter.log_error("Invalid escape in string");
  return STRING_ERROR;
}
|
|
|
|
/** A document-level string needs no special handling: defer to visit_string() with its default key flag. */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_string(json_iterator &iter, const uint8_t *value) noexcept {
  const error_code outcome = visit_string(iter, value);
  return outcome;
}
|
|
|
|
/** Logs the value, then lets numberparsing::parse_number() parse it and write to the tape. */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_number(json_iterator &iter, const uint8_t *value) noexcept {
  iter.log_value("number");
  const error_code outcome = numberparsing::parse_number(value, tape);
  return outcome;
}
|
|
|
|
/**
 * Parses a number that is the entire document.
 *
 * @return MEMALLOC if the temporary copy cannot be allocated, otherwise the
 *         result of visit_number() on the copy.
 */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_number(json_iterator &iter, const uint8_t *value) noexcept {
  //
  // The number is parsed from a copy that is guaranteed to be terminated by
  // spaces. This is unrelated to input padding: the input should already be
  // padded up to len + SIMDJSON_PADDING, but at this stage we have no control
  // over what it was padded with. An input string often carries an extra null
  // character (C string). We do not want to accept 9\0 (where \0 is the null
  // character) inside a JSON document, yet the string "9\0" on its own is
  // fine. So, when there is just one input element, we copy it and pad the
  // copy with spaces. The copy is relatively expensive, but it is almost
  // never made in practice, unless you have many JSON documents made of
  // single atoms.
  //
  std::unique_ptr<uint8_t[]> padded(new (std::nothrow) uint8_t[iter.remaining_len() + SIMDJSON_PADDING]);
  if (!padded) { return MEMALLOC; }
  uint8_t *const scratch = padded.get();
  std::memcpy(scratch, value, iter.remaining_len());
  std::memset(scratch + iter.remaining_len(), ' ', SIMDJSON_PADDING);
  return visit_number(iter, scratch);
}
|
|
|
|
/** Checks the atom at value and appends a TRUE_VALUE tape element; T_ATOM_ERROR if the check fails. */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_true_atom(json_iterator &iter, const uint8_t *value) noexcept {
  iter.log_value("true");
  const bool well_formed = atomparsing::is_valid_true_atom(value);
  if (well_formed) {
    tape.append(0, internal::tape_type::TRUE_VALUE);
    return SUCCESS;
  }
  return T_ATOM_ERROR;
}
|
|
|
|
/** Document-level variant of visit_true_atom(): the validity check is also given iter.remaining_len(). */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_true_atom(json_iterator &iter, const uint8_t *value) noexcept {
  iter.log_value("true");
  const bool well_formed = atomparsing::is_valid_true_atom(value, iter.remaining_len());
  if (well_formed) {
    tape.append(0, internal::tape_type::TRUE_VALUE);
    return SUCCESS;
  }
  return T_ATOM_ERROR;
}
|
|
|
|
/** Checks the atom at value and appends a FALSE_VALUE tape element; F_ATOM_ERROR if the check fails. */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_false_atom(json_iterator &iter, const uint8_t *value) noexcept {
  iter.log_value("false");
  const bool well_formed = atomparsing::is_valid_false_atom(value);
  if (well_formed) {
    tape.append(0, internal::tape_type::FALSE_VALUE);
    return SUCCESS;
  }
  return F_ATOM_ERROR;
}
|
|
|
|
/** Document-level variant of visit_false_atom(): the validity check is also given iter.remaining_len(). */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_false_atom(json_iterator &iter, const uint8_t *value) noexcept {
  iter.log_value("false");
  const bool well_formed = atomparsing::is_valid_false_atom(value, iter.remaining_len());
  if (well_formed) {
    tape.append(0, internal::tape_type::FALSE_VALUE);
    return SUCCESS;
  }
  return F_ATOM_ERROR;
}
|
|
|
|
/** Checks the atom at value and appends a NULL_VALUE tape element; N_ATOM_ERROR if the check fails. */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_null_atom(json_iterator &iter, const uint8_t *value) noexcept {
  iter.log_value("null");
  const bool well_formed = atomparsing::is_valid_null_atom(value);
  if (well_formed) {
    tape.append(0, internal::tape_type::NULL_VALUE);
    return SUCCESS;
  }
  return N_ATOM_ERROR;
}
|
|
|
|
/** Document-level variant of visit_null_atom(): the validity check is also given iter.remaining_len(). */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_null_atom(json_iterator &iter, const uint8_t *value) noexcept {
  iter.log_value("null");
  const bool well_formed = atomparsing::is_valid_null_atom(value, iter.remaining_len());
  if (well_formed) {
    tape.append(0, internal::tape_type::NULL_VALUE);
    return SUCCESS;
  }
  return N_ATOM_ERROR;
}
|
|
|
|
// private:
|
|
|
|
/** Index of the next tape slot to be written: its distance from the start of the document's tape. */
simdjson_inline uint32_t tape_builder::next_tape_index(json_iterator &iter) const noexcept {
  const auto tape_begin = iter.dom_parser.doc->tape.get();
  return uint32_t(tape.next_tape_loc - tape_begin);
}
|
|
|
|
/**
 * Emits the two tape elements of an empty container: a start element whose
 * payload is the index just past the pair, and an end element whose payload
 * is the index of the start element.
 */
simdjson_warn_unused simdjson_inline error_code tape_builder::empty_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept {
  const uint32_t open_index = next_tape_index(iter);
  const uint32_t past_close_index = open_index + 2;
  tape.append(past_close_index, start);
  tape.append(open_index, end);
  return SUCCESS;
}
|
|
|
|
/**
 * Records where the container at iter.depth begins on the tape, resets its
 * element count, and leaves one tape slot unwritten for the start element.
 */
simdjson_inline void tape_builder::start_container(json_iterator &iter) noexcept {
  auto &container = iter.dom_parser.open_containers[iter.depth];
  container.tape_index = next_tape_index(iter);
  container.count = 0;
  // The start element is only written once the container is closed.
  tape.skip();
}
|
|
|
|
/**
 * Closes the container open at iter.depth: appends its end element and fills
 * in the start element slot that start_container() skipped.
 */
simdjson_warn_unused simdjson_inline error_code tape_builder::end_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept {
  const auto &container = iter.dom_parser.open_containers[iter.depth];
  // The end element points back at the start location.
  const uint32_t open_index = container.tape_index;
  tape.append(open_index, end);
  // The start element points at the end location and also carries the count.
  // Only 24 bits are kept for the count, so it saturates: by convention a
  // stored value of 0xffffff means "0xffffff or more" (exact value unknown).
  uint32_t saturated_count = container.count;
  if (saturated_count > 0xFFFFFF) { saturated_count = 0xFFFFFF; }
  tape_writer::write(iter.dom_parser.doc->tape[open_index], next_tape_index(iter) | (uint64_t(saturated_count) << 32), start);
  return SUCCESS;
}
|
|
|
|
/**
 * Appends a STRING tape element holding the string's offset within the
 * document's string buffer, and returns the write position
 * sizeof(uint32_t) bytes past the current string buffer location.
 */
simdjson_inline uint8_t *tape_builder::on_start_string(json_iterator &iter) noexcept {
  const auto buffer_offset = current_string_buf_loc - iter.dom_parser.doc->string_buf.get();
  tape.append(buffer_offset, internal::tape_type::STRING);
  return current_string_buf_loc + sizeof(uint32_t);
}
|
|
|
|
/**
 * Completes the string whose bytes end at dst: stores its 32-bit length at
 * the current string buffer location, writes a terminating zero at dst, and
 * moves the string buffer location just past that terminator.
 */
simdjson_inline void tape_builder::on_end_string(uint8_t *dst) noexcept {
  uint8_t *const payload_begin = current_string_buf_loc + sizeof(uint32_t);
  const uint32_t str_length = uint32_t(dst - payload_begin);
  // TODO check for overflow in case someone has a crazy string (>=4GB?)
  // Only add that check once documents of 4GB or more are accepted; it is
  // currently unneeded because such documents are refused.
  memcpy(current_string_buf_loc, &str_length, sizeof(uint32_t));
  // NULL termination costs little and is handy for callers that expect
  // NULL-terminated strings.
  *dst = 0;
  current_string_buf_loc = dst + 1;
}
|
|
|
|
} // namespace stage2
|
|
} // unnamed namespace
|
|
} // namespace haswell
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE2_TAPE_BUILDER_H
|
|
/* end file generic/stage2/tape_builder.h for haswell */
|
|
/* end file generic/stage2/amalgamated.h for haswell */
|
|
|
|
//
|
|
// Stage 1
|
|
//
|
|
|
|
namespace simdjson {
|
|
namespace haswell {
|
|
|
|
/**
 * Allocates a haswell dom_parser_implementation into dst and configures it.
 *
 * @return MEMALLOC if the allocation fails, the first error reported by
 *         set_capacity() or set_max_depth(), or SUCCESS.
 */
simdjson_warn_unused error_code implementation::create_dom_parser_implementation(
    size_t capacity,
    size_t max_depth,
    std::unique_ptr<internal::dom_parser_implementation>& dst
) const noexcept {
  dst.reset( new (std::nothrow) dom_parser_implementation() );
  if (!dst) { return MEMALLOC; }
  const auto capacity_status = dst->set_capacity(capacity);
  if (capacity_status) { return capacity_status; }
  const auto depth_status = dst->set_max_depth(max_depth);
  if (depth_status) { return depth_status; }
  return SUCCESS;
}
|
|
|
|
namespace {
|
|
|
|
using namespace simd;
|
|
|
|
// This identifies structural characters (comma, colon, braces, brackets),
|
|
// and ASCII white-space ('\r','\n','\t',' ').
|
|
/**
 * Builds the whitespace and operator bitmasks for a 64-byte block, using two
 * 16-entry shuffle lookups.
 */
simdjson_inline json_character_block json_character_block::classify(const simd::simd8x64<uint8_t>& in) {
  // These lookups rely on the fact that anything < 127 will match the lower
  // 4 bits, which is why the generic lookup_16 cannot be used here.
  const auto ws_lookup = simd8<uint8_t>::repeat_16(' ', 100, 100, 100, 17, 100, 113, 2, 100, '\t', '\n', 112, 100, '\r', 100, 100);

  // The 6 operators (:,[]{}) have these values:
  //
  // , 2C
  // : 3A
  // [ 5B
  // { 7B
  // ] 5D
  // } 7D
  //
  // After | 0x20 turns [ and ] into { and }, the lower 4 bits of each
  // operator are unique. A 4-bit SIMD lookup therefore names the one
  // character each byte could match, and the byte (| 0x20) is compared
  // against it.
  //
  // Every other table entry is 0, which can never match because of the | 0x20.
  //
  // NOTE: Because of the | 0x20, <FF> and <SUB> (control characters 0C and 1A)
  // are ALSO treated like , and :. Stage 2 catches this, since it checks the
  // actual character to ensure the right operators are in the right places.
  const auto operator_lookup = simd8<uint8_t>::repeat_16(
    0, 0, 0, 0,
    0, 0, 0, 0,
    0, 0, ':', '{', // : = 3A, [ = 5B, { = 7B
    ',', '}', 0, 0  // , = 2C, ] = 5D, } = 7D
  );

  // Whitespace and op are computed independently. Everything here is
  // aggressively inlined, so if a caller uses only one of them we can hope
  // the other computation is dropped. This is notably the case when
  // minifying (only whitespace is needed).

  const uint64_t whitespace = in.eq({
    _mm256_shuffle_epi8(ws_lookup, in.chunks[0]),
    _mm256_shuffle_epi8(ws_lookup, in.chunks[1])
  });
  // Turn [ and ] into { and }
  const simd8x64<uint8_t> curlified{
    in.chunks[0] | 0x20,
    in.chunks[1] | 0x20
  };
  const uint64_t op = curlified.eq({
    _mm256_shuffle_epi8(operator_lookup, in.chunks[0]),
    _mm256_shuffle_epi8(operator_lookup, in.chunks[1])
  });

  return { whitespace, op };
}
|
|
|
|
/** ORs the chunks of the block together and asks the result whether it is ASCII. */
simdjson_inline bool is_ascii(const simd8x64<uint8_t>& input) {
  const auto combined = input.reduce_or();
  return combined.is_ascii();
}
|
|
|
|
/**
 * Flags the positions for which prev1 holds a byte of the form 11______,
 * prev2 a byte of the form 111_____, or prev3 a byte of the form 1111____.
 */
simdjson_unused simdjson_inline simd8<bool> must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
  simd8<uint8_t> after_2plus_lead = prev1.saturating_sub(0xc0u-1); // Only 11______ will be > 0
  simd8<uint8_t> after_3plus_lead = prev2.saturating_sub(0xe0u-1); // Only 111_____ will be > 0
  simd8<uint8_t> after_4byte_lead = prev3.saturating_sub(0xf0u-1); // Only 1111____ will be > 0
  // The caller needs a bool (all 1's). Every value produced by the
  // subtractions is <= 64, so a signed comparison is safe.
  const simd8<uint8_t> any_lead = after_2plus_lead | after_3plus_lead | after_4byte_lead;
  return simd8<int8_t>(any_lead) > int8_t(0);
}
|
|
|
|
/**
 * Returns a vector whose bytes are >= 0x80 where prev2 holds a byte of the
 * form 111_____ or prev3 a byte of the form 1111____.
 */
simdjson_inline simd8<uint8_t> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
  simd8<uint8_t> after_3plus_lead = prev2.saturating_sub(0xe0u-0x80); // Only 111_____ will be >= 0x80
  simd8<uint8_t> after_4byte_lead = prev3.saturating_sub(0xf0u-0x80); // Only 1111____ will be >= 0x80
  return after_3plus_lead | after_4byte_lead;
}
|
|
|
|
} // unnamed namespace
|
|
} // namespace haswell
|
|
} // namespace simdjson
|
|
|
|
//
|
|
// Stage 2
|
|
//
|
|
|
|
//
|
|
// Implementation-specific overrides
|
|
//
|
|
namespace simdjson {
|
|
namespace haswell {
|
|
|
|
/** Minifies buf into dst using the haswell stage 1 minifier instantiated with a step of 128. */
simdjson_warn_unused error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept {
  const error_code outcome = haswell::stage1::json_minifier::minify<128>(buf, len, dst, dst_len);
  return outcome;
}
|
|
|
|
/** Remembers the input buffer and its length on the parser, then runs the haswell structural indexer over it. */
simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, stage1_mode streaming) noexcept {
  // Stored on the parser before indexing starts.
  this->len = _len;
  this->buf = _buf;
  return haswell::stage1::json_structural_indexer::index<128>(_buf, _len, *this, streaming);
}
|
|
|
|
/** Validates the len bytes at buf with the haswell generic UTF-8 validator. */
simdjson_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
  const bool is_valid = haswell::stage1::generic_validate_utf8(buf,len);
  return is_valid;
}
|
|
|
|
/** Runs stage 2 in non-streaming mode: builds the tape of _doc starting at structural index 0. */
simdjson_warn_unused error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept {
  constexpr bool streaming = false;
  return stage2::tape_builder::parse_document<streaming>(*this, _doc);
}
|
|
|
|
/** Runs stage 2 in streaming mode: builds the tape of _doc starting at this parser's next_structural_index. */
simdjson_warn_unused error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept {
  constexpr bool streaming = true;
  return stage2::tape_builder::parse_document<streaming>(*this, _doc);
}
|
|
|
|
/** Forwards to the haswell string parser; see haswell::stringparsing::parse_string for the meaning of the result. */
simdjson_warn_unused uint8_t *dom_parser_implementation::parse_string(const uint8_t *src, uint8_t *dst, bool replacement_char) const noexcept {
  uint8_t *const written_end = haswell::stringparsing::parse_string(src, dst, replacement_char);
  return written_end;
}
|
|
|
|
/** Forwards to the haswell wobbly-string parser; see haswell::stringparsing::parse_wobbly_string for the meaning of the result. */
simdjson_warn_unused uint8_t *dom_parser_implementation::parse_wobbly_string(const uint8_t *src, uint8_t *dst) const noexcept {
  uint8_t *const written_end = haswell::stringparsing::parse_wobbly_string(src, dst);
  return written_end;
}
|
|
|
|
/** Full parse: runs stage 1 in regular mode, and stage 2 only if stage 1 reported no error. */
simdjson_warn_unused error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept {
  if (const auto stage1_status = stage1(_buf, _len, stage1_mode::regular)) {
    return stage1_status;
  }
  return stage2(_doc);
}
|
|
|
|
} // namespace haswell
|
|
} // namespace simdjson
|
|
|
|
/* including simdjson/haswell/end.h: #include <simdjson/haswell/end.h> */
|
|
/* begin file simdjson/haswell/end.h */
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/haswell/base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
#if !SIMDJSON_CAN_ALWAYS_RUN_HASWELL
|
|
SIMDJSON_UNTARGET_REGION
|
|
#endif
|
|
|
|
/* undefining SIMDJSON_IMPLEMENTATION from "haswell" */
|
|
#undef SIMDJSON_IMPLEMENTATION
|
|
/* end file simdjson/haswell/end.h */
|
|
|
|
#endif // SIMDJSON_SRC_HASWELL_CPP
|
|
/* end file haswell.cpp */
|
|
#endif
|
|
#if SIMDJSON_IMPLEMENTATION_ICELAKE
|
|
/* including icelake.cpp: #include <icelake.cpp> */
|
|
/* begin file icelake.cpp */
|
|
#ifndef SIMDJSON_SRC_ICELAKE_CPP
|
|
#define SIMDJSON_SRC_ICELAKE_CPP
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include <base.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
/* including simdjson/icelake.h: #include <simdjson/icelake.h> */
|
|
/* begin file simdjson/icelake.h */
|
|
#ifndef SIMDJSON_ICELAKE_H
|
|
#define SIMDJSON_ICELAKE_H
|
|
|
|
/* including simdjson/icelake/begin.h: #include "simdjson/icelake/begin.h" */
|
|
/* begin file simdjson/icelake/begin.h */
|
|
/* defining SIMDJSON_IMPLEMENTATION to "icelake" */
|
|
#define SIMDJSON_IMPLEMENTATION icelake
|
|
/* including simdjson/icelake/base.h: #include "simdjson/icelake/base.h" */
|
|
/* begin file simdjson/icelake/base.h */
|
|
#ifndef SIMDJSON_ICELAKE_BASE_H
|
|
#define SIMDJSON_ICELAKE_BASE_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
// The constructor may be executed on any host, so we take care not to use SIMDJSON_TARGET_ICELAKE
|
|
namespace simdjson {
|
|
/**
|
|
* Implementation for Icelake (Intel AVX512).
|
|
*/
|
|
namespace icelake {
|
|
|
|
class implementation;
|
|
|
|
} // namespace icelake
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_ICELAKE_BASE_H
|
|
/* end file simdjson/icelake/base.h */
|
|
/* including simdjson/icelake/intrinsics.h: #include "simdjson/icelake/intrinsics.h" */
|
|
/* begin file simdjson/icelake/intrinsics.h */
|
|
#ifndef SIMDJSON_ICELAKE_INTRINSICS_H
|
|
#define SIMDJSON_ICELAKE_INTRINSICS_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/icelake/base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
#if SIMDJSON_VISUAL_STUDIO
|
|
// under clang within visual studio, this will include <x86intrin.h>
|
|
#include <intrin.h> // visual studio or clang
|
|
#else
|
|
#include <x86intrin.h> // elsewhere
|
|
#endif // SIMDJSON_VISUAL_STUDIO
|
|
|
|
#if SIMDJSON_CLANG_VISUAL_STUDIO
|
|
/**
|
|
* You are not supposed, normally, to include these
|
|
* headers directly. Instead you should either include intrin.h
|
|
* or x86intrin.h. However, when compiling with clang
|
|
* under Windows (i.e., when _MSC_VER is set), these headers
|
|
* only get included *if* the corresponding features are detected
|
|
* from macros:
|
|
* e.g., if __AVX2__ is set... in turn, we normally set these
|
|
* macros by compiling against the corresponding architecture
|
|
* (e.g., arch:AVX2, -mavx2, etc.) which compiles the whole
|
|
* software with these advanced instructions. In simdjson, we
|
|
* want to compile the whole program for a generic target,
|
|
* and only target our specific kernels. As a workaround,
|
|
* we directly include the needed headers. These headers would
|
|
* normally guard against such usage, but we carefully included
|
|
* <x86intrin.h> (or <intrin.h>) before, so the headers
|
|
* are fooled.
|
|
*/
|
|
#include <bmiintrin.h> // for _blsr_u64
|
|
#include <lzcntintrin.h> // for __lzcnt64
|
|
#include <immintrin.h> // for most things (AVX2, AVX512, _popcnt64)
|
|
#include <smmintrin.h>
|
|
#include <tmmintrin.h>
|
|
#include <avxintrin.h>
|
|
#include <avx2intrin.h>
|
|
#include <wmmintrin.h> // for _mm_clmulepi64_si128
|
|
// Important: we need the AVX-512 headers:
|
|
#include <avx512fintrin.h>
|
|
#include <avx512dqintrin.h>
|
|
#include <avx512cdintrin.h>
|
|
#include <avx512bwintrin.h>
|
|
#include <avx512vlintrin.h>
|
|
#include <avx512vbmiintrin.h>
|
|
#include <avx512vbmi2intrin.h>
|
|
// unfortunately, we may not get _blsr_u64, but, thankfully, clang
|
|
// has it as a macro.
|
|
#ifndef _blsr_u64
// Fallback when no _blsr_u64 macro is available: clears the lowest set bit.
// The argument is parenthesised so that expressions such as `a | b` expand
// with the intended precedence. Note that n is evaluated twice, so it must
// not have side effects.
#define _blsr_u64(n) (((n) - 1) & (n))
#endif // _blsr_u64
|
|
#endif // SIMDJSON_CLANG_VISUAL_STUDIO
|
|
|
|
static_assert(sizeof(__m512i) <= simdjson::SIMDJSON_PADDING, "insufficient padding for icelake");
|
|
|
|
#endif // SIMDJSON_ICELAKE_INTRINSICS_H
|
|
/* end file simdjson/icelake/intrinsics.h */
|
|
|
|
#if !SIMDJSON_CAN_ALWAYS_RUN_ICELAKE
|
|
SIMDJSON_TARGET_REGION("avx512f,avx512dq,avx512cd,avx512bw,avx512vbmi,avx512vbmi2,avx512vl,avx2,bmi,pclmul,lzcnt,popcnt")
|
|
#endif
|
|
|
|
/* including simdjson/icelake/bitmanipulation.h: #include "simdjson/icelake/bitmanipulation.h" */
|
|
/* begin file simdjson/icelake/bitmanipulation.h */
|
|
#ifndef SIMDJSON_ICELAKE_BITMANIPULATION_H
|
|
#define SIMDJSON_ICELAKE_BITMANIPULATION_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/icelake/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/icelake/intrinsics.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace icelake {
|
|
namespace {
|
|
|
|
// We sometimes call trailing_zero on inputs that are zero,
|
|
// but the algorithms do not end up using the returned value.
|
|
// Sadly, sanitizers are not smart enough to figure it out.
|
|
// Sanitizers are not smart enough to see that the value returned for a zero
// input is never used by the algorithms, hence the annotation.
SIMDJSON_NO_SANITIZE_UNDEFINED
// Safe to call even when not all bytes of the input have been initialized.
// See issue https://github.com/simdjson/simdjson/issues/1965
SIMDJSON_NO_SANITIZE_MEMORY
simdjson_inline int trailing_zeroes(uint64_t input_num) {
#if SIMDJSON_REGULAR_VISUAL_STUDIO
  return static_cast<int>(_tzcnt_u64(input_num));
#else // SIMDJSON_REGULAR_VISUAL_STUDIO
  ////////
  // One might expect
  //    return (int)_tzcnt_u64(input_num);
  // to be equivalent here, but the generated code differs and might be
  // less efficient.
  ////////
  return __builtin_ctzll(input_num);
#endif // SIMDJSON_REGULAR_VISUAL_STUDIO
}
|
|
|
|
/* result might be undefined when input_num is zero */
|
|
/* Clears the lowest set bit of input_num via _blsr_u64; do not rely on the result for a zero input. */
simdjson_inline uint64_t clear_lowest_bit(uint64_t input_num) {
  const uint64_t without_lowest = _blsr_u64(input_num);
  return without_lowest;
}
|
|
|
|
/* result might be undefined when input_num is zero */
|
|
/* Leading zero bit count of input_num via _lzcnt_u64; do not rely on the result for a zero input. */
simdjson_inline int leading_zeroes(uint64_t input_num) {
  return static_cast<int>(_lzcnt_u64(input_num));
}
|
|
|
|
#if SIMDJSON_REGULAR_VISUAL_STUDIO
|
|
simdjson_inline unsigned __int64 count_ones(uint64_t input_num) {
|
|
// note: we do not support legacy 32-bit Windows
|
|
return __popcnt64(input_num);// Visual Studio wants two underscores
|
|
}
|
|
#else
|
|
simdjson_inline long long int count_ones(uint64_t input_num) {
|
|
return _popcnt64(input_num);
|
|
}
|
|
#endif
|
|
|
|
/**
 * Adds value1 and value2 and stores the sum in *result.
 * The returned flag comes straight from the carry/overflow intrinsic.
 */
simdjson_inline bool add_overflow(uint64_t value1, uint64_t value2,
                                  uint64_t *result) {
#if SIMDJSON_REGULAR_VISUAL_STUDIO
  unsigned __int64 *const sum_out = reinterpret_cast<unsigned __int64 *>(result);
  return _addcarry_u64(0, value1, value2, sum_out);
#else
  unsigned long long *const sum_out = reinterpret_cast<unsigned long long *>(result);
  return __builtin_uaddll_overflow(value1, value2, sum_out);
#endif
}
|
|
|
|
} // unnamed namespace
|
|
} // namespace icelake
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_ICELAKE_BITMANIPULATION_H
|
|
/* end file simdjson/icelake/bitmanipulation.h */
|
|
/* including simdjson/icelake/bitmask.h: #include "simdjson/icelake/bitmask.h" */
|
|
/* begin file simdjson/icelake/bitmask.h */
|
|
#ifndef SIMDJSON_ICELAKE_BITMASK_H
|
|
#define SIMDJSON_ICELAKE_BITMASK_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/icelake/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/icelake/intrinsics.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace icelake {
|
|
namespace {
|
|
|
|
//
|
|
// Perform a "cumulative bitwise xor," flipping bits each time a 1 is encountered.
|
|
//
|
|
// For example, prefix_xor(00100100) == 00011100
|
|
//
|
|
simdjson_inline uint64_t prefix_xor(const uint64_t bitmask) {
  // Implemented as a carry-less multiplication of the bitmask by an all-ones
  // operand. A processor that supports avx2 but lacks clmul should not exist,
  // so no fallback is provided.
  const __m128i all_ones = _mm_set1_epi8('\xFF');
  const __m128i mask_lane = _mm_set_epi64x(0ULL, bitmask);
  const __m128i product = _mm_clmulepi64_si128(mask_lane, all_ones, 0);
  return _mm_cvtsi128_si64(product);
}
|
|
|
|
} // unnamed namespace
|
|
} // namespace icelake
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_ICELAKE_BITMASK_H
|
|
/* end file simdjson/icelake/bitmask.h */
|
|
/* including simdjson/icelake/simd.h: #include "simdjson/icelake/simd.h" */
|
|
/* begin file simdjson/icelake/simd.h */
|
|
#ifndef SIMDJSON_ICELAKE_SIMD_H
|
|
#define SIMDJSON_ICELAKE_SIMD_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/icelake/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/icelake/intrinsics.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/icelake/bitmanipulation.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/simdprune_tables.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
#if defined(__GNUC__) && !defined(__clang__)
|
|
#if __GNUC__ == 8
|
|
#define SIMDJSON_GCC8 1
|
|
#endif // __GNUC__ == 8
|
|
#endif // defined(__GNUC__) && !defined(__clang__)
|
|
|
|
#if SIMDJSON_GCC8
|
|
/**
 * Stand-in for _mm512_set_epi8, which GCC 8 fails to provide.
 *
 * The sixty-four bytes are packed eight at a time into 64-bit lanes and handed
 * to _mm512_set_epi64. Within each group the first argument becomes the most
 * significant byte of its lane, and the first group (a0..a7) is passed as the
 * first argument of _mm512_set_epi64.
 */
inline __m512i _mm512_set_epi8(
    uint8_t a0, uint8_t a1, uint8_t a2, uint8_t a3, uint8_t a4, uint8_t a5, uint8_t a6, uint8_t a7,
    uint8_t a8, uint8_t a9, uint8_t a10, uint8_t a11, uint8_t a12, uint8_t a13, uint8_t a14, uint8_t a15,
    uint8_t a16, uint8_t a17, uint8_t a18, uint8_t a19, uint8_t a20, uint8_t a21, uint8_t a22, uint8_t a23,
    uint8_t a24, uint8_t a25, uint8_t a26, uint8_t a27, uint8_t a28, uint8_t a29, uint8_t a30, uint8_t a31,
    uint8_t a32, uint8_t a33, uint8_t a34, uint8_t a35, uint8_t a36, uint8_t a37, uint8_t a38, uint8_t a39,
    uint8_t a40, uint8_t a41, uint8_t a42, uint8_t a43, uint8_t a44, uint8_t a45, uint8_t a46, uint8_t a47,
    uint8_t a48, uint8_t a49, uint8_t a50, uint8_t a51, uint8_t a52, uint8_t a53, uint8_t a54, uint8_t a55,
    uint8_t a56, uint8_t a57, uint8_t a58, uint8_t a59, uint8_t a60, uint8_t a61, uint8_t a62, uint8_t a63) {
  // Packs eight bytes into one 64-bit value; the first argument ends up in
  // the most significant byte. The byte fields never overlap, so OR-ing them
  // gives the same result as adding them.
  const auto pack8 = [](uint8_t b7, uint8_t b6, uint8_t b5, uint8_t b4,
                        uint8_t b3, uint8_t b2, uint8_t b1, uint8_t b0) -> uint64_t {
    return (uint64_t(b7) << 56) | (uint64_t(b6) << 48) | (uint64_t(b5) << 40) | (uint64_t(b4) << 32)
         | (uint64_t(b3) << 24) | (uint64_t(b2) << 16) | (uint64_t(b1) << 8) | uint64_t(b0);
  };
  return _mm512_set_epi64(
      pack8(a0, a1, a2, a3, a4, a5, a6, a7),
      pack8(a8, a9, a10, a11, a12, a13, a14, a15),
      pack8(a16, a17, a18, a19, a20, a21, a22, a23),
      pack8(a24, a25, a26, a27, a28, a29, a30, a31),
      pack8(a32, a33, a34, a35, a36, a37, a38, a39),
      pack8(a40, a41, a42, a43, a44, a45, a46, a47),
      pack8(a48, a49, a50, a51, a52, a53, a54, a55),
      pack8(a56, a57, a58, a59, a60, a61, a62, a63));
}
|
|
#endif // SIMDJSON_GCC8
|
|
|
|
|
|
|
|
namespace simdjson {
|
|
namespace icelake {
|
|
namespace {
|
|
namespace simd {
|
|
|
|
// Forward-declared so they can be used by splat and friends.
|
|
template<typename Child>
|
|
struct base {
|
|
__m512i value;
|
|
|
|
// Zero constructor
|
|
simdjson_inline base() : value{__m512i()} {}
|
|
|
|
// Conversion from SIMD register
|
|
simdjson_inline base(const __m512i _value) : value(_value) {}
|
|
|
|
// Conversion to SIMD register
|
|
simdjson_inline operator const __m512i&() const { return this->value; }
|
|
simdjson_inline operator __m512i&() { return this->value; }
|
|
|
|
// Bit operations
|
|
simdjson_inline Child operator|(const Child other) const { return _mm512_or_si512(*this, other); }
|
|
simdjson_inline Child operator&(const Child other) const { return _mm512_and_si512(*this, other); }
|
|
simdjson_inline Child operator^(const Child other) const { return _mm512_xor_si512(*this, other); }
|
|
simdjson_inline Child bit_andnot(const Child other) const { return _mm512_andnot_si512(other, *this); }
|
|
simdjson_inline Child& operator|=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast | other; return *this_cast; }
|
|
simdjson_inline Child& operator&=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast & other; return *this_cast; }
|
|
simdjson_inline Child& operator^=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast ^ other; return *this_cast; }
|
|
};
|
|
|
|
// Forward-declared so they can be used by splat and friends.
|
|
template<typename T>
|
|
struct simd8;
|
|
|
|
template<typename T, typename Mask=simd8<bool>>
|
|
struct base8: base<simd8<T>> {
|
|
typedef uint32_t bitmask_t;
|
|
typedef uint64_t bitmask2_t;
|
|
|
|
simdjson_inline base8() : base<simd8<T>>() {}
|
|
simdjson_inline base8(const __m512i _value) : base<simd8<T>>(_value) {}
|
|
|
|
friend simdjson_really_inline uint64_t operator==(const simd8<T> lhs, const simd8<T> rhs) {
|
|
return _mm512_cmpeq_epi8_mask(lhs, rhs);
|
|
}
|
|
|
|
static const int SIZE = sizeof(base<T>::value);
|
|
|
|
template<int N=1>
|
|
simdjson_inline simd8<T> prev(const simd8<T> prev_chunk) const {
|
|
// workaround for compilers unable to figure out that 16 - N is a constant (GCC 8)
|
|
constexpr int shift = 16 - N;
|
|
return _mm512_alignr_epi8(*this, _mm512_permutex2var_epi64(prev_chunk, _mm512_set_epi64(13, 12, 11, 10, 9, 8, 7, 6), *this), shift);
|
|
}
|
|
};
|
|
|
|
// SIMD byte mask type (returned by things like eq and gt)
|
|
template<>
|
|
struct simd8<bool>: base8<bool> {
|
|
static simdjson_inline simd8<bool> splat(bool _value) { return _mm512_set1_epi8(uint8_t(-(!!_value))); }
|
|
|
|
simdjson_inline simd8() : base8() {}
|
|
simdjson_inline simd8(const __m512i _value) : base8<bool>(_value) {}
|
|
// Splat constructor
|
|
simdjson_inline simd8(bool _value) : base8<bool>(splat(_value)) {}
|
|
simdjson_inline bool any() const { return !!_mm512_test_epi8_mask (*this, *this); }
|
|
simdjson_inline simd8<bool> operator~() const { return *this ^ true; }
|
|
};
|
|
|
|
template<typename T>
|
|
struct base8_numeric: base8<T> {
|
|
static simdjson_inline simd8<T> splat(T _value) { return _mm512_set1_epi8(_value); }
|
|
static simdjson_inline simd8<T> zero() { return _mm512_setzero_si512(); }
|
|
static simdjson_inline simd8<T> load(const T values[64]) {
|
|
return _mm512_loadu_si512(reinterpret_cast<const __m512i *>(values));
|
|
}
|
|
// Repeat 16 values as many times as necessary (usually for lookup tables)
|
|
static simdjson_inline simd8<T> repeat_16(
|
|
T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7,
|
|
T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15
|
|
) {
|
|
return simd8<T>(
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15,
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15,
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15,
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15
|
|
);
|
|
}
|
|
|
|
simdjson_inline base8_numeric() : base8<T>() {}
|
|
simdjson_inline base8_numeric(const __m512i _value) : base8<T>(_value) {}
|
|
|
|
// Store to array
|
|
simdjson_inline void store(T dst[64]) const { return _mm512_storeu_si512(reinterpret_cast<__m512i *>(dst), *this); }
|
|
|
|
// Addition/subtraction are the same for signed and unsigned
|
|
simdjson_inline simd8<T> operator+(const simd8<T> other) const { return _mm512_add_epi8(*this, other); }
|
|
simdjson_inline simd8<T> operator-(const simd8<T> other) const { return _mm512_sub_epi8(*this, other); }
|
|
simdjson_inline simd8<T>& operator+=(const simd8<T> other) { *this = *this + other; return *static_cast<simd8<T>*>(this); }
|
|
simdjson_inline simd8<T>& operator-=(const simd8<T> other) { *this = *this - other; return *static_cast<simd8<T>*>(this); }
|
|
|
|
// Override to distinguish from bool version
|
|
simdjson_inline simd8<T> operator~() const { return *this ^ 0xFFu; }
|
|
|
|
// Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values)
|
|
template<typename L>
|
|
simdjson_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
|
|
return _mm512_shuffle_epi8(lookup_table, *this);
|
|
}
|
|
|
|
// Copies to 'output" all bytes corresponding to a 0 in the mask (interpreted as a bitset).
|
|
// Passing a 0 value for mask would be equivalent to writing out every byte to output.
|
|
// Only the first 32 - count_ones(mask) bytes of the result are significant but 32 bytes
|
|
// get written.
|
|
// Design consideration: it seems like a function with the
|
|
// signature simd8<L> compress(uint32_t mask) would be
|
|
// sensible, but the AVX ISA makes this kind of approach difficult.
|
|
template<typename L>
|
|
simdjson_inline void compress(uint64_t mask, L * output) const {
|
|
_mm512_mask_compressstoreu_epi8 (output,~mask,*this);
|
|
}
|
|
|
|
template<typename L>
|
|
simdjson_inline simd8<L> lookup_16(
|
|
L replace0, L replace1, L replace2, L replace3,
|
|
L replace4, L replace5, L replace6, L replace7,
|
|
L replace8, L replace9, L replace10, L replace11,
|
|
L replace12, L replace13, L replace14, L replace15) const {
|
|
return lookup_16(simd8<L>::repeat_16(
|
|
replace0, replace1, replace2, replace3,
|
|
replace4, replace5, replace6, replace7,
|
|
replace8, replace9, replace10, replace11,
|
|
replace12, replace13, replace14, replace15
|
|
));
|
|
}
|
|
};
|
|
|
|
// Signed bytes
|
|
template<>
|
|
struct simd8<int8_t> : base8_numeric<int8_t> {
|
|
simdjson_inline simd8() : base8_numeric<int8_t>() {}
|
|
simdjson_inline simd8(const __m512i _value) : base8_numeric<int8_t>(_value) {}
|
|
// Splat constructor
|
|
simdjson_inline simd8(int8_t _value) : simd8(splat(_value)) {}
|
|
// Array constructor
|
|
simdjson_inline simd8(const int8_t values[64]) : simd8(load(values)) {}
|
|
// Member-by-member initialization
|
|
simdjson_inline simd8(
|
|
int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
|
|
int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15,
|
|
int8_t v16, int8_t v17, int8_t v18, int8_t v19, int8_t v20, int8_t v21, int8_t v22, int8_t v23,
|
|
int8_t v24, int8_t v25, int8_t v26, int8_t v27, int8_t v28, int8_t v29, int8_t v30, int8_t v31,
|
|
int8_t v32, int8_t v33, int8_t v34, int8_t v35, int8_t v36, int8_t v37, int8_t v38, int8_t v39,
|
|
int8_t v40, int8_t v41, int8_t v42, int8_t v43, int8_t v44, int8_t v45, int8_t v46, int8_t v47,
|
|
int8_t v48, int8_t v49, int8_t v50, int8_t v51, int8_t v52, int8_t v53, int8_t v54, int8_t v55,
|
|
int8_t v56, int8_t v57, int8_t v58, int8_t v59, int8_t v60, int8_t v61, int8_t v62, int8_t v63
|
|
) : simd8(_mm512_set_epi8(
|
|
v63, v62, v61, v60, v59, v58, v57, v56,
|
|
v55, v54, v53, v52, v51, v50, v49, v48,
|
|
v47, v46, v45, v44, v43, v42, v41, v40,
|
|
v39, v38, v37, v36, v35, v34, v33, v32,
|
|
v31, v30, v29, v28, v27, v26, v25, v24,
|
|
v23, v22, v21, v20, v19, v18, v17, v16,
|
|
v15, v14, v13, v12, v11, v10, v9, v8,
|
|
v7, v6, v5, v4, v3, v2, v1, v0
|
|
)) {}
|
|
|
|
// Repeat 16 values as many times as necessary (usually for lookup tables)
|
|
simdjson_inline static simd8<int8_t> repeat_16(
|
|
int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
|
|
int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
|
|
) {
|
|
return simd8<int8_t>(
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15,
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15,
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15,
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15
|
|
);
|
|
}
|
|
|
|
// Order-sensitive comparisons
|
|
simdjson_inline simd8<int8_t> max_val(const simd8<int8_t> other) const { return _mm512_max_epi8(*this, other); }
|
|
simdjson_inline simd8<int8_t> min_val(const simd8<int8_t> other) const { return _mm512_min_epi8(*this, other); }
|
|
|
|
simdjson_inline simd8<bool> operator>(const simd8<int8_t> other) const { return _mm512_maskz_abs_epi8(_mm512_cmpgt_epi8_mask(*this, other),_mm512_set1_epi8(uint8_t(0x80))); }
|
|
simdjson_inline simd8<bool> operator<(const simd8<int8_t> other) const { return _mm512_maskz_abs_epi8(_mm512_cmpgt_epi8_mask(other, *this),_mm512_set1_epi8(uint8_t(0x80))); }
|
|
};
|
|
|
|
// Unsigned bytes
|
|
template<>
|
|
struct simd8<uint8_t>: base8_numeric<uint8_t> {
|
|
simdjson_inline simd8() : base8_numeric<uint8_t>() {}
|
|
simdjson_inline simd8(const __m512i _value) : base8_numeric<uint8_t>(_value) {}
|
|
// Splat constructor
|
|
simdjson_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
|
|
// Array constructor
|
|
simdjson_inline simd8(const uint8_t values[64]) : simd8(load(values)) {}
|
|
// Member-by-member initialization
|
|
simdjson_inline simd8(
|
|
uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
|
|
uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15,
|
|
uint8_t v16, uint8_t v17, uint8_t v18, uint8_t v19, uint8_t v20, uint8_t v21, uint8_t v22, uint8_t v23,
|
|
uint8_t v24, uint8_t v25, uint8_t v26, uint8_t v27, uint8_t v28, uint8_t v29, uint8_t v30, uint8_t v31,
|
|
uint8_t v32, uint8_t v33, uint8_t v34, uint8_t v35, uint8_t v36, uint8_t v37, uint8_t v38, uint8_t v39,
|
|
uint8_t v40, uint8_t v41, uint8_t v42, uint8_t v43, uint8_t v44, uint8_t v45, uint8_t v46, uint8_t v47,
|
|
uint8_t v48, uint8_t v49, uint8_t v50, uint8_t v51, uint8_t v52, uint8_t v53, uint8_t v54, uint8_t v55,
|
|
uint8_t v56, uint8_t v57, uint8_t v58, uint8_t v59, uint8_t v60, uint8_t v61, uint8_t v62, uint8_t v63
|
|
) : simd8(_mm512_set_epi8(
|
|
v63, v62, v61, v60, v59, v58, v57, v56,
|
|
v55, v54, v53, v52, v51, v50, v49, v48,
|
|
v47, v46, v45, v44, v43, v42, v41, v40,
|
|
v39, v38, v37, v36, v35, v34, v33, v32,
|
|
v31, v30, v29, v28, v27, v26, v25, v24,
|
|
v23, v22, v21, v20, v19, v18, v17, v16,
|
|
v15, v14, v13, v12, v11, v10, v9, v8,
|
|
v7, v6, v5, v4, v3, v2, v1, v0
|
|
)) {}
|
|
|
|
// Repeat 16 values as many times as necessary (usually for lookup tables)
|
|
simdjson_inline static simd8<uint8_t> repeat_16(
|
|
uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
|
|
uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
|
|
) {
|
|
return simd8<uint8_t>(
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15,
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15,
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15,
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15
|
|
);
|
|
}
|
|
|
|
// Saturated math
|
|
simdjson_inline simd8<uint8_t> saturating_add(const simd8<uint8_t> other) const { return _mm512_adds_epu8(*this, other); }
|
|
simdjson_inline simd8<uint8_t> saturating_sub(const simd8<uint8_t> other) const { return _mm512_subs_epu8(*this, other); }
|
|
|
|
// Order-specific operations
|
|
simdjson_inline simd8<uint8_t> max_val(const simd8<uint8_t> other) const { return _mm512_max_epu8(*this, other); }
|
|
simdjson_inline simd8<uint8_t> min_val(const simd8<uint8_t> other) const { return _mm512_min_epu8(other, *this); }
|
|
// Same as >, but only guarantees true is nonzero (< guarantees true = -1)
|
|
simdjson_inline simd8<uint8_t> gt_bits(const simd8<uint8_t> other) const { return this->saturating_sub(other); }
|
|
// Same as <, but only guarantees true is nonzero (< guarantees true = -1)
|
|
simdjson_inline simd8<uint8_t> lt_bits(const simd8<uint8_t> other) const { return other.saturating_sub(*this); }
|
|
simdjson_inline uint64_t operator<=(const simd8<uint8_t> other) const { return other.max_val(*this) == other; }
|
|
simdjson_inline uint64_t operator>=(const simd8<uint8_t> other) const { return other.min_val(*this) == other; }
|
|
simdjson_inline simd8<bool> operator>(const simd8<uint8_t> other) const { return this->gt_bits(other).any_bits_set(); }
|
|
simdjson_inline simd8<bool> operator<(const simd8<uint8_t> other) const { return this->lt_bits(other).any_bits_set(); }
|
|
|
|
// Bit-specific operations
|
|
simdjson_inline simd8<bool> bits_not_set() const { return _mm512_mask_blend_epi8(*this == uint8_t(0), _mm512_set1_epi8(0), _mm512_set1_epi8(-1)); }
|
|
simdjson_inline simd8<bool> bits_not_set(simd8<uint8_t> bits) const { return (*this & bits).bits_not_set(); }
|
|
simdjson_inline simd8<bool> any_bits_set() const { return ~this->bits_not_set(); }
|
|
simdjson_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const { return ~this->bits_not_set(bits); }
|
|
|
|
simdjson_inline bool is_ascii() const { return _mm512_movepi8_mask(*this) == 0; }
|
|
simdjson_inline bool bits_not_set_anywhere() const {
|
|
return !_mm512_test_epi8_mask(*this, *this);
|
|
}
|
|
simdjson_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); }
|
|
simdjson_inline bool bits_not_set_anywhere(simd8<uint8_t> bits) const { return !_mm512_test_epi8_mask(*this, bits); }
|
|
simdjson_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const { return !bits_not_set_anywhere(bits); }
|
|
template<int N>
|
|
simdjson_inline simd8<uint8_t> shr() const { return simd8<uint8_t>(_mm512_srli_epi16(*this, N)) & uint8_t(0xFFu >> N); }
|
|
template<int N>
|
|
simdjson_inline simd8<uint8_t> shl() const { return simd8<uint8_t>(_mm512_slli_epi16(*this, N)) & uint8_t(0xFFu << N); }
|
|
// Get one of the bits and make a bitmask out of it.
|
|
// e.g. value.get_bit<7>() gets the high bit
|
|
template<int N>
|
|
simdjson_inline uint64_t get_bit() const { return _mm512_movepi8_mask(_mm512_slli_epi16(*this, 7-N)); }
|
|
};
|
|
|
|
template<typename T>
|
|
struct simd8x64 {
|
|
static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
|
|
static_assert(NUM_CHUNKS == 1, "Icelake kernel should use one register per 64-byte block.");
|
|
const simd8<T> chunks[NUM_CHUNKS];
|
|
|
|
simd8x64(const simd8x64<T>& o) = delete; // no copy allowed
|
|
simd8x64<T>& operator=(const simd8<T>& other) = delete; // no assignment allowed
|
|
simd8x64() = delete; // no default constructor allowed
|
|
|
|
simdjson_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1) : chunks{chunk0, chunk1} {}
|
|
simdjson_inline simd8x64(const simd8<T> chunk0) : chunks{chunk0} {}
|
|
simdjson_inline simd8x64(const T ptr[64]) : chunks{simd8<T>::load(ptr)} {}
|
|
|
|
simdjson_inline uint64_t compress(uint64_t mask, T * output) const {
|
|
this->chunks[0].compress(mask, output);
|
|
return 64 - count_ones(mask);
|
|
}
|
|
|
|
simdjson_inline void store(T ptr[64]) const {
|
|
this->chunks[0].store(ptr+sizeof(simd8<T>)*0);
|
|
}
|
|
|
|
simdjson_inline simd8<T> reduce_or() const {
|
|
return this->chunks[0];
|
|
}
|
|
|
|
simdjson_inline simd8x64<T> bit_or(const T m) const {
|
|
const simd8<T> mask = simd8<T>::splat(m);
|
|
return simd8x64<T>(
|
|
this->chunks[0] | mask
|
|
);
|
|
}
|
|
|
|
simdjson_inline uint64_t eq(const T m) const {
|
|
const simd8<T> mask = simd8<T>::splat(m);
|
|
return this->chunks[0] == mask;
|
|
}
|
|
|
|
simdjson_inline uint64_t eq(const simd8x64<uint8_t> &other) const {
|
|
return this->chunks[0] == other.chunks[0];
|
|
}
|
|
|
|
simdjson_inline uint64_t lteq(const T m) const {
|
|
const simd8<T> mask = simd8<T>::splat(m);
|
|
return this->chunks[0] <= mask;
|
|
}
|
|
}; // struct simd8x64<T>
|
|
|
|
} // namespace simd
|
|
|
|
} // unnamed namespace
|
|
} // namespace icelake
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_ICELAKE_SIMD_H
|
|
/* end file simdjson/icelake/simd.h */
|
|
/* including simdjson/icelake/stringparsing_defs.h: #include "simdjson/icelake/stringparsing_defs.h" */
|
|
/* begin file simdjson/icelake/stringparsing_defs.h */
|
|
#ifndef SIMDJSON_ICELAKE_STRINGPARSING_DEFS_H
|
|
#define SIMDJSON_ICELAKE_STRINGPARSING_DEFS_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/icelake/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/icelake/simd.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/icelake/bitmanipulation.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace icelake {
|
|
namespace {
|
|
|
|
using namespace simd;
|
|
|
|
// Holds backslashes and quotes locations.
|
|
struct backslash_and_quote {
|
|
public:
|
|
static constexpr uint32_t BYTES_PROCESSED = 64;
|
|
simdjson_inline static backslash_and_quote copy_and_find(const uint8_t *src, uint8_t *dst);
|
|
|
|
simdjson_inline bool has_quote_first() { return ((bs_bits - 1) & quote_bits) != 0; }
|
|
simdjson_inline bool has_backslash() { return ((quote_bits - 1) & bs_bits) != 0; }
|
|
simdjson_inline int quote_index() { return trailing_zeroes(quote_bits); }
|
|
simdjson_inline int backslash_index() { return trailing_zeroes(bs_bits); }
|
|
|
|
uint64_t bs_bits;
|
|
uint64_t quote_bits;
|
|
}; // struct backslash_and_quote
|
|
|
|
// Copies one 64-byte block from src to dst and reports where backslashes and
// double quotes occur in it.
simdjson_inline backslash_and_quote backslash_and_quote::copy_and_find(const uint8_t *src, uint8_t *dst) {
  // The 64-byte load can read up to BYTES_PROCESSED - 1 (63) bytes beyond the
  // end of the string being parsed, which is why SIMDJSON_PADDING bytes of
  // padding are required.
  static_assert(SIMDJSON_PADDING >= (BYTES_PROCESSED - 1), "backslash and quote finder must process fewer than SIMDJSON_PADDING bytes");
  simd8<uint8_t> v(src);
  // store to dest unconditionally - we can overwrite the bits we don't like later
  v.store(dst);
  return {
      static_cast<uint64_t>(v == '\\'), // bs_bits
      static_cast<uint64_t>(v == '"'), // quote_bits
  };
}
|
|
|
|
} // unnamed namespace
|
|
} // namespace icelake
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_ICELAKE_STRINGPARSING_DEFS_H
|
|
/* end file simdjson/icelake/stringparsing_defs.h */
|
|
/* including simdjson/icelake/numberparsing_defs.h: #include "simdjson/icelake/numberparsing_defs.h" */
|
|
/* begin file simdjson/icelake/numberparsing_defs.h */
|
|
#ifndef SIMDJSON_ICELAKE_NUMBERPARSING_DEFS_H
|
|
#define SIMDJSON_ICELAKE_NUMBERPARSING_DEFS_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/icelake/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/icelake/intrinsics.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/numberparsing_tables.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace icelake {
|
|
namespace numberparsing {
|
|
|
|
// Converts the eight ASCII digits at chars into their numeric value.
// The characters are not validated, and a full 16 bytes are loaded from chars
// (so twice the needed work is done); only the first eight contribute to the result.
static simdjson_inline uint32_t parse_eight_digits_unrolled(const uint8_t *chars) {
  const __m128i zero_char = _mm_set1_epi8('0');
  const __m128i weights_10_1 =
      _mm_setr_epi8(10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1);
  const __m128i weights_100_1 = _mm_setr_epi16(100, 1, 100, 1, 100, 1, 100, 1);
  const __m128i weights_10000_1 =
      _mm_setr_epi16(10000, 1, 10000, 1, 10000, 1, 10000, 1);

  // Turn the characters into digit values.
  const __m128i raw = _mm_loadu_si128(reinterpret_cast<const __m128i *>(chars));
  const __m128i digits = _mm_sub_epi8(raw, zero_char);
  // Combine neighbouring digits into two-digit numbers...
  const __m128i pairs = _mm_maddubs_epi16(digits, weights_10_1);
  // ...neighbouring two-digit numbers into four-digit numbers...
  const __m128i quads = _mm_madd_epi16(pairs, weights_100_1);
  // ...narrow those back to 16 bits so they can be multiplied again...
  const __m128i packed = _mm_packus_epi32(quads, quads);
  // ...and finally neighbouring four-digit numbers into eight-digit numbers.
  const __m128i octets = _mm_madd_epi16(packed, weights_10000_1);
  // The lowest 32-bit lane holds the value of the first eight digits; the rest is dropped.
  return _mm_cvtsi128_si32(octets);
}
|
|
|
|
/** @private */
|
|
simdjson_inline internal::value128 full_multiplication(uint64_t value1, uint64_t value2) {
|
|
internal::value128 answer;
|
|
#if SIMDJSON_REGULAR_VISUAL_STUDIO || SIMDJSON_IS_32BITS
|
|
#if SIMDJSON_IS_ARM64
|
|
// ARM64 has native support for 64-bit multiplications, no need to emultate
|
|
answer.high = __umulh(value1, value2);
|
|
answer.low = value1 * value2;
|
|
#else
|
|
answer.low = _umul128(value1, value2, &answer.high); // _umul128 not available on ARM64
|
|
#endif // SIMDJSON_IS_ARM64
|
|
#else // SIMDJSON_REGULAR_VISUAL_STUDIO || SIMDJSON_IS_32BITS
|
|
__uint128_t r = (static_cast<__uint128_t>(value1)) * value2;
|
|
answer.low = uint64_t(r);
|
|
answer.high = uint64_t(r >> 64);
|
|
#endif
|
|
return answer;
|
|
}
|
|
|
|
} // namespace numberparsing
|
|
} // namespace icelake
|
|
} // namespace simdjson
|
|
|
|
#define SIMDJSON_SWAR_NUMBER_PARSING 1
|
|
|
|
#endif // SIMDJSON_ICELAKE_NUMBERPARSING_DEFS_H
|
|
/* end file simdjson/icelake/numberparsing_defs.h */
|
|
/* end file simdjson/icelake/begin.h */
|
|
/* including simdjson/generic/amalgamated.h for icelake: #include "simdjson/generic/amalgamated.h" */
|
|
/* begin file simdjson/generic/amalgamated.h for icelake */
|
|
#if defined(SIMDJSON_CONDITIONAL_INCLUDE) && !defined(SIMDJSON_GENERIC_DEPENDENCIES_H)
|
|
#error simdjson/generic/dependencies.h must be included before simdjson/generic/amalgamated.h!
|
|
#endif
|
|
|
|
/* including simdjson/generic/base.h for icelake: #include "simdjson/generic/base.h" */
|
|
/* begin file simdjson/generic/base.h for icelake */
|
|
#ifndef SIMDJSON_GENERIC_BASE_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_GENERIC_BASE_H */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/base.h" */
|
|
/* amalgamation skipped (editor-only): // If we haven't got an implementation yet, we're in the editor, editing a generic file! Just */
|
|
/* amalgamation skipped (editor-only): // use the most advanced one we can so the most possible stuff can be tested. */
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_IMPLEMENTATION */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/implementation_detection.h" */
|
|
/* amalgamation skipped (editor-only): #if SIMDJSON_IMPLEMENTATION_ICELAKE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/icelake/begin.h" */
|
|
/* amalgamation skipped (editor-only): #elif SIMDJSON_IMPLEMENTATION_HASWELL */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/haswell/begin.h" */
|
|
/* amalgamation skipped (editor-only): #elif SIMDJSON_IMPLEMENTATION_WESTMERE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/westmere/begin.h" */
|
|
/* amalgamation skipped (editor-only): #elif SIMDJSON_IMPLEMENTATION_ARM64 */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/arm64/begin.h" */
|
|
/* amalgamation skipped (editor-only): #elif SIMDJSON_IMPLEMENTATION_PPC64 */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/ppc64/begin.h" */
|
|
/* amalgamation skipped (editor-only): #elif SIMDJSON_IMPLEMENTATION_LSX */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/lsx/begin.h" */
|
|
/* amalgamation skipped (editor-only): #elif SIMDJSON_IMPLEMENTATION_LASX */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/lasx/begin.h" */
|
|
/* amalgamation skipped (editor-only): #elif SIMDJSON_IMPLEMENTATION_FALLBACK */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/fallback/begin.h" */
|
|
/* amalgamation skipped (editor-only): #else */
|
|
/* amalgamation skipped (editor-only): #error "All possible implementations (including fallback) have been disabled! simdjson will not run." */
|
|
/* amalgamation skipped (editor-only): #endif */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_IMPLEMENTATION */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace icelake {
|
|
|
|
struct open_container;
|
|
class dom_parser_implementation;
|
|
|
|
/**
 * The type of a JSON number
 *
 * Each enumerator is documented with a trailing `///<` comment so that the
 * description attaches to the enumerator on the same line.
 */
enum class number_type {
    floating_point_number=1, ///< a binary64 number
    signed_integer,          ///< a signed integer that fits in a 64-bit word using two's complement
    unsigned_integer,        ///< a positive integer larger or equal to 1<<63
    big_integer              ///< a big integer that does not fit in a 64-bit word
};
|
|
|
|
} // namespace icelake
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_GENERIC_BASE_H
|
|
/* end file simdjson/generic/base.h for icelake */
|
|
/* including simdjson/generic/jsoncharutils.h for icelake: #include "simdjson/generic/jsoncharutils.h" */
|
|
/* begin file simdjson/generic/jsoncharutils.h for icelake */
|
|
#ifndef SIMDJSON_GENERIC_JSONCHARUTILS_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_GENERIC_JSONCHARUTILS_H */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/generic/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/jsoncharutils_tables.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/numberparsing_tables.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace icelake {
|
|
namespace {
|
|
namespace jsoncharutils {
|
|
|
|
// return non-zero if not a structural or whitespace char
|
|
// zero otherwise
|
|
simdjson_inline uint32_t is_not_structural_or_whitespace(uint8_t c) {
|
|
return internal::structural_or_whitespace_negated[c];
|
|
}
|
|
|
|
// Table lookup on the byte c using internal::structural_or_whitespace, the
// counterpart of the negated table used by is_not_structural_or_whitespace.
simdjson_inline uint32_t is_structural_or_whitespace(uint8_t c) {
  const uint32_t flag = internal::structural_or_whitespace[c];
  return flag;
}
|
|
|
|
// returns a value with the high 16 bits set if not valid
|
|
// otherwise returns the conversion of the 4 hex digits at src into the bottom
|
|
// 16 bits of the 32-bit return register
|
|
//
|
|
// see
|
|
// https://lemire.me/blog/2019/04/17/parsing-short-hexadecimal-strings-efficiently/
|
|
static inline uint32_t hex_to_u32_nocheck(
|
|
const uint8_t *src) { // strictly speaking, static inline is a C-ism
|
|
uint32_t v1 = internal::digit_to_val32[630 + src[0]];
|
|
uint32_t v2 = internal::digit_to_val32[420 + src[1]];
|
|
uint32_t v3 = internal::digit_to_val32[210 + src[2]];
|
|
uint32_t v4 = internal::digit_to_val32[0 + src[3]];
|
|
return v1 | v2 | v3 | v4;
|
|
}
|
|
|
|
// given a code point cp, writes to c
|
|
// the utf-8 code, outputting the length in
|
|
// bytes, if the length is zero, the code point
|
|
// is invalid
|
|
//
|
|
// This can possibly be made faster using pdep
|
|
// and clz and table lookups, but JSON documents
|
|
// have few escaped code points, and the following
|
|
// function looks cheap.
|
|
//
|
|
// Note: we assume that surrogates are treated separately
|
|
//
|
|
simdjson_inline size_t codepoint_to_utf8(uint32_t cp, uint8_t *c) {
|
|
if (cp <= 0x7F) {
|
|
c[0] = uint8_t(cp);
|
|
return 1; // ascii
|
|
}
|
|
if (cp <= 0x7FF) {
|
|
c[0] = uint8_t((cp >> 6) + 192);
|
|
c[1] = uint8_t((cp & 63) + 128);
|
|
return 2; // universal plane
|
|
// Surrogates are treated elsewhere...
|
|
//} //else if (0xd800 <= cp && cp <= 0xdfff) {
|
|
// return 0; // surrogates // could put assert here
|
|
} else if (cp <= 0xFFFF) {
|
|
c[0] = uint8_t((cp >> 12) + 224);
|
|
c[1] = uint8_t(((cp >> 6) & 63) + 128);
|
|
c[2] = uint8_t((cp & 63) + 128);
|
|
return 3;
|
|
} else if (cp <= 0x10FFFF) { // if you know you have a valid code point, this
|
|
// is not needed
|
|
c[0] = uint8_t((cp >> 18) + 240);
|
|
c[1] = uint8_t(((cp >> 12) & 63) + 128);
|
|
c[2] = uint8_t(((cp >> 6) & 63) + 128);
|
|
c[3] = uint8_t((cp & 63) + 128);
|
|
return 4;
|
|
}
|
|
// will return 0 when the code point was too large.
|
|
return 0; // bad r
|
|
}
|
|
|
|
#if SIMDJSON_IS_32BITS // _umul128 for x86, arm
// Slow emulation routines used on 32-bit targets.
//
// 32-bit x 32-bit -> 64-bit multiplication.
static simdjson_inline uint64_t __emulu(uint32_t x, uint32_t y) {
  return static_cast<uint64_t>(x) * y;
}
// 64-bit x 64-bit -> 128-bit multiplication. Returns the low 64 bits of the
// product and stores the high 64 bits in *hi. With ab = a*2^32 + b and
// cd = c*2^32 + d, the product is assembled from the four 32-bit partial
// products a*c, a*d, b*c and b*d.
static simdjson_inline uint64_t _umul128(uint64_t ab, uint64_t cd, uint64_t *hi) {
  const uint32_t a = static_cast<uint32_t>(ab >> 32);
  const uint32_t b = static_cast<uint32_t>(ab);
  const uint32_t c = static_cast<uint32_t>(cd >> 32);
  const uint32_t d = static_cast<uint32_t>(cd);

  const uint64_t ad = __emulu(a, d);
  const uint64_t bd = __emulu(b, d);
  // Sum of the two middle partial products; it may wrap around 64 bits.
  const uint64_t adbc = ad + __emulu(b, c);
  const uint64_t adbc_carry = (adbc < ad) ? 1 : 0;
  // Low half; the addition may wrap as well.
  const uint64_t lo = bd + (adbc << 32);
  const uint64_t lo_carry = (lo < bd) ? 1 : 0;
  *hi = __emulu(a, c) + (adbc >> 32) + (adbc_carry << 32) + lo_carry;
  return lo;
}
#endif
|
|
|
|
} // namespace jsoncharutils
|
|
} // unnamed namespace
|
|
} // namespace icelake
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_GENERIC_JSONCHARUTILS_H
|
|
/* end file simdjson/generic/jsoncharutils.h for icelake */
|
|
/* including simdjson/generic/atomparsing.h for icelake: #include "simdjson/generic/atomparsing.h" */
|
|
/* begin file simdjson/generic/atomparsing.h for icelake */
|
|
#ifndef SIMDJSON_GENERIC_ATOMPARSING_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_GENERIC_ATOMPARSING_H */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/generic/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/generic/jsoncharutils.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
#include <cstring>
|
|
|
|
namespace simdjson {
|
|
namespace icelake {
|
|
namespace {
|
|
/// @private
|
|
namespace atomparsing {
|
|
|
|
// The string_to_uint32 is exclusively used to map literal strings to 32-bit values.
|
|
// We use memcpy instead of a pointer cast to avoid undefined behaviors since we cannot
|
|
// be certain that the character pointer will be properly aligned.
|
|
// You might think that using memcpy makes this function expensive, but you'd be wrong.
|
|
// All decent optimizing compilers (GCC, clang, Visual Studio) will compile string_to_uint32("false");
|
|
// to the compile-time constant 1936482662.
|
|
simdjson_inline uint32_t string_to_uint32(const char* str) { uint32_t val; std::memcpy(&val, str, sizeof(uint32_t)); return val; }
|
|
|
|
|
|
// Again in str4ncmp we use a memcpy to avoid undefined behavior. The memcpy may appear expensive.
|
|
// Yet all decent optimizing compilers will compile memcpy to a single instruction, just about.
|
|
simdjson_warn_unused
|
|
simdjson_inline uint32_t str4ncmp(const uint8_t *src, const char* atom) {
|
|
uint32_t srcval; // we want to avoid unaligned 32-bit loads (undefined in C/C++)
|
|
static_assert(sizeof(uint32_t) <= SIMDJSON_PADDING, "SIMDJSON_PADDING must be larger than 4 bytes");
|
|
std::memcpy(&srcval, src, sizeof(uint32_t));
|
|
return srcval ^ string_to_uint32(atom);
|
|
}
|
|
|
|
// True when `src` starts with "true" and the byte that follows (src[4]) is
// accepted by is_not_structural_or_whitespace as a terminator (i.e. it yields 0).
// Reads five bytes from `src`.
simdjson_warn_unused
simdjson_inline bool is_valid_true_atom(const uint8_t *src) {
  const auto mismatch = str4ncmp(src, "true");
  const auto bad_terminator = jsoncharutils::is_not_structural_or_whitespace(src[4]);
  return (mismatch | bad_terminator) == 0;
}
|
|
|
|
// Length-aware variant: with exactly 4 bytes available only the literal
// itself is compared; with more, the terminator after it is checked as well;
// with fewer, the atom cannot be valid.
simdjson_warn_unused
simdjson_inline bool is_valid_true_atom(const uint8_t *src, size_t len) {
  if (len < 4) { return false; }
  if (len == 4) { return str4ncmp(src, "true") == 0; }
  return is_valid_true_atom(src);
}
|
|
|
|
// True when src[1..4] is "alse" and the byte that follows (src[5]) is accepted
// by is_not_structural_or_whitespace as a terminator (i.e. it yields 0).
// The first byte, src[0], is not examined by this function.
simdjson_warn_unused
simdjson_inline bool is_valid_false_atom(const uint8_t *src) {
  const auto mismatch = str4ncmp(src + 1, "alse");
  const auto bad_terminator = jsoncharutils::is_not_structural_or_whitespace(src[5]);
  return (mismatch | bad_terminator) == 0;
}
|
|
|
|
// Length-aware variant: with exactly 5 bytes available only src[1..4] is
// compared against "alse"; with more, the terminator is checked as well; with
// fewer, the atom cannot be valid.
simdjson_warn_unused
simdjson_inline bool is_valid_false_atom(const uint8_t *src, size_t len) {
  if (len < 5) { return false; }
  if (len == 5) { return str4ncmp(src + 1, "alse") == 0; }
  return is_valid_false_atom(src);
}
|
|
|
|
// True when `src` starts with "null" and the byte that follows (src[4]) is
// accepted by is_not_structural_or_whitespace as a terminator (i.e. it yields 0).
// Reads five bytes from `src`.
simdjson_warn_unused
simdjson_inline bool is_valid_null_atom(const uint8_t *src) {
  const auto mismatch = str4ncmp(src, "null");
  const auto bad_terminator = jsoncharutils::is_not_structural_or_whitespace(src[4]);
  return (mismatch | bad_terminator) == 0;
}
|
|
|
|
// Length-aware variant: with exactly 4 bytes available only the literal
// itself is compared; with more, the terminator after it is checked as well;
// with fewer, the atom cannot be valid.
simdjson_warn_unused
simdjson_inline bool is_valid_null_atom(const uint8_t *src, size_t len) {
  if (len < 4) { return false; }
  if (len == 4) { return str4ncmp(src, "null") == 0; }
  return is_valid_null_atom(src);
}
|
|
|
|
} // namespace atomparsing
|
|
} // unnamed namespace
|
|
} // namespace icelake
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_GENERIC_ATOMPARSING_H
|
|
/* end file simdjson/generic/atomparsing.h for icelake */
|
|
/* including simdjson/generic/dom_parser_implementation.h for icelake: #include "simdjson/generic/dom_parser_implementation.h" */
|
|
/* begin file simdjson/generic/dom_parser_implementation.h for icelake */
|
|
#ifndef SIMDJSON_GENERIC_DOM_PARSER_IMPLEMENTATION_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_GENERIC_DOM_PARSER_IMPLEMENTATION_H */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/generic/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/dom_parser_implementation.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace icelake {
|
|
|
|
// Record kept for a scope ([ or {) that is currently open.
// Expectation: sizeof(open_container) = 64/8 (checked right after the struct).
struct open_container {
  uint32_t tape_index; // position on the tape where the scope ([ or {) begins
  uint32_t count;      // number of elements in the scope
}; // struct open_container

static_assert(sizeof(open_container) == 64/8, "Open container must be 64 bits");
|
|
|
|
class dom_parser_implementation final : public internal::dom_parser_implementation {
|
|
public:
|
|
/** Tape location of each open { or [ */
|
|
std::unique_ptr<open_container[]> open_containers{};
|
|
/** Whether each open container is a [ or { */
|
|
std::unique_ptr<bool[]> is_array{};
|
|
/** Buffer passed to stage 1 */
|
|
const uint8_t *buf{};
|
|
/** Length passed to stage 1 */
|
|
size_t len{0};
|
|
/** Document passed to stage 2 */
|
|
dom::document *doc{};
|
|
|
|
inline dom_parser_implementation() noexcept;
|
|
inline dom_parser_implementation(dom_parser_implementation &&other) noexcept;
|
|
inline dom_parser_implementation &operator=(dom_parser_implementation &&other) noexcept;
|
|
dom_parser_implementation(const dom_parser_implementation &) = delete;
|
|
dom_parser_implementation &operator=(const dom_parser_implementation &) = delete;
|
|
|
|
simdjson_warn_unused error_code parse(const uint8_t *buf, size_t len, dom::document &doc) noexcept final;
|
|
simdjson_warn_unused error_code stage1(const uint8_t *buf, size_t len, stage1_mode partial) noexcept final;
|
|
simdjson_warn_unused error_code stage2(dom::document &doc) noexcept final;
|
|
simdjson_warn_unused error_code stage2_next(dom::document &doc) noexcept final;
|
|
simdjson_warn_unused uint8_t *parse_string(const uint8_t *src, uint8_t *dst, bool allow_replacement) const noexcept final;
|
|
simdjson_warn_unused uint8_t *parse_wobbly_string(const uint8_t *src, uint8_t *dst) const noexcept final;
|
|
inline simdjson_warn_unused error_code set_capacity(size_t capacity) noexcept final;
|
|
inline simdjson_warn_unused error_code set_max_depth(size_t max_depth) noexcept final;
|
|
private:
|
|
simdjson_inline simdjson_warn_unused error_code set_capacity_stage1(size_t capacity);
|
|
|
|
};
|
|
|
|
} // namespace icelake
|
|
} // namespace simdjson
|
|
|
|
namespace simdjson {
|
|
namespace icelake {
|
|
|
|
// Default construction, move construction and move assignment are all
// compiler-generated.
inline dom_parser_implementation::dom_parser_implementation() noexcept = default;
inline dom_parser_implementation::dom_parser_implementation(dom_parser_implementation &&) noexcept = default;
inline dom_parser_implementation &dom_parser_implementation::operator=(dom_parser_implementation &&) noexcept = default;
|
|
|
|
// Leaving these here so they can be inlined if so desired
|
|
inline simdjson_warn_unused error_code dom_parser_implementation::set_capacity(size_t capacity) noexcept {
|
|
if(capacity > SIMDJSON_MAXSIZE_BYTES) { return CAPACITY; }
|
|
// Stage 1 index output
|
|
size_t max_structures = SIMDJSON_ROUNDUP_N(capacity, 64) + 2 + 7;
|
|
structural_indexes.reset( new (std::nothrow) uint32_t[max_structures] );
|
|
if (!structural_indexes) { _capacity = 0; return MEMALLOC; }
|
|
structural_indexes[0] = 0;
|
|
n_structural_indexes = 0;
|
|
|
|
_capacity = capacity;
|
|
return SUCCESS;
|
|
}
|
|
|
|
// Allocates the stage 2 stacks (open_containers and is_array) with
// `max_depth` entries each. Returns MEMALLOC, with the stored maximum depth
// reset to 0, when either allocation fails, and SUCCESS otherwise.
inline simdjson_warn_unused error_code dom_parser_implementation::set_max_depth(size_t max_depth) noexcept {
  // Stage 2 stacks
  open_containers.reset(new (std::nothrow) open_container[max_depth]);
  is_array.reset(new (std::nothrow) bool[max_depth]);

  const bool both_allocated = is_array && open_containers;
  if (!both_allocated) {
    _max_depth = 0;
    return MEMALLOC;
  }

  _max_depth = max_depth;
  return SUCCESS;
}
|
|
|
|
} // namespace icelake
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_GENERIC_DOM_PARSER_IMPLEMENTATION_H
|
|
/* end file simdjson/generic/dom_parser_implementation.h for icelake */
|
|
/* including simdjson/generic/implementation_simdjson_result_base.h for icelake: #include "simdjson/generic/implementation_simdjson_result_base.h" */
|
|
/* begin file simdjson/generic/implementation_simdjson_result_base.h for icelake */
|
|
#ifndef SIMDJSON_GENERIC_IMPLEMENTATION_SIMDJSON_RESULT_BASE_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_GENERIC_IMPLEMENTATION_SIMDJSON_RESULT_BASE_H */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/generic/base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace icelake {
|
|
|
|
// This is a near copy of include/error.h's implementation_simdjson_result_base, except it doesn't use std::pair
|
|
// so we can avoid inlining errors
|
|
// TODO reconcile these!
|
|
/**
|
|
* The result of a simdjson operation that could fail.
|
|
*
|
|
* Gives the option of reading error codes, or throwing an exception by casting to the desired result.
|
|
*
|
|
* This is a base class for implementations that want to add functions to the result type for
|
|
* chaining.
|
|
*
|
|
* Override like:
|
|
*
|
|
* struct simdjson_result<T> : public internal::implementation_simdjson_result_base<T> {
|
|
* simdjson_result() noexcept : internal::implementation_simdjson_result_base<T>() {}
|
|
* simdjson_result(error_code error) noexcept : internal::implementation_simdjson_result_base<T>(error) {}
|
|
* simdjson_result(T &&value) noexcept : internal::implementation_simdjson_result_base<T>(std::forward(value)) {}
|
|
* simdjson_result(T &&value, error_code error) noexcept : internal::implementation_simdjson_result_base<T>(value, error) {}
|
|
* // Your extra methods here
|
|
* }
|
|
*
|
|
* Then any method returning simdjson_result<T> will be chainable with your methods.
|
|
*/
|
|
template<typename T>
|
|
struct implementation_simdjson_result_base {
|
|
|
|
/**
|
|
* Create a new empty result with error = UNINITIALIZED.
|
|
*/
|
|
simdjson_inline implementation_simdjson_result_base() noexcept = default;
|
|
|
|
/**
|
|
* Create a new error result.
|
|
*/
|
|
simdjson_inline implementation_simdjson_result_base(error_code error) noexcept;
|
|
|
|
/**
|
|
* Create a new successful result.
|
|
*/
|
|
simdjson_inline implementation_simdjson_result_base(T &&value) noexcept;
|
|
|
|
/**
|
|
* Create a new result with both things (use if you don't want to branch when creating the result).
|
|
*/
|
|
simdjson_inline implementation_simdjson_result_base(T &&value, error_code error) noexcept;
|
|
|
|
/**
|
|
* Move the value and the error to the provided variables.
|
|
*
|
|
* @param value The variable to assign the value to. May not be set if there is an error.
|
|
* @param error The variable to assign the error to. Set to SUCCESS if there is no error.
|
|
*/
|
|
simdjson_inline void tie(T &value, error_code &error) && noexcept;
|
|
|
|
/**
|
|
* Move the value to the provided variable.
|
|
*
|
|
* @param value The variable to assign the value to. May not be set if there is an error.
|
|
*/
|
|
simdjson_inline error_code get(T &value) && noexcept;
|
|
|
|
/**
|
|
* The error.
|
|
*/
|
|
simdjson_inline error_code error() const noexcept;
|
|
|
|
#if SIMDJSON_EXCEPTIONS
|
|
|
|
/**
|
|
* Get the result value.
|
|
*
|
|
* @throw simdjson_error if there was an error.
|
|
*/
|
|
simdjson_inline T& value() & noexcept(false);
|
|
|
|
/**
|
|
* Take the result value (move it).
|
|
*
|
|
* @throw simdjson_error if there was an error.
|
|
*/
|
|
simdjson_inline T&& value() && noexcept(false);
|
|
|
|
/**
|
|
* Take the result value (move it).
|
|
*
|
|
* @throw simdjson_error if there was an error.
|
|
*/
|
|
simdjson_inline T&& take_value() && noexcept(false);
|
|
|
|
/**
|
|
* Cast to the value (will throw on error).
|
|
*
|
|
* @throw simdjson_error if there was an error.
|
|
*/
|
|
simdjson_inline operator T&&() && noexcept(false);
|
|
|
|
|
|
#endif // SIMDJSON_EXCEPTIONS
|
|
|
|
/**
|
|
* Get the result value. This function is safe if and only
|
|
* the error() method returns a value that evaluates to false.
|
|
*/
|
|
simdjson_inline const T& value_unsafe() const& noexcept;
|
|
/**
|
|
* Get the result value. This function is safe if and only
|
|
* the error() method returns a value that evaluates to false.
|
|
*/
|
|
simdjson_inline T& value_unsafe() & noexcept;
|
|
/**
|
|
* Take the result value (move it). This function is safe if and only
|
|
* the error() method returns a value that evaluates to false.
|
|
*/
|
|
simdjson_inline T&& value_unsafe() && noexcept;
|
|
protected:
|
|
/** users should never directly access first and second. **/
|
|
T first{}; /** Users should never directly access 'first'. **/
|
|
error_code second{UNINITIALIZED}; /** Users should never directly access 'second'. **/
|
|
}; // struct implementation_simdjson_result_base
|
|
|
|
} // namespace icelake
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_GENERIC_IMPLEMENTATION_SIMDJSON_RESULT_BASE_H
|
|
/* end file simdjson/generic/implementation_simdjson_result_base.h for icelake */
|
|
/* including simdjson/generic/numberparsing.h for icelake: #include "simdjson/generic/numberparsing.h" */
|
|
/* begin file simdjson/generic/numberparsing.h for icelake */
|
|
#ifndef SIMDJSON_GENERIC_NUMBERPARSING_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_GENERIC_NUMBERPARSING_H */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/generic/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/generic/jsoncharutils.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/numberparsing_tables.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
#include <limits>
|
|
#include <ostream>
|
|
#include <cstring>
|
|
|
|
namespace simdjson {
|
|
namespace icelake {
|
|
namespace numberparsing {
|
|
|
|
// Helper macros for the number parser: two yield an error code and three
// append a parsed value to a writer. When JSON_TEST_NUMBERS is defined, each
// macro first calls the matching found_* hook with the value and/or source
// pointer before doing its regular work.
#ifdef JSON_TEST_NUMBERS
#define INVALID_NUMBER(SRC) (found_invalid_number((SRC)), NUMBER_ERROR)
#define WRITE_INTEGER(VALUE, SRC, WRITER) (found_integer((VALUE), (SRC)), (WRITER).append_s64((VALUE)))
#define WRITE_UNSIGNED(VALUE, SRC, WRITER) (found_unsigned_integer((VALUE), (SRC)), (WRITER).append_u64((VALUE)))
#define WRITE_DOUBLE(VALUE, SRC, WRITER) (found_float((VALUE), (SRC)), (WRITER).append_double((VALUE)))
#define BIGINT_NUMBER(SRC) (found_invalid_number((SRC)), BIGINT_ERROR)
#else
// Regular builds: no hooks, SRC is unused by the WRITE_* macros.
#define INVALID_NUMBER(SRC) (NUMBER_ERROR)
#define WRITE_INTEGER(VALUE, SRC, WRITER) (WRITER).append_s64((VALUE))
#define WRITE_UNSIGNED(VALUE, SRC, WRITER) (WRITER).append_u64((VALUE))
#define WRITE_DOUBLE(VALUE, SRC, WRITER) (WRITER).append_double((VALUE))
#define BIGINT_NUMBER(SRC) (BIGINT_ERROR)
#endif
|
|
|
|
namespace {
|
|
|
|
// Convert a mantissa, an exponent and a sign bit into an ieee64 double.
|
|
// The real_exponent needs to be in [0, 2046] (technically real_exponent = 2047 would be acceptable).
|
|
// The mantissa should be in [0,1<<53). The bit at index (1ULL << 52) while be zeroed.
|
|
simdjson_inline double to_double(uint64_t mantissa, uint64_t real_exponent, bool negative) {
|
|
double d;
|
|
mantissa &= ~(1ULL << 52);
|
|
mantissa |= real_exponent << 52;
|
|
mantissa |= ((static_cast<uint64_t>(negative)) << 63);
|
|
std::memcpy(&d, &mantissa, sizeof(d));
|
|
return d;
|
|
}
|
|
|
|
// Attempts to compute i * 10^(power) exactly; and if "negative" is
|
|
// true, negate the result.
|
|
// This function will only work in some cases, when it does not work, success is
|
|
// set to false. This should work *most of the time* (like 99% of the time).
|
|
// We assume that power is in the [smallest_power,
|
|
// largest_power] interval: the caller is responsible for this check.
|
|
simdjson_inline bool compute_float_64(int64_t power, uint64_t i, bool negative, double &d) {
|
|
// we start with a fast path
|
|
// It was described in
|
|
// Clinger WD. How to read floating point numbers accurately.
|
|
// ACM SIGPLAN Notices. 1990
|
|
#ifndef FLT_EVAL_METHOD
|
|
#error "FLT_EVAL_METHOD should be defined, please include cfloat."
|
|
#endif
|
|
#if (FLT_EVAL_METHOD != 1) && (FLT_EVAL_METHOD != 0)
|
|
// We cannot be certain that x/y is rounded to nearest.
|
|
if (0 <= power && power <= 22 && i <= 9007199254740991)
|
|
#else
|
|
if (-22 <= power && power <= 22 && i <= 9007199254740991)
|
|
#endif
|
|
{
|
|
// convert the integer into a double. This is lossless since
|
|
// 0 <= i <= 2^53 - 1.
|
|
d = double(i);
|
|
//
|
|
// The general idea is as follows.
|
|
// If 0 <= s < 2^53 and if 10^0 <= p <= 10^22 then
|
|
// 1) Both s and p can be represented exactly as 64-bit floating-point
|
|
// values
|
|
// (binary64).
|
|
// 2) Because s and p can be represented exactly as floating-point values,
|
|
// then s * p
|
|
// and s / p will produce correctly rounded values.
|
|
//
|
|
if (power < 0) {
|
|
d = d / simdjson::internal::power_of_ten[-power];
|
|
} else {
|
|
d = d * simdjson::internal::power_of_ten[power];
|
|
}
|
|
if (negative) {
|
|
d = -d;
|
|
}
|
|
return true;
|
|
}
|
|
// When 22 < power && power < 22 + 16, we could
|
|
// hope for another, secondary fast path. It was
|
|
// described by David M. Gay in "Correctly rounded
|
|
// binary-decimal and decimal-binary conversions." (1990)
|
|
// If you need to compute i * 10^(22 + x) for x < 16,
|
|
// first compute i * 10^x, if you know that result is exact
|
|
// (e.g., when i * 10^x < 2^53),
|
|
// then you can still proceed and do (i * 10^x) * 10^22.
|
|
// Is this worth your time?
|
|
// You need 22 < power *and* power < 22 + 16 *and* (i * 10^(x-22) < 2^53)
|
|
// for this second fast path to work.
|
|
// If you you have 22 < power *and* power < 22 + 16, and then you
|
|
// optimistically compute "i * 10^(x-22)", there is still a chance that you
|
|
// have wasted your time if i * 10^(x-22) >= 2^53. It makes the use cases of
|
|
// this optimization maybe less common than we would like. Source:
|
|
// http://www.exploringbinary.com/fast-path-decimal-to-floating-point-conversion/
|
|
// also used in RapidJSON: https://rapidjson.org/strtod_8h_source.html
|
|
|
|
// The fast path has now failed, so we are failing back on the slower path.
|
|
|
|
// In the slow path, we need to adjust i so that it is > 1<<63 which is always
|
|
// possible, except if i == 0, so we handle i == 0 separately.
|
|
if(i == 0) {
|
|
d = negative ? -0.0 : 0.0;
|
|
return true;
|
|
}
|
|
|
|
|
|
// The exponent is 1024 + 63 + power
|
|
// + floor(log(5**power)/log(2)).
|
|
// The 1024 comes from the ieee64 standard.
|
|
// The 63 comes from the fact that we use a 64-bit word.
|
|
//
|
|
// Computing floor(log(5**power)/log(2)) could be
|
|
// slow. Instead we use a fast function.
|
|
//
|
|
// For power in (-400,350), we have that
|
|
// (((152170 + 65536) * power ) >> 16);
|
|
// is equal to
|
|
// floor(log(5**power)/log(2)) + power when power >= 0
|
|
// and it is equal to
|
|
// ceil(log(5**-power)/log(2)) + power when power < 0
|
|
//
|
|
// The 65536 is (1<<16) and corresponds to
|
|
// (65536 * power) >> 16 ---> power
|
|
//
|
|
// ((152170 * power ) >> 16) is equal to
|
|
// floor(log(5**power)/log(2))
|
|
//
|
|
// Note that this is not magic: 152170/(1<<16) is
|
|
// approximatively equal to log(5)/log(2).
|
|
// The 1<<16 value is a power of two; we could use a
|
|
// larger power of 2 if we wanted to.
|
|
//
|
|
int64_t exponent = (((152170 + 65536) * power) >> 16) + 1024 + 63;
|
|
|
|
|
|
// We want the most significant bit of i to be 1. Shift if needed.
|
|
int lz = leading_zeroes(i);
|
|
i <<= lz;
|
|
|
|
|
|
// We are going to need to do some 64-bit arithmetic to get a precise product.
|
|
// We use a table lookup approach.
|
|
// It is safe because
|
|
// power >= smallest_power
|
|
// and power <= largest_power
|
|
// We recover the mantissa of the power, it has a leading 1. It is always
|
|
// rounded down.
|
|
//
|
|
// We want the most significant 64 bits of the product. We know
|
|
// this will be non-zero because the most significant bit of i is
|
|
// 1.
|
|
const uint32_t index = 2 * uint32_t(power - simdjson::internal::smallest_power);
|
|
// Optimization: It may be that materializing the index as a variable might confuse some compilers and prevent effective complex-addressing loads. (Done for code clarity.)
|
|
//
|
|
// The full_multiplication function computes the 128-bit product of two 64-bit words
|
|
// with a returned value of type value128 with a "low component" corresponding to the
|
|
// 64-bit least significant bits of the product and with a "high component" corresponding
|
|
// to the 64-bit most significant bits of the product.
|
|
simdjson::internal::value128 firstproduct = full_multiplication(i, simdjson::internal::power_of_five_128[index]);
|
|
// Both i and power_of_five_128[index] have their most significant bit set to 1 which
|
|
// implies that the either the most or the second most significant bit of the product
|
|
// is 1. We pack values in this manner for efficiency reasons: it maximizes the use
|
|
// we make of the product. It also makes it easy to reason about the product: there
|
|
// is 0 or 1 leading zero in the product.
|
|
|
|
// Unless the least significant 9 bits of the high (64-bit) part of the full
|
|
// product are all 1s, then we know that the most significant 55 bits are
|
|
// exact and no further work is needed. Having 55 bits is necessary because
|
|
// we need 53 bits for the mantissa but we have to have one rounding bit and
|
|
// we can waste a bit if the most significant bit of the product is zero.
|
|
if((firstproduct.high & 0x1FF) == 0x1FF) {
|
|
// We want to compute i * 5^q, but only care about the top 55 bits at most.
|
|
// Consider the scenario where q>=0. Then 5^q may not fit in 64-bits. Doing
|
|
// the full computation is wasteful. So we do what is called a "truncated
|
|
// multiplication".
|
|
// We take the most significant 64-bits, and we put them in
|
|
// power_of_five_128[index]. Usually, that's good enough to approximate i * 5^q
|
|
// to the desired approximation using one multiplication. Sometimes it does not suffice.
|
|
// Then we store the next most significant 64 bits in power_of_five_128[index + 1], and
|
|
// then we get a better approximation to i * 5^q.
|
|
//
|
|
// That's for when q>=0. The logic for q<0 is somewhat similar but it is somewhat
|
|
// more complicated.
|
|
//
|
|
// There is an extra layer of complexity in that we need more than 55 bits of
|
|
// accuracy in the round-to-even scenario.
|
|
//
|
|
// The full_multiplication function computes the 128-bit product of two 64-bit words
|
|
// with a returned value of type value128 with a "low component" corresponding to the
|
|
// 64-bit least significant bits of the product and with a "high component" corresponding
|
|
// to the 64-bit most significant bits of the product.
|
|
simdjson::internal::value128 secondproduct = full_multiplication(i, simdjson::internal::power_of_five_128[index + 1]);
|
|
firstproduct.low += secondproduct.high;
|
|
if(secondproduct.high > firstproduct.low) { firstproduct.high++; }
|
|
// As it has been proven by Noble Mushtak and Daniel Lemire in "Fast Number Parsing Without
|
|
// Fallback" (https://arxiv.org/abs/2212.06644), at this point we are sure that the product
|
|
// is sufficiently accurate, and more computation is not needed.
|
|
}
|
|
uint64_t lower = firstproduct.low;
|
|
uint64_t upper = firstproduct.high;
|
|
// The final mantissa should be 53 bits with a leading 1.
|
|
// We shift it so that it occupies 54 bits with a leading 1.
|
|
///////
|
|
uint64_t upperbit = upper >> 63;
|
|
uint64_t mantissa = upper >> (upperbit + 9);
|
|
lz += int(1 ^ upperbit);
|
|
|
|
// Here we have mantissa < (1<<54).
|
|
int64_t real_exponent = exponent - lz;
|
|
if (simdjson_unlikely(real_exponent <= 0)) { // we have a subnormal?
|
|
// Here have that real_exponent <= 0 so -real_exponent >= 0
|
|
if(-real_exponent + 1 >= 64) { // if we have more than 64 bits below the minimum exponent, you have a zero for sure.
|
|
d = negative ? -0.0 : 0.0;
|
|
return true;
|
|
}
|
|
// next line is safe because -real_exponent + 1 < 0
|
|
mantissa >>= -real_exponent + 1;
|
|
// Thankfully, we can't have both "round-to-even" and subnormals because
|
|
// "round-to-even" only occurs for powers close to 0.
|
|
mantissa += (mantissa & 1); // round up
|
|
mantissa >>= 1;
|
|
// There is a weird scenario where we don't have a subnormal but just.
|
|
// Suppose we start with 2.2250738585072013e-308, we end up
|
|
// with 0x3fffffffffffff x 2^-1023-53 which is technically subnormal
|
|
// whereas 0x40000000000000 x 2^-1023-53 is normal. Now, we need to round
|
|
// up 0x3fffffffffffff x 2^-1023-53 and once we do, we are no longer
|
|
// subnormal, but we can only know this after rounding.
|
|
// So we only declare a subnormal if we are smaller than the threshold.
|
|
real_exponent = (mantissa < (uint64_t(1) << 52)) ? 0 : 1;
|
|
d = to_double(mantissa, real_exponent, negative);
|
|
return true;
|
|
}
|
|
// We have to round to even. The "to even" part
|
|
// is only a problem when we are right in between two floats
|
|
// which we guard against.
|
|
// If we have lots of trailing zeros, we may fall right between two
|
|
// floating-point values.
|
|
//
|
|
// The round-to-even cases take the form of a number 2m+1 which is in (2^53,2^54]
|
|
// times a power of two. That is, it is right between a number with binary significand
|
|
// m and another number with binary significand m+1; and it must be the case
|
|
// that it cannot be represented by a float itself.
|
|
//
|
|
// We must have that w * 10 ^q == (2m+1) * 2^p for some power of two 2^p.
|
|
// Recall that 10^q = 5^q * 2^q.
|
|
// When q >= 0, we must have that (2m+1) is divible by 5^q, so 5^q <= 2^54. We have that
|
|
// 5^23 <= 2^54 and it is the last power of five to qualify, so q <= 23.
|
|
// When q<0, we have w >= (2m+1) x 5^{-q}. We must have that w<2^{64} so
|
|
// (2m+1) x 5^{-q} < 2^{64}. We have that 2m+1>2^{53}. Hence, we must have
|
|
// 2^{53} x 5^{-q} < 2^{64}.
|
|
// Hence we have 5^{-q} < 2^{11}$ or q>= -4.
|
|
//
|
|
// We require lower <= 1 and not lower == 0 because we could not prove that
|
|
// that lower == 0 is implied; but we could prove that lower <= 1 is a necessary and sufficient test.
|
|
if (simdjson_unlikely((lower <= 1) && (power >= -4) && (power <= 23) && ((mantissa & 3) == 1))) {
|
|
if((mantissa << (upperbit + 64 - 53 - 2)) == upper) {
|
|
mantissa &= ~1; // flip it so that we do not round up
|
|
}
|
|
}
|
|
|
|
mantissa += mantissa & 1;
|
|
mantissa >>= 1;
|
|
|
|
// Here we have mantissa < (1<<53), unless there was an overflow
|
|
if (mantissa >= (1ULL << 53)) {
|
|
//////////
|
|
// This will happen when parsing values such as 7.2057594037927933e+16
|
|
////////
|
|
mantissa = (1ULL << 52);
|
|
real_exponent++;
|
|
}
|
|
mantissa &= ~(1ULL << 52);
|
|
// we have to check that real_exponent is in range, otherwise we bail out
|
|
if (simdjson_unlikely(real_exponent > 2046)) {
|
|
// We have an infinite value!!! We could actually throw an error here if we could.
|
|
return false;
|
|
}
|
|
d = to_double(mantissa, real_exponent, negative);
|
|
return true;
|
|
}
|
|
|
|
// We call a fallback floating-point parser that might be slow. Note
|
|
// it will accept JSON numbers, but the JSON spec. is more restrictive so
|
|
// before you call parse_float_fallback, you need to have validated the input
|
|
// string with the JSON grammar.
|
|
// It will return an error (false) if the parsed number is infinite.
|
|
// The string parsing itself always succeeds. We know that there is at least
|
|
// one digit.
|
|
static bool parse_float_fallback(const uint8_t *ptr, double *outDouble) {
|
|
*outDouble = simdjson::internal::from_chars(reinterpret_cast<const char *>(ptr));
|
|
// We do not accept infinite values.
|
|
|
|
// Detecting finite values in a portable manner is ridiculously hard, ideally
|
|
// we would want to do:
|
|
// return !std::isfinite(*outDouble);
|
|
// but that mysteriously fails under legacy/old libc++ libraries, see
|
|
// https://github.com/simdjson/simdjson/issues/1286
|
|
//
|
|
// Therefore, fall back to this solution (the extra parens are there
|
|
// to handle that max may be a macro on windows).
|
|
return !(*outDouble > (std::numeric_limits<double>::max)() || *outDouble < std::numeric_limits<double>::lowest());
|
|
}
|
|
|
|
// Bounded variant of the fallback parser: the input is the range
// [ptr, end_ptr). The parsed value is written to *outDouble and the function
// returns false only when that value is infinite.
static bool parse_float_fallback(const uint8_t *ptr, const uint8_t *end_ptr, double *outDouble) {
  const char *const first = reinterpret_cast<const char *>(ptr);
  const char *const last = reinterpret_cast<const char *>(end_ptr);
  *outDouble = simdjson::internal::from_chars(first, last);
  // We do not accept infinite values.

  // Detecting finite values in a portable manner is ridiculously hard. Ideally
  // we would rely on std::isfinite(*outDouble), but that mysteriously fails
  // under legacy/old libc++ libraries, see
  // https://github.com/simdjson/simdjson/issues/1286
  //
  // Therefore, we compare against the representable range instead (the extra
  // parens are there to handle that max may be a macro on windows).
  const double parsed = *outDouble;
  const bool above_range = parsed > (std::numeric_limits<double>::max)();
  const bool below_range = parsed < std::numeric_limits<double>::lowest();
  return !(above_range || below_range);
}
|
|
|
|
// check quickly whether the next 8 chars are made of digits
|
|
// at a glance, it looks better than Mula's
|
|
// http://0x80.pl/articles/swar-digits-validate.html
|
|
simdjson_inline bool is_made_of_eight_digits_fast(const uint8_t *chars) {
|
|
uint64_t val;
|
|
// this can read up to 7 bytes beyond the buffer size, but we require
|
|
// SIMDJSON_PADDING of padding
|
|
static_assert(7 <= SIMDJSON_PADDING, "SIMDJSON_PADDING must be bigger than 7");
|
|
std::memcpy(&val, chars, 8);
|
|
// a branchy method might be faster:
|
|
// return (( val & 0xF0F0F0F0F0F0F0F0 ) == 0x3030303030303030)
|
|
// && (( (val + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0 ) ==
|
|
// 0x3030303030303030);
|
|
return (((val & 0xF0F0F0F0F0F0F0F0) |
|
|
(((val + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0) >> 4)) ==
|
|
0x3333333333333333);
|
|
}
|
|
|
|
template<typename I>
|
|
SIMDJSON_NO_SANITIZE_UNDEFINED // We deliberately allow overflow here and check later
|
|
simdjson_inline bool parse_digit(const uint8_t c, I &i) {
|
|
const uint8_t digit = static_cast<uint8_t>(c - '0');
|
|
if (digit > 9) {
|
|
return false;
|
|
}
|
|
// PERF NOTE: multiplication by 10 is cheaper than arbitrary integer multiplication
|
|
i = 10 * i + digit; // might overflow, we will handle the overflow later
|
|
return true;
|
|
}
|
|
|
|
// True when c is an ASCII digit ('0' to '9'). The subtraction wraps around for
// characters below '0', so a single unsigned comparison covers both bounds.
simdjson_inline bool is_digit(const uint8_t c) {
  const uint8_t offset = static_cast<uint8_t>(c - '0');
  return offset <= 9;
}
|
|
|
|
// Parses the digits that follow the decimal separator. On entry p points at
// the first character after the separator; on exit it points at the first
// non-digit. The digits are accumulated into i, and exponent is set to minus
// the number of digits consumed. Fails with INVALID_NUMBER(src) when there
// is no digit at all.
simdjson_inline error_code parse_decimal_after_separator(simdjson_unused const uint8_t *const src, const uint8_t *&p, uint64_t &i, int64_t &exponent) {
  // We continue with the fiction that we have an integer. If the floating
  // point number is representable as x * 10^z for some integer z that fits in
  // 53 bits, then we will be able to convert the integer back into a float in
  // a lossless manner.
  const uint8_t *const fraction_start = p;

#ifdef SIMDJSON_SWAR_NUMBER_PARSING
#if SIMDJSON_SWAR_NUMBER_PARSING
  // This helps if we have lots of decimals, which turns out to be frequent enough.
  if (is_made_of_eight_digits_fast(p)) {
    i = i * 100000000 + parse_eight_digits_unrolled(p);
    p += 8;
  }
#endif // SIMDJSON_SWAR_NUMBER_PARSING
#endif // #ifdef SIMDJSON_SWAR_NUMBER_PARSING
  // Unrolling the first digit makes a small difference on some implementations (e.g. westmere)
  if (parse_digit(*p, i)) { ++p; }
  while (parse_digit(*p, i)) { p++; }
  exponent = fraction_start - p;
  // Decimal without digits (123.) is illegal
  return (exponent == 0) ? INVALID_NUMBER(src) : SUCCESS;
}
|
|
|
|
// Parses the exponent part of a number, i.e. what follows the 'e' / 'E'.
//  - src: start of the number; only forwarded to INVALID_NUMBER for error reporting.
//  - p: in/out cursor. On entry it points just past the 'e'/'E', on exit at the
//       first character after the exponent digits.
//  - exponent: in/out. The signed exponent that was read is added to it.
// Returns INVALID_NUMBER(src) when the exponent has no digits, SUCCESS otherwise.
// Exponents with more than 18 significant digits are clamped to
// 999,999,999,999,999,999 in magnitude instead of being rejected.
simdjson_inline error_code parse_exponent(simdjson_unused const uint8_t *const src, const uint8_t *&p, int64_t &exponent) {
  // Optional sign: -123.456e[-]78. A '+' is skipped as well.
  bool neg_exp = false;
  if ('-' == *p) {
    neg_exp = true;
    ++p;
  } else if ('+' == *p) {
    ++p;
  }

  // Exponent digits: -123.456e-[78]
  const uint8_t *const exp_begin = p;
  int64_t exp_number = 0;
  while (parse_digit(*p, exp_number)) { ++p; }
  // parse_digit may have overflowed exp_number, possibly all the way to INT64_MIN,
  // which cannot be negated. Overflow therefore has to be ruled out *before* the
  // sign is applied below.
  //
  // Performance note: folding the two simdjson_unlikely checks below into one
  // looks attractive, but the compiler may still emit two separate paths. Testing
  // uint64_t(p - exp_begin - 1) >= 18 is possible too, yet it would trade
  // instructions for a likely branch, which is an inconclusive gain.

  // An exponent without digits is an error.
  if (simdjson_unlikely(p == exp_begin)) {
    return INVALID_NUMBER(src);
  }

  // More than 18 digits: exp_number may have overflowed and needs attention.
  if (simdjson_unlikely(p > exp_begin + 18)) {
    // Leading zeroes do not count: 1e000000000000000000001 is valid and does not overflow.
    const uint8_t *first_nonzero = exp_begin;
    while (*first_nonzero == '0') { ++first_nonzero; }
    // 19 significant digits could overflow int64_t and are absurd anyway, so
    // exponents outside +/-999,999,999,999,999,999 are not supported and we
    // truncate. That bound is far beyond anything meaningful: the largest finite
    // ieee64 value is ~1.8e308 and the smallest subnormal ~5e-324, so truncating
    // at 324 would do. There is no reason to fail here: 0e999999999999999999999
    // is a perfectly fine number.
    if (p > first_nonzero + 18) { exp_number = 999999999999999999; }
  }

  // exp_number is now a sane, non-negative value <= 999,999,999,999,999,999.
  // As long as 'exponent' lies in [-8223372036854775808, 8223372036854775808]
  // the addition cannot overflow, and 'exponent' is bounded in magnitude by the
  // size of the JSON input, so the next statements never overflow.
  const int64_t signed_exp = neg_exp ? -exp_number : exp_number;
  exponent += signed_exp;
  return SUCCESS;
}
|
|
|
|
// Checks whether the bytes [src, src + max_length) start with a well-formed
// integer: an optional '-', then either a lone '0' or a run of digits, followed
// by the end of the span or by a structural / white-space character.
//  - src: first byte to examine.
//  - max_length: number of readable bytes starting at src.
// Returns true when the span holds such an integer. Returns false for an empty
// span, for a sign without digits, for no digits at all, for a zero followed by
// more digits, and when the digits are followed by any other character
// (including '.', 'e' and 'E').
// No range check is made: the integer may have arbitrarily many digits.
simdjson_inline bool check_if_integer(const uint8_t *const src, size_t max_length) {
  // Nothing to look at: do not even read *src.
  if (max_length == 0) { return false; }
  const uint8_t *const srcend = src + max_length;
  const bool negative = (*src == '-');
  const uint8_t *p = src + uint8_t(negative);
  // A lone '-' is not an integer.
  if (p == srcend) { return false; }
  if (*p == '0') {
    // A leading zero must stand alone; whatever follows is validated below.
    ++p;
  } else {
    const uint8_t *const first_digit = p;
    while (p != srcend && is_digit(*p)) { ++p; }
    // At least one digit is required: "-," or "," must not be reported as integers.
    if (p == first_digit) { return false; }
  }
  // The integer must end the span or be followed by a structural/white-space character.
  return (p == srcend) || !jsoncharutils::is_not_structural_or_whitespace(*p);
}
|
|
|
|
// Estimates how many significant characters the digit span of a number holds,
// once leading zeroes (and a decimal point among them) are discarded. This is
// used to decide whether the 64-bit accumulator may have overflowed, which
// matters for inputs such as 0.0000somenumber.
//  - start_digits: first character of the number after the optional sign.
//  - digit_count: length of the span, i.e. digits plus the decimal point if any.
// Returns digit_count minus the number of leading '0' / '.' characters. When a
// '.' is skipped, the result is one less than the span length would suggest,
// which is harmless for the "> 19" test made by the caller.
simdjson_inline size_t significant_digits(const uint8_t * start_digits, size_t digit_count) {
  const uint8_t *const end_digits = start_digits + digit_count;
  const uint8_t *start = start_digits;
  // The scan is bounded by the span so that it can neither read past the number
  // nor make the subtraction below wrap around.
  while ((start != end_digits) && ((*start == '0') || (*start == '.'))) { ++start; }
  // we over-decrement by one when there is a '.'
  return digit_count - size_t(start - start_digits);
}
|
|
|
|
} // unnamed namespace
|
|
|
|
/** @private */
|
|
static error_code slow_float_parsing(simdjson_unused const uint8_t * src, double* answer) {
|
|
if (parse_float_fallback(src, answer)) {
|
|
return SUCCESS;
|
|
}
|
|
return INVALID_NUMBER(src);
|
|
}
|
|
|
|
/** @private */
|
|
template<typename W>
|
|
simdjson_inline error_code write_float(const uint8_t *const src, bool negative, uint64_t i, const uint8_t * start_digits, size_t digit_count, int64_t exponent, W &writer) {
|
|
// If we frequently had to deal with long strings of digits,
|
|
// we could extend our code by using a 128-bit integer instead
|
|
// of a 64-bit integer. However, this is uncommon in practice.
|
|
//
|
|
// 9999999999999999999 < 2**64 so we can accommodate 19 digits.
|
|
// If we have a decimal separator, then digit_count - 1 is the number of digits, but we
|
|
// may not have a decimal separator!
|
|
if (simdjson_unlikely(digit_count > 19 && significant_digits(start_digits, digit_count) > 19)) {
|
|
// Ok, chances are good that we had an overflow!
|
|
// this is almost never going to get called!!!
|
|
// we start anew, going slowly!!!
|
|
// This will happen in the following examples:
|
|
// 10000000000000000000000000000000000000000000e+308
|
|
// 3.1415926535897932384626433832795028841971693993751
|
|
//
|
|
// NOTE: We do not pass a reference to the to slow_float_parsing. If we passed our writer
|
|
// reference to it, it would force it to be stored in memory, preventing the compiler from
|
|
// picking it apart and putting into registers. i.e. if we pass it as reference,
|
|
// it gets slow.
|
|
double d;
|
|
error_code error = slow_float_parsing(src, &d);
|
|
writer.append_double(d);
|
|
return error;
|
|
}
|
|
// NOTE: it's weird that the simdjson_unlikely() only wraps half the if, but it seems to get slower any other
|
|
// way we've tried: https://github.com/simdjson/simdjson/pull/990#discussion_r448497331
|
|
// To future reader: we'd love if someone found a better way, or at least could explain this result!
|
|
if (simdjson_unlikely(exponent < simdjson::internal::smallest_power) || (exponent > simdjson::internal::largest_power)) {
|
|
//
|
|
// Important: smallest_power is such that it leads to a zero value.
|
|
// Observe that 18446744073709551615e-343 == 0, i.e. (2**64 - 1) e -343 is zero
|
|
// so something x 10^-343 goes to zero, but not so with something x 10^-342.
|
|
static_assert(simdjson::internal::smallest_power <= -342, "smallest_power is not small enough");
|
|
//
|
|
if((exponent < simdjson::internal::smallest_power) || (i == 0)) {
|
|
// E.g. Parse "-0.0e-999" into the same value as "-0.0". See https://en.wikipedia.org/wiki/Signed_zero
|
|
WRITE_DOUBLE(negative ? -0.0 : 0.0, src, writer);
|
|
return SUCCESS;
|
|
} else { // (exponent > largest_power) and (i != 0)
|
|
// We have, for sure, an infinite value and simdjson refuses to parse infinite values.
|
|
return INVALID_NUMBER(src);
|
|
}
|
|
}
|
|
double d;
|
|
if (!compute_float_64(exponent, i, negative, d)) {
|
|
// we are almost never going to get here.
|
|
if (!parse_float_fallback(src, &d)) { return INVALID_NUMBER(src); }
|
|
}
|
|
WRITE_DOUBLE(d, src, writer);
|
|
return SUCCESS;
|
|
}
|
|
|
|
// parse the number at src
|
|
// define JSON_TEST_NUMBERS for unit testing
|
|
//
|
|
// It is assumed that the number is followed by a structural ({,},],[) character
|
|
// or a white space character. If that is not the case (e.g., when the JSON
|
|
// document is made of a single number), then it is necessary to copy the
|
|
// content and append a space before calling this function.
|
|
//
|
|
// Our objective is accurate parsing (ULP of 0) at high speed.
|
|
template<typename W>
|
|
simdjson_inline error_code parse_number(const uint8_t *const src, W &writer);
|
|
|
|
// for performance analysis, it is sometimes useful to skip parsing
|
|
#ifdef SIMDJSON_SKIPNUMBERPARSING
|
|
|
|
template<typename W>
|
|
simdjson_inline error_code parse_number(const uint8_t *const, W &writer) {
|
|
writer.append_s64(0); // always write zero
|
|
return SUCCESS; // always succeeds
|
|
}
|
|
|
|
// Stand-ins for the scalar parsers when number parsing is skipped: none of them
// looks at its input. The numeric parsers yield zero, the predicates yield
// false, and the number type is always reported as a signed integer.
simdjson_unused simdjson_inline simdjson_result<uint64_t> parse_unsigned(const uint8_t * const src) noexcept {
  return 0;
}
simdjson_unused simdjson_inline simdjson_result<int64_t> parse_integer(const uint8_t * const src) noexcept {
  return 0;
}
simdjson_unused simdjson_inline simdjson_result<double> parse_double(const uint8_t * const src) noexcept {
  return 0;
}
simdjson_unused simdjson_inline simdjson_result<uint64_t> parse_unsigned_in_string(const uint8_t * const src) noexcept {
  return 0;
}
simdjson_unused simdjson_inline simdjson_result<int64_t> parse_integer_in_string(const uint8_t * const src) noexcept {
  return 0;
}
simdjson_unused simdjson_inline simdjson_result<double> parse_double_in_string(const uint8_t * const src) noexcept {
  return 0;
}
simdjson_unused simdjson_inline bool is_negative(const uint8_t * src) noexcept {
  return false;
}
simdjson_unused simdjson_inline simdjson_result<bool> is_integer(const uint8_t * src) noexcept {
  return false;
}
simdjson_unused simdjson_inline simdjson_result<number_type> get_number_type(const uint8_t * src) noexcept {
  return number_type::signed_integer;
}
|
|
#else
|
|
|
|
// parse the number at src
|
|
// define JSON_TEST_NUMBERS for unit testing
|
|
//
|
|
// It is assumed that the number is followed by a structural ({,},],[) character
|
|
// or a white space character. If that is not the case (e.g., when the JSON
|
|
// document is made of a single number), then it is necessary to copy the
|
|
// content and append a space before calling this function.
|
|
//
|
|
// Our objective is accurate parsing (ULP of 0) at high speed.
|
|
template<typename W>
|
|
simdjson_inline error_code parse_number(const uint8_t *const src, W &writer) {
|
|
|
|
//
|
|
// Check for minus sign
|
|
//
|
|
bool negative = (*src == '-');
|
|
const uint8_t *p = src + uint8_t(negative);
|
|
|
|
//
|
|
// Parse the integer part.
|
|
//
|
|
// PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
|
|
const uint8_t *const start_digits = p;
|
|
uint64_t i = 0;
|
|
while (parse_digit(*p, i)) { p++; }
|
|
|
|
// If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
|
|
// Optimization note: size_t is expected to be unsigned.
|
|
size_t digit_count = size_t(p - start_digits);
|
|
if (digit_count == 0 || ('0' == *start_digits && digit_count > 1)) { return INVALID_NUMBER(src); }
|
|
|
|
//
|
|
// Handle floats if there is a . or e (or both)
|
|
//
|
|
int64_t exponent = 0;
|
|
bool is_float = false;
|
|
if ('.' == *p) {
|
|
is_float = true;
|
|
++p;
|
|
SIMDJSON_TRY( parse_decimal_after_separator(src, p, i, exponent) );
|
|
digit_count = int(p - start_digits); // used later to guard against overflows
|
|
}
|
|
if (('e' == *p) || ('E' == *p)) {
|
|
is_float = true;
|
|
++p;
|
|
SIMDJSON_TRY( parse_exponent(src, p, exponent) );
|
|
}
|
|
if (is_float) {
|
|
const bool dirty_end = jsoncharutils::is_not_structural_or_whitespace(*p);
|
|
SIMDJSON_TRY( write_float(src, negative, i, start_digits, digit_count, exponent, writer) );
|
|
if (dirty_end) { return INVALID_NUMBER(src); }
|
|
return SUCCESS;
|
|
}
|
|
|
|
// The longest negative 64-bit number is 19 digits.
|
|
// The longest positive 64-bit number is 20 digits.
|
|
// We do it this way so we don't trigger this branch unless we must.
|
|
size_t longest_digit_count = negative ? 19 : 20;
|
|
if (digit_count > longest_digit_count) { return BIGINT_NUMBER(src); }
|
|
if (digit_count == longest_digit_count) {
|
|
if (negative) {
|
|
// Anything negative above INT64_MAX+1 is invalid
|
|
if (i > uint64_t(INT64_MAX)+1) { return BIGINT_NUMBER(src); }
|
|
WRITE_INTEGER(~i+1, src, writer);
|
|
if (jsoncharutils::is_not_structural_or_whitespace(*p)) { return INVALID_NUMBER(src); }
|
|
return SUCCESS;
|
|
// Positive overflow check:
|
|
// - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the
|
|
// biggest uint64_t.
|
|
// - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX.
|
|
// If we got here, it's a 20 digit number starting with the digit "1".
|
|
// - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller
|
|
// than 1,553,255,926,290,448,384.
|
|
// - That is smaller than the smallest possible 20-digit number the user could write:
|
|
// 10,000,000,000,000,000,000.
|
|
// - Therefore, if the number is positive and lower than that, it's overflow.
|
|
// - The value we are looking at is less than or equal to INT64_MAX.
|
|
//
|
|
} else if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INVALID_NUMBER(src); }
|
|
}
|
|
|
|
// Write unsigned if it does not fit in a signed integer.
|
|
if (i > uint64_t(INT64_MAX)) {
|
|
WRITE_UNSIGNED(i, src, writer);
|
|
} else {
|
|
WRITE_INTEGER(negative ? (~i+1) : i, src, writer);
|
|
}
|
|
if (jsoncharutils::is_not_structural_or_whitespace(*p)) { return INVALID_NUMBER(src); }
|
|
return SUCCESS;
|
|
}
|
|
|
|
// Inlineable functions
|
|
namespace {
|
|
|
|
// This table can be used to characterize the final character of an integer
|
|
// string. For JSON structural character and allowable white space characters,
|
|
// we return SUCCESS. For 'e', '.' and 'E', we return INCORRECT_TYPE. Otherwise
|
|
// we return NUMBER_ERROR.
|
|
// Optimization note: we could easily reduce the size of the table by half (to 128)
|
|
// at the cost of an extra branch.
|
|
// Optimization note: we want the values to use at most 8 bits (not, e.g., 32 bits):
|
|
static_assert(error_code(uint8_t(NUMBER_ERROR))== NUMBER_ERROR, "bad NUMBER_ERROR cast");
|
|
static_assert(error_code(uint8_t(SUCCESS))== SUCCESS, "bad NUMBER_ERROR cast");
|
|
static_assert(error_code(uint8_t(INCORRECT_TYPE))== INCORRECT_TYPE, "bad NUMBER_ERROR cast");
|
|
|
|
// Classification of the byte that follows the digits of an integer, indexed by
// byte value and laid out eight entries per row (row offset in the comment):
//  - SUCCESS for tab, line feed, carriage return, space, ',', ':', '[', ']', '{', '}';
//  - INCORRECT_TYPE for '.', 'E' and 'e';
//  - NUMBER_ERROR for every other byte.
const uint8_t integer_string_finisher[256] = {
    /* 0x00 */ NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    /* 0x08 */ NUMBER_ERROR, SUCCESS, SUCCESS, NUMBER_ERROR, NUMBER_ERROR, SUCCESS, NUMBER_ERROR, NUMBER_ERROR,
    /* 0x10 */ NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    /* 0x18 */ NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    /* 0x20 */ SUCCESS, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    /* 0x28 */ NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, SUCCESS, NUMBER_ERROR, INCORRECT_TYPE, NUMBER_ERROR,
    /* 0x30 */ NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    /* 0x38 */ NUMBER_ERROR, NUMBER_ERROR, SUCCESS, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    /* 0x40 */ NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, INCORRECT_TYPE, NUMBER_ERROR, NUMBER_ERROR,
    /* 0x48 */ NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    /* 0x50 */ NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    /* 0x58 */ NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, SUCCESS, NUMBER_ERROR, SUCCESS, NUMBER_ERROR, NUMBER_ERROR,
    /* 0x60 */ NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, INCORRECT_TYPE, NUMBER_ERROR, NUMBER_ERROR,
    /* 0x68 */ NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    /* 0x70 */ NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    /* 0x78 */ NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, SUCCESS, NUMBER_ERROR, SUCCESS, NUMBER_ERROR, NUMBER_ERROR,
    /* 0x80 */ NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    /* 0x88 */ NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    /* 0x90 */ NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    /* 0x98 */ NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    /* 0xA0 */ NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    /* 0xA8 */ NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    /* 0xB0 */ NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    /* 0xB8 */ NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    /* 0xC0 */ NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    /* 0xC8 */ NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    /* 0xD0 */ NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    /* 0xD8 */ NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    /* 0xE0 */ NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    /* 0xE8 */ NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    /* 0xF0 */ NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    /* 0xF8 */ NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR};
|
|
|
|
// Parse any number from 0 to 18,446,744,073,709,551,615
|
|
simdjson_unused simdjson_inline simdjson_result<uint64_t> parse_unsigned(const uint8_t * const src) noexcept {
|
|
const uint8_t *p = src;
|
|
//
|
|
// Parse the integer part.
|
|
//
|
|
// PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
|
|
const uint8_t *const start_digits = p;
|
|
uint64_t i = 0;
|
|
while (parse_digit(*p, i)) { p++; }
|
|
|
|
// If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
|
|
// Optimization note: size_t is expected to be unsigned.
|
|
size_t digit_count = size_t(p - start_digits);
|
|
// The longest positive 64-bit number is 20 digits.
|
|
// We do it this way so we don't trigger this branch unless we must.
|
|
// Optimization note: the compiler can probably merge
|
|
// ((digit_count == 0) || (digit_count > 20))
|
|
// into a single branch since digit_count is unsigned.
|
|
if ((digit_count == 0) || (digit_count > 20)) { return INCORRECT_TYPE; }
|
|
// Here digit_count > 0.
|
|
if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; }
|
|
// We can do the following...
|
|
// if (!jsoncharutils::is_structural_or_whitespace(*p)) {
|
|
// return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR;
|
|
// }
|
|
// as a single table lookup:
|
|
if (integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); }
|
|
|
|
if (digit_count == 20) {
|
|
// Positive overflow check:
|
|
// - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the
|
|
// biggest uint64_t.
|
|
// - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX.
|
|
// If we got here, it's a 20 digit number starting with the digit "1".
|
|
// - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller
|
|
// than 1,553,255,926,290,448,384.
|
|
// - That is smaller than the smallest possible 20-digit number the user could write:
|
|
// 10,000,000,000,000,000,000.
|
|
// - Therefore, if the number is positive and lower than that, it's overflow.
|
|
// - The value we are looking at is less than or equal to INT64_MAX.
|
|
//
|
|
if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INCORRECT_TYPE; }
|
|
}
|
|
|
|
return i;
|
|
}
|
|
|
|
|
|
// Parse any number from 0 to 18,446,744,073,709,551,615
|
|
// Never read at src_end or beyond
|
|
simdjson_unused simdjson_inline simdjson_result<uint64_t> parse_unsigned(const uint8_t * const src, const uint8_t * const src_end) noexcept {
|
|
const uint8_t *p = src;
|
|
//
|
|
// Parse the integer part.
|
|
//
|
|
// PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
|
|
const uint8_t *const start_digits = p;
|
|
uint64_t i = 0;
|
|
while ((p != src_end) && parse_digit(*p, i)) { p++; }
|
|
|
|
// If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
|
|
// Optimization note: size_t is expected to be unsigned.
|
|
size_t digit_count = size_t(p - start_digits);
|
|
// The longest positive 64-bit number is 20 digits.
|
|
// We do it this way so we don't trigger this branch unless we must.
|
|
// Optimization note: the compiler can probably merge
|
|
// ((digit_count == 0) || (digit_count > 20))
|
|
// into a single branch since digit_count is unsigned.
|
|
if ((digit_count == 0) || (digit_count > 20)) { return INCORRECT_TYPE; }
|
|
// Here digit_count > 0.
|
|
if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; }
|
|
// We can do the following...
|
|
// if (!jsoncharutils::is_structural_or_whitespace(*p)) {
|
|
// return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR;
|
|
// }
|
|
// as a single table lookup:
|
|
if ((p != src_end) && integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); }
|
|
|
|
if (digit_count == 20) {
|
|
// Positive overflow check:
|
|
// - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the
|
|
// biggest uint64_t.
|
|
// - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX.
|
|
// If we got here, it's a 20 digit number starting with the digit "1".
|
|
// - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller
|
|
// than 1,553,255,926,290,448,384.
|
|
// - That is smaller than the smallest possible 20-digit number the user could write:
|
|
// 10,000,000,000,000,000,000.
|
|
// - Therefore, if the number is positive and lower than that, it's overflow.
|
|
// - The value we are looking at is less than or equal to INT64_MAX.
|
|
//
|
|
if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INCORRECT_TYPE; }
|
|
}
|
|
|
|
return i;
|
|
}
|
|
|
|
// Parse any number from 0 to 18,446,744,073,709,551,615
|
|
simdjson_unused simdjson_inline simdjson_result<uint64_t> parse_unsigned_in_string(const uint8_t * const src) noexcept {
|
|
const uint8_t *p = src + 1;
|
|
//
|
|
// Parse the integer part.
|
|
//
|
|
// PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
|
|
const uint8_t *const start_digits = p;
|
|
uint64_t i = 0;
|
|
while (parse_digit(*p, i)) { p++; }
|
|
|
|
// If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
|
|
// Optimization note: size_t is expected to be unsigned.
|
|
size_t digit_count = size_t(p - start_digits);
|
|
// The longest positive 64-bit number is 20 digits.
|
|
// We do it this way so we don't trigger this branch unless we must.
|
|
// Optimization note: the compiler can probably merge
|
|
// ((digit_count == 0) || (digit_count > 20))
|
|
// into a single branch since digit_count is unsigned.
|
|
if ((digit_count == 0) || (digit_count > 20)) { return INCORRECT_TYPE; }
|
|
// Here digit_count > 0.
|
|
if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; }
|
|
// We can do the following...
|
|
// if (!jsoncharutils::is_structural_or_whitespace(*p)) {
|
|
// return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR;
|
|
// }
|
|
// as a single table lookup:
|
|
if (*p != '"') { return NUMBER_ERROR; }
|
|
|
|
if (digit_count == 20) {
|
|
// Positive overflow check:
|
|
// - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the
|
|
// biggest uint64_t.
|
|
// - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX.
|
|
// If we got here, it's a 20 digit number starting with the digit "1".
|
|
// - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller
|
|
// than 1,553,255,926,290,448,384.
|
|
// - That is smaller than the smallest possible 20-digit number the user could write:
|
|
// 10,000,000,000,000,000,000.
|
|
// - Therefore, if the number is positive and lower than that, it's overflow.
|
|
// - The value we are looking at is less than or equal to INT64_MAX.
|
|
//
|
|
// Note: we use src[1] and not src[0] because src[0] is the quote character in this
|
|
// instance.
|
|
if (src[1] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INCORRECT_TYPE; }
|
|
}
|
|
|
|
return i;
|
|
}
|
|
|
|
// Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
|
|
simdjson_unused simdjson_inline simdjson_result<int64_t> parse_integer(const uint8_t *src) noexcept {
|
|
//
|
|
// Check for minus sign
|
|
//
|
|
bool negative = (*src == '-');
|
|
const uint8_t *p = src + uint8_t(negative);
|
|
|
|
//
|
|
// Parse the integer part.
|
|
//
|
|
// PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
|
|
const uint8_t *const start_digits = p;
|
|
uint64_t i = 0;
|
|
while (parse_digit(*p, i)) { p++; }
|
|
|
|
// If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
|
|
// Optimization note: size_t is expected to be unsigned.
|
|
size_t digit_count = size_t(p - start_digits);
|
|
// We go from
|
|
// -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
|
|
// so we can never represent numbers that have more than 19 digits.
|
|
size_t longest_digit_count = 19;
|
|
// Optimization note: the compiler can probably merge
|
|
// ((digit_count == 0) || (digit_count > longest_digit_count))
|
|
// into a single branch since digit_count is unsigned.
|
|
if ((digit_count == 0) || (digit_count > longest_digit_count)) { return INCORRECT_TYPE; }
|
|
// Here digit_count > 0.
|
|
if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; }
|
|
// We can do the following...
|
|
// if (!jsoncharutils::is_structural_or_whitespace(*p)) {
|
|
// return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR;
|
|
// }
|
|
// as a single table lookup:
|
|
if(integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); }
|
|
// Negative numbers have can go down to - INT64_MAX - 1 whereas positive numbers are limited to INT64_MAX.
|
|
// Performance note: This check is only needed when digit_count == longest_digit_count but it is
|
|
// so cheap that we might as well always make it.
|
|
if(i > uint64_t(INT64_MAX) + uint64_t(negative)) { return INCORRECT_TYPE; }
|
|
return negative ? (~i+1) : i;
|
|
}
|
|
|
|
// Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
|
|
// Never read at src_end or beyond
|
|
simdjson_unused simdjson_inline simdjson_result<int64_t> parse_integer(const uint8_t * const src, const uint8_t * const src_end) noexcept {
|
|
//
|
|
// Check for minus sign
|
|
//
|
|
if(src == src_end) { return NUMBER_ERROR; }
|
|
bool negative = (*src == '-');
|
|
const uint8_t *p = src + uint8_t(negative);
|
|
|
|
//
|
|
// Parse the integer part.
|
|
//
|
|
// PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
|
|
const uint8_t *const start_digits = p;
|
|
uint64_t i = 0;
|
|
while ((p != src_end) && parse_digit(*p, i)) { p++; }
|
|
|
|
// If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
|
|
// Optimization note: size_t is expected to be unsigned.
|
|
size_t digit_count = size_t(p - start_digits);
|
|
// We go from
|
|
// -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
|
|
// so we can never represent numbers that have more than 19 digits.
|
|
size_t longest_digit_count = 19;
|
|
// Optimization note: the compiler can probably merge
|
|
// ((digit_count == 0) || (digit_count > longest_digit_count))
|
|
// into a single branch since digit_count is unsigned.
|
|
if ((digit_count == 0) || (digit_count > longest_digit_count)) { return INCORRECT_TYPE; }
|
|
// Here digit_count > 0.
|
|
if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; }
|
|
// We can do the following...
|
|
// if (!jsoncharutils::is_structural_or_whitespace(*p)) {
|
|
// return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR;
|
|
// }
|
|
// as a single table lookup:
|
|
if((p != src_end) && integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); }
|
|
// Negative numbers have can go down to - INT64_MAX - 1 whereas positive numbers are limited to INT64_MAX.
|
|
// Performance note: This check is only needed when digit_count == longest_digit_count but it is
|
|
// so cheap that we might as well always make it.
|
|
if(i > uint64_t(INT64_MAX) + uint64_t(negative)) { return INCORRECT_TYPE; }
|
|
return negative ? (~i+1) : i;
|
|
}
|
|
|
|
// Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
|
|
simdjson_unused simdjson_inline simdjson_result<int64_t> parse_integer_in_string(const uint8_t *src) noexcept {
|
|
//
|
|
// Check for minus sign
|
|
//
|
|
bool negative = (*(src + 1) == '-');
|
|
src += uint8_t(negative) + 1;
|
|
|
|
//
|
|
// Parse the integer part.
|
|
//
|
|
// PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
|
|
const uint8_t *const start_digits = src;
|
|
uint64_t i = 0;
|
|
while (parse_digit(*src, i)) { src++; }
|
|
|
|
// If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
|
|
// Optimization note: size_t is expected to be unsigned.
|
|
size_t digit_count = size_t(src - start_digits);
|
|
// We go from
|
|
// -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
|
|
// so we can never represent numbers that have more than 19 digits.
|
|
size_t longest_digit_count = 19;
|
|
// Optimization note: the compiler can probably merge
|
|
// ((digit_count == 0) || (digit_count > longest_digit_count))
|
|
// into a single branch since digit_count is unsigned.
|
|
if ((digit_count == 0) || (digit_count > longest_digit_count)) { return INCORRECT_TYPE; }
|
|
// Here digit_count > 0.
|
|
if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; }
|
|
// We can do the following...
|
|
// if (!jsoncharutils::is_structural_or_whitespace(*src)) {
|
|
// return (*src == '.' || *src == 'e' || *src == 'E') ? INCORRECT_TYPE : NUMBER_ERROR;
|
|
// }
|
|
// as a single table lookup:
|
|
if(*src != '"') { return NUMBER_ERROR; }
|
|
// Negative numbers have can go down to - INT64_MAX - 1 whereas positive numbers are limited to INT64_MAX.
|
|
// Performance note: This check is only needed when digit_count == longest_digit_count but it is
|
|
// so cheap that we might as well always make it.
|
|
if(i > uint64_t(INT64_MAX) + uint64_t(negative)) { return INCORRECT_TYPE; }
|
|
return negative ? (~i+1) : i;
|
|
}
|
|
|
|
// Parses the number at src as a double. Integers are accepted as well.
//  - src: first character of the candidate number (an optional '-' then digits).
// Returns the value on success. Otherwise returns
//  - INCORRECT_TYPE when no digit follows the optional sign;
//  - NUMBER_ERROR for a leading zero followed by more digits, a decimal point
//    without digits, an exponent with no digit or more than 19 digits, a number
//    followed by something other than a structural or white-space character, or
//    when the slow fallback parser rejects the input.
simdjson_unused simdjson_inline simdjson_result<double> parse_double(const uint8_t * src) noexcept {
  //
  // Optional minus sign; src is moved past it.
  //
  const bool negative = (*src == '-');
  src += uint8_t(negative);

  //
  // Integer part. The first digit is handled on its own so that a leading zero
  // can be recognised.
  //
  uint64_t mantissa = 0;
  const uint8_t *cursor = src;
  if (parse_digit(*cursor, mantissa)) { ++cursor; }
  const bool leading_zero = (mantissa == 0);
  while (parse_digit(*cursor, mantissa)) { ++cursor; }
  // No integer digits at all.
  if (cursor == src) { return INCORRECT_TYPE; }
  // 0123 is not allowed: a zero must be the only integer digit.
  if (leading_zero && (cursor != src + 1)) { return NUMBER_ERROR; }

  //
  // Decimal part.
  //
  int64_t exponent = 0;
  bool overflow;
  if (simdjson_likely(*cursor == '.')) {
    ++cursor;
    const uint8_t *const fraction_begin = cursor;
    // At least one decimal digit is required.
    if (!parse_digit(*cursor, mantissa)) { return NUMBER_ERROR; }
    ++cursor;
    while (parse_digit(*cursor, mantissa)) { ++cursor; }
    exponent = -(cursor - fraction_begin);

    // More than 19 digits (the decimal point excluded) may have overflowed the mantissa.
    overflow = cursor-src-1 > 19;
    if (simdjson_unlikely(overflow && leading_zero)) {
      // The number starts with "0.": skip the zeroes after the point and check again.
      const uint8_t *first_nonzero = src + 2;
      while (*first_nonzero == '0') { ++first_nonzero; }
      overflow = cursor-first_nonzero > 19;
    }
  } else {
    overflow = cursor-src > 19;
  }

  //
  // Exponent.
  //
  if (*cursor == 'e' || *cursor == 'E') {
    ++cursor;
    const bool exp_neg = (*cursor == '-');
    if (exp_neg || *cursor == '+') { ++cursor; }

    uint64_t exp = 0;
    const uint8_t *const exp_begin = cursor;
    while (parse_digit(*cursor, exp)) { ++cursor; }
    // Reject an exponent without digits, or with 20 or more of them.
    if (cursor-exp_begin == 0 || cursor-exp_begin > 19) { return NUMBER_ERROR; }

    exponent += exp_neg ? 0-exp : exp;
  }

  if (jsoncharutils::is_not_structural_or_whitespace(*cursor)) { return NUMBER_ERROR; }

  // Exponents outside the supported powers of ten also take the slow path.
  overflow = overflow || exponent < simdjson::internal::smallest_power || exponent > simdjson::internal::largest_power;

  //
  // Assemble the double, or parse it slowly when the fast path is unavailable.
  //
  double result;
  if (simdjson_likely(!overflow)) {
    if (compute_float_64(exponent, mantissa, negative, result)) { return result; }
  }
  // The fallback parses from the very beginning of the number, sign included.
  if (!parse_float_fallback(src - uint8_t(negative), &result)) {
    return NUMBER_ERROR;
  }
  return result;
}
|
|
|
|
/**
 * Tell whether the number starting at src begins with a minus sign.
 *
 * @param src Pointer to the first character of the number; only that byte is read.
 */
simdjson_unused simdjson_inline bool is_negative(const uint8_t * src) noexcept {
  return src[0] == '-';
}
|
|
|
|
/**
 * Tell whether the number starting at src is written as an integer, i.e. an
 * optional '-' followed only by digits up to a structural or whitespace character.
 *
 * @param src Pointer to the first character of the number.
 * @return true for an integer, false when the digits are followed by anything else
 *         (such as '.', 'e' or 'E'), NUMBER_ERROR when there are no digits at all.
 */
simdjson_unused simdjson_inline simdjson_result<bool> is_integer(const uint8_t * src) noexcept {
  // Skip the optional sign.
  if (*src == '-') { src++; }
  // Walk over the run of ASCII digits; the unsigned subtraction rejects anything outside '0'..'9'.
  const uint8_t *cursor = src;
  while (static_cast<uint8_t>(*cursor - '0') <= 9) { cursor++; }
  if (cursor == src) { return NUMBER_ERROR; }
  return jsoncharutils::is_structural_or_whitespace(*cursor) != 0;
}
|
|
|
|
/**
 * Classify the number starting at src without computing its value.
 *
 * @param src Pointer to the first character of the number (an optional '-' or a digit).
 * @return NUMBER_ERROR when there are no digits; floating_point_number when the digits
 *         are not followed by a structural or whitespace character; otherwise
 *         signed_integer, unsigned_integer or big_integer depending on the magnitude.
 */
simdjson_unused simdjson_inline simdjson_result<number_type> get_number_type(const uint8_t * src) noexcept {
  const bool negative = (*src == '-');
  if (negative) { src++; }

  const uint8_t *cursor = src;
  while (static_cast<uint8_t>(*cursor - '0') <= 9) { cursor++; }
  const size_t digit_count = size_t(cursor - src);
  if (digit_count == 0) { return NUMBER_ERROR; }

  // Hopefully, we have 'e' or 'E' or '.'.
  if (!jsoncharutils::is_structural_or_whitespace(*cursor)) {
    return number_type::floating_point_number;
  }

  // We have an integer. Magnitudes are compared as digit strings of equal length.
  const char *const two_to_sixtythree = "9223372036854775808";   // 2**63, 19 digits
  const char *const two_to_sixtyfour = "18446744073709551616";   // 2**64, 20 digits

  if (simdjson_unlikely(digit_count > 20)) {
    return number_type::big_integer;
  }

  if (negative) {
    // A negative number is either a signed integer or too big: it fits when its
    // magnitude is at most 2**63.
    if (simdjson_unlikely(digit_count > 19)) { return number_type::big_integer; }
    if (simdjson_unlikely(digit_count == 19 && memcmp(src, two_to_sixtythree, 19) > 0)) {
      return number_type::big_integer;
    }
    return number_type::signed_integer;
  }

  // Positive with exactly 20 digits: big when >= 2**64, otherwise unsigned.
  if (digit_count == 20) {
    return (memcmp(src, two_to_sixtyfour, 20) >= 0) ? number_type::big_integer
                                                     : number_type::unsigned_integer;
  }
  // Positive with 19 digits: values >= 2**63 are reported as unsigned integers.
  if (digit_count == 19 && memcmp(src, two_to_sixtythree, 19) >= 0) {
    return number_type::unsigned_integer;
  }
  return number_type::signed_integer;
}
|
|
|
|
// Never read at src_end or beyond
|
|
simdjson_unused simdjson_inline simdjson_result<double> parse_double(const uint8_t * src, const uint8_t * const src_end) noexcept {
|
|
if(src == src_end) { return NUMBER_ERROR; }
|
|
//
|
|
// Check for minus sign
|
|
//
|
|
bool negative = (*src == '-');
|
|
src += uint8_t(negative);
|
|
|
|
//
|
|
// Parse the integer part.
|
|
//
|
|
uint64_t i = 0;
|
|
const uint8_t *p = src;
|
|
if(p == src_end) { return NUMBER_ERROR; }
|
|
p += parse_digit(*p, i);
|
|
bool leading_zero = (i == 0);
|
|
while ((p != src_end) && parse_digit(*p, i)) { p++; }
|
|
// no integer digits, or 0123 (zero must be solo)
|
|
if ( p == src ) { return INCORRECT_TYPE; }
|
|
if ( (leading_zero && p != src+1)) { return NUMBER_ERROR; }
|
|
|
|
//
|
|
// Parse the decimal part.
|
|
//
|
|
int64_t exponent = 0;
|
|
bool overflow;
|
|
if (simdjson_likely((p != src_end) && (*p == '.'))) {
|
|
p++;
|
|
const uint8_t *start_decimal_digits = p;
|
|
if ((p == src_end) || !parse_digit(*p, i)) { return NUMBER_ERROR; } // no decimal digits
|
|
p++;
|
|
while ((p != src_end) && parse_digit(*p, i)) { p++; }
|
|
exponent = -(p - start_decimal_digits);
|
|
|
|
// Overflow check. More than 19 digits (minus the decimal) may be overflow.
|
|
overflow = p-src-1 > 19;
|
|
if (simdjson_unlikely(overflow && leading_zero)) {
|
|
// Skip leading 0.00000 and see if it still overflows
|
|
const uint8_t *start_digits = src + 2;
|
|
while (*start_digits == '0') { start_digits++; }
|
|
overflow = start_digits-src > 19;
|
|
}
|
|
} else {
|
|
overflow = p-src > 19;
|
|
}
|
|
|
|
//
|
|
// Parse the exponent
|
|
//
|
|
if ((p != src_end) && (*p == 'e' || *p == 'E')) {
|
|
p++;
|
|
if(p == src_end) { return NUMBER_ERROR; }
|
|
bool exp_neg = *p == '-';
|
|
p += exp_neg || *p == '+';
|
|
|
|
uint64_t exp = 0;
|
|
const uint8_t *start_exp_digits = p;
|
|
while ((p != src_end) && parse_digit(*p, exp)) { p++; }
|
|
// no exp digits, or 20+ exp digits
|
|
if (p-start_exp_digits == 0 || p-start_exp_digits > 19) { return NUMBER_ERROR; }
|
|
|
|
exponent += exp_neg ? 0-exp : exp;
|
|
}
|
|
|
|
if ((p != src_end) && jsoncharutils::is_not_structural_or_whitespace(*p)) { return NUMBER_ERROR; }
|
|
|
|
overflow = overflow || exponent < simdjson::internal::smallest_power || exponent > simdjson::internal::largest_power;
|
|
|
|
//
|
|
// Assemble (or slow-parse) the float
|
|
//
|
|
double d;
|
|
if (simdjson_likely(!overflow)) {
|
|
if (compute_float_64(exponent, i, negative, d)) { return d; }
|
|
}
|
|
if (!parse_float_fallback(src - uint8_t(negative), src_end, &d)) {
|
|
return NUMBER_ERROR;
|
|
}
|
|
return d;
|
|
}
|
|
|
|
/**
 * Parse a double written inside a JSON string, e.g. "1.5".
 *
 * @param src Pointer to the byte just before the number (its value is not checked
 *            here; presumably the opening quote). The number must be followed by '"'.
 * @return The parsed value; INCORRECT_TYPE when there is no integer digit; NUMBER_ERROR
 *         for a leading zero followed by more digits, a '.' without decimal digits, an
 *         exponent with no digits or more than 19 digits, a number not immediately
 *         followed by '"', or a failure of the slow-path parser.
 */
simdjson_unused simdjson_inline simdjson_result<double> parse_double_in_string(const uint8_t * src) noexcept {
  // Step over the leading byte, then over an optional minus sign.
  src++;
  const bool negative = (*src == '-');
  if (negative) { src++; }

  // Integer part. All decimal digits (integer and fraction) accumulate into `digits`.
  uint64_t digits = 0;
  const uint8_t *cursor = src;
  if (parse_digit(*cursor, digits)) { cursor++; }
  const bool leading_zero = (digits == 0);
  while (parse_digit(*cursor, digits)) { cursor++; }
  // No integer digits at all.
  if (cursor == src) { return INCORRECT_TYPE; }
  // A leading zero must stand alone (0123 is invalid).
  if (leading_zero && (cursor != src + 1)) { return NUMBER_ERROR; }

  // Fractional part.
  int64_t exponent = 0;
  bool overflow;
  if (simdjson_likely(*cursor == '.')) {
    cursor++;
    const uint8_t *const fraction_begin = cursor;
    // At least one decimal digit is mandatory after the '.'.
    if (!parse_digit(*cursor, digits)) { return NUMBER_ERROR; }
    cursor++;
    while (parse_digit(*cursor, digits)) { cursor++; }
    exponent = fraction_begin - cursor;

    // More than 19 digits (not counting the '.') may have overflowed `digits`.
    overflow = cursor - src - 1 > 19;
    if (simdjson_unlikely(overflow && leading_zero)) {
      // Leading zeros of 0.000... do not count: recheck from the first non-zero digit.
      const uint8_t *first_nonzero = src + 2;
      while (*first_nonzero == '0') { first_nonzero++; }
      overflow = cursor - first_nonzero > 19;
    }
  } else {
    overflow = cursor - src > 19;
  }

  // Exponent part.
  if (*cursor == 'e' || *cursor == 'E') {
    cursor++;
    const bool exp_neg = (*cursor == '-');
    if (exp_neg || *cursor == '+') { cursor++; }

    uint64_t exp = 0;
    const uint8_t *const exp_begin = cursor;
    while (parse_digit(*cursor, exp)) { cursor++; }
    // Reject an empty exponent as well as one with 20 or more digits.
    const auto exp_digit_count = cursor - exp_begin;
    if (exp_digit_count == 0 || exp_digit_count > 19) { return NUMBER_ERROR; }

    exponent += exp_neg ? 0-exp : exp;
  }

  // The closing quote must come right after the number.
  if (*cursor != '"') { return NUMBER_ERROR; }

  overflow = overflow || exponent < simdjson::internal::smallest_power || exponent > simdjson::internal::largest_power;

  // Fast path when nothing overflowed; otherwise (or if the fast path declines)
  // re-parse from the sign with the fallback parser.
  double d;
  if (simdjson_likely(!overflow) && compute_float_64(exponent, digits, negative, d)) { return d; }
  if (!parse_float_fallback(src - uint8_t(negative), &d)) { return NUMBER_ERROR; }
  return d;
}
|
|
|
|
} // unnamed namespace
|
|
#endif // SIMDJSON_SKIPNUMBERPARSING
|
|
|
|
} // namespace numberparsing
|
|
|
|
inline std::ostream& operator<<(std::ostream& out, number_type type) noexcept {
|
|
switch (type) {
|
|
case number_type::signed_integer: out << "integer in [-9223372036854775808,9223372036854775808)"; break;
|
|
case number_type::unsigned_integer: out << "unsigned integer in [9223372036854775808,18446744073709551616)"; break;
|
|
case number_type::floating_point_number: out << "floating-point number (binary64)"; break;
|
|
case number_type::big_integer: out << "big integer"; break;
|
|
default: SIMDJSON_UNREACHABLE();
|
|
}
|
|
return out;
|
|
}
|
|
|
|
} // namespace icelake
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_GENERIC_NUMBERPARSING_H
|
|
/* end file simdjson/generic/numberparsing.h for icelake */
|
|
|
|
/* including simdjson/generic/implementation_simdjson_result_base-inl.h for icelake: #include "simdjson/generic/implementation_simdjson_result_base-inl.h" */
|
|
/* begin file simdjson/generic/implementation_simdjson_result_base-inl.h for icelake */
|
|
#ifndef SIMDJSON_GENERIC_IMPLEMENTATION_SIMDJSON_RESULT_BASE_INL_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_GENERIC_IMPLEMENTATION_SIMDJSON_RESULT_BASE_INL_H */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/generic/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/generic/implementation_simdjson_result_base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace icelake {
|
|
|
|
//
|
|
// internal::implementation_simdjson_result_base<T> inline implementation
|
|
//
|
|
|
|
template<typename T>
|
|
simdjson_inline void implementation_simdjson_result_base<T>::tie(T &value, error_code &error) && noexcept {
|
|
error = this->second;
|
|
if (!error) {
|
|
value = std::forward<implementation_simdjson_result_base<T>>(*this).first;
|
|
}
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_warn_unused simdjson_inline error_code implementation_simdjson_result_base<T>::get(T &value) && noexcept {
|
|
error_code error;
|
|
std::forward<implementation_simdjson_result_base<T>>(*this).tie(value, error);
|
|
return error;
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_inline error_code implementation_simdjson_result_base<T>::error() const noexcept {
|
|
return this->second;
|
|
}
|
|
|
|
#if SIMDJSON_EXCEPTIONS
|
|
|
|
template<typename T>
|
|
simdjson_inline T& implementation_simdjson_result_base<T>::value() & noexcept(false) {
|
|
if (error()) { throw simdjson_error(error()); }
|
|
return this->first;
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_inline T&& implementation_simdjson_result_base<T>::value() && noexcept(false) {
|
|
return std::forward<implementation_simdjson_result_base<T>>(*this).take_value();
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_inline T&& implementation_simdjson_result_base<T>::take_value() && noexcept(false) {
|
|
if (error()) { throw simdjson_error(error()); }
|
|
return std::forward<T>(this->first);
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_inline implementation_simdjson_result_base<T>::operator T&&() && noexcept(false) {
|
|
return std::forward<implementation_simdjson_result_base<T>>(*this).take_value();
|
|
}
|
|
|
|
#endif // SIMDJSON_EXCEPTIONS
|
|
|
|
template<typename T>
|
|
simdjson_inline const T& implementation_simdjson_result_base<T>::value_unsafe() const& noexcept {
|
|
return this->first;
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_inline T& implementation_simdjson_result_base<T>::value_unsafe() & noexcept {
|
|
return this->first;
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_inline T&& implementation_simdjson_result_base<T>::value_unsafe() && noexcept {
|
|
return std::forward<T>(this->first);
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_inline implementation_simdjson_result_base<T>::implementation_simdjson_result_base(T &&value, error_code error) noexcept
|
|
: first{std::forward<T>(value)}, second{error} {}
|
|
template<typename T>
|
|
simdjson_inline implementation_simdjson_result_base<T>::implementation_simdjson_result_base(error_code error) noexcept
|
|
: implementation_simdjson_result_base(T{}, error) {}
|
|
template<typename T>
|
|
simdjson_inline implementation_simdjson_result_base<T>::implementation_simdjson_result_base(T &&value) noexcept
|
|
: implementation_simdjson_result_base(std::forward<T>(value), SUCCESS) {}
|
|
|
|
} // namespace icelake
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_GENERIC_IMPLEMENTATION_SIMDJSON_RESULT_BASE_INL_H
|
|
/* end file simdjson/generic/implementation_simdjson_result_base-inl.h for icelake */
|
|
/* end file simdjson/generic/amalgamated.h for icelake */
|
|
/* including simdjson/icelake/end.h: #include "simdjson/icelake/end.h" */
|
|
/* begin file simdjson/icelake/end.h */
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/icelake/base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
#if !SIMDJSON_CAN_ALWAYS_RUN_ICELAKE
|
|
SIMDJSON_UNTARGET_REGION
|
|
#endif
|
|
|
|
/* undefining SIMDJSON_IMPLEMENTATION from "icelake" */
|
|
#undef SIMDJSON_IMPLEMENTATION
|
|
/* end file simdjson/icelake/end.h */
|
|
|
|
#endif // SIMDJSON_ICELAKE_H
|
|
/* end file simdjson/icelake.h */
|
|
/* including simdjson/icelake/implementation.h: #include <simdjson/icelake/implementation.h> */
|
|
/* begin file simdjson/icelake/implementation.h */
|
|
#ifndef SIMDJSON_ICELAKE_IMPLEMENTATION_H
|
|
#define SIMDJSON_ICELAKE_IMPLEMENTATION_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/icelake/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/implementation.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/instruction_set.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
// The constructor may be executed on any host, so we take care not to use SIMDJSON_TARGET_ICELAKE
|
|
namespace simdjson {
|
|
namespace icelake {
|
|
|
|
/**
|
|
* @private
|
|
*/
|
|
class implementation final : public simdjson::implementation {
|
|
public:
|
|
simdjson_inline implementation() : simdjson::implementation(
|
|
"icelake",
|
|
"Intel/AMD AVX512",
|
|
internal::instruction_set::AVX2 | internal::instruction_set::PCLMULQDQ | internal::instruction_set::BMI1 | internal::instruction_set::BMI2 | internal::instruction_set::AVX512F | internal::instruction_set::AVX512DQ | internal::instruction_set::AVX512CD | internal::instruction_set::AVX512BW | internal::instruction_set::AVX512VL | internal::instruction_set::AVX512VBMI2
|
|
) {}
|
|
simdjson_warn_unused error_code create_dom_parser_implementation(
|
|
size_t capacity,
|
|
size_t max_length,
|
|
std::unique_ptr<internal::dom_parser_implementation>& dst
|
|
) const noexcept final;
|
|
simdjson_warn_unused error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final;
|
|
simdjson_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final;
|
|
};
|
|
|
|
} // namespace icelake
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_ICELAKE_IMPLEMENTATION_H
|
|
/* end file simdjson/icelake/implementation.h */
|
|
|
|
// defining SIMDJSON_GENERIC_JSON_STRUCTURAL_INDEXER_CUSTOM_BIT_INDEXER allows us to provide our own bit_indexer::write
|
|
#define SIMDJSON_GENERIC_JSON_STRUCTURAL_INDEXER_CUSTOM_BIT_INDEXER
|
|
|
|
/* including simdjson/icelake/begin.h: #include <simdjson/icelake/begin.h> */
|
|
/* begin file simdjson/icelake/begin.h */
|
|
/* defining SIMDJSON_IMPLEMENTATION to "icelake" */
|
|
#define SIMDJSON_IMPLEMENTATION icelake
|
|
/* including simdjson/icelake/base.h: #include "simdjson/icelake/base.h" */
|
|
/* begin file simdjson/icelake/base.h */
|
|
#ifndef SIMDJSON_ICELAKE_BASE_H
|
|
#define SIMDJSON_ICELAKE_BASE_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
// The constructor may be executed on any host, so we take care not to use SIMDJSON_TARGET_ICELAKE
|
|
namespace simdjson {
|
|
/**
|
|
* Implementation for Icelake (Intel AVX512).
|
|
*/
|
|
namespace icelake {
|
|
|
|
class implementation;
|
|
|
|
} // namespace icelake
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_ICELAKE_BASE_H
|
|
/* end file simdjson/icelake/base.h */
|
|
/* including simdjson/icelake/intrinsics.h: #include "simdjson/icelake/intrinsics.h" */
|
|
/* begin file simdjson/icelake/intrinsics.h */
|
|
#ifndef SIMDJSON_ICELAKE_INTRINSICS_H
|
|
#define SIMDJSON_ICELAKE_INTRINSICS_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/icelake/base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
#if SIMDJSON_VISUAL_STUDIO
|
|
// under clang within visual studio, this will include <x86intrin.h>
|
|
#include <intrin.h> // visual studio or clang
|
|
#else
|
|
#include <x86intrin.h> // elsewhere
|
|
#endif // SIMDJSON_VISUAL_STUDIO
|
|
|
|
#if SIMDJSON_CLANG_VISUAL_STUDIO
|
|
/**
|
|
* You are not supposed, normally, to include these
|
|
* headers directly. Instead you should either include intrin.h
|
|
* or x86intrin.h. However, when compiling with clang
|
|
* under Windows (i.e., when _MSC_VER is set), these headers
|
|
* only get included *if* the corresponding features are detected
|
|
* from macros:
|
|
* e.g., if __AVX2__ is set... in turn, we normally set these
|
|
* macros by compiling against the corresponding architecture
|
|
* (e.g., arch:AVX2, -mavx2, etc.) which compiles the whole
|
|
* software with these advanced instructions. In simdjson, we
|
|
* want to compile the whole program for a generic target,
|
|
* and only target our specific kernels. As a workaround,
|
|
* we directly include the needed headers. These headers would
|
|
* normally guard against such usage, but we carefully included
|
|
* <x86intrin.h> (or <intrin.h>) before, so the headers
|
|
* are fooled.
|
|
*/
|
|
#include <bmiintrin.h> // for _blsr_u64
|
|
#include <lzcntintrin.h> // for __lzcnt64
|
|
#include <immintrin.h> // for most things (AVX2, AVX512, _popcnt64)
|
|
#include <smmintrin.h>
|
|
#include <tmmintrin.h>
|
|
#include <avxintrin.h>
|
|
#include <avx2intrin.h>
|
|
#include <wmmintrin.h> // for _mm_clmulepi64_si128
|
|
// Important: we need the AVX-512 headers:
|
|
#include <avx512fintrin.h>
|
|
#include <avx512dqintrin.h>
|
|
#include <avx512cdintrin.h>
|
|
#include <avx512bwintrin.h>
|
|
#include <avx512vlintrin.h>
|
|
#include <avx512vbmiintrin.h>
|
|
#include <avx512vbmi2intrin.h>
|
|
// unfortunately, we may not get _blsr_u64, but, thankfully, clang
// has it as a macro.
#ifndef _blsr_u64
// we roll our own: clearing the lowest set bit of n is n & (n - 1).
// The argument is parenthesized so that expression arguments (e.g. a | b)
// expand with the intended precedence. Note that it is evaluated twice, so it
// must not have side effects.
#define _blsr_u64(n) (((n) - 1) & (n))
#endif // _blsr_u64
|
|
#endif // SIMDJSON_CLANG_VISUAL_STUDIO
|
|
|
|
static_assert(sizeof(__m512i) <= simdjson::SIMDJSON_PADDING, "insufficient padding for icelake");
|
|
|
|
#endif // SIMDJSON_ICELAKE_INTRINSICS_H
|
|
/* end file simdjson/icelake/intrinsics.h */
|
|
|
|
#if !SIMDJSON_CAN_ALWAYS_RUN_ICELAKE
|
|
SIMDJSON_TARGET_REGION("avx512f,avx512dq,avx512cd,avx512bw,avx512vbmi,avx512vbmi2,avx512vl,avx2,bmi,pclmul,lzcnt,popcnt")
|
|
#endif
|
|
|
|
/* including simdjson/icelake/bitmanipulation.h: #include "simdjson/icelake/bitmanipulation.h" */
|
|
/* begin file simdjson/icelake/bitmanipulation.h */
|
|
#ifndef SIMDJSON_ICELAKE_BITMANIPULATION_H
|
|
#define SIMDJSON_ICELAKE_BITMANIPULATION_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/icelake/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/icelake/intrinsics.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace icelake {
|
|
namespace {
|
|
|
|
// We sometimes call trailing_zero on inputs that are zero,
|
|
// but the algorithms do not end up using the returned value.
|
|
// Sadly, sanitizers are not smart enough to figure it out.
|
|
SIMDJSON_NO_SANITIZE_UNDEFINED
|
|
// This function can be used safely even if not all bytes have been
|
|
// initialized.
|
|
// See issue https://github.com/simdjson/simdjson/issues/1965
|
|
SIMDJSON_NO_SANITIZE_MEMORY
|
|
simdjson_inline int trailing_zeroes(uint64_t input_num) {
|
|
#if SIMDJSON_REGULAR_VISUAL_STUDIO
|
|
return (int)_tzcnt_u64(input_num);
|
|
#else // SIMDJSON_REGULAR_VISUAL_STUDIO
|
|
////////
|
|
// You might expect the next line to be equivalent to
|
|
// return (int)_tzcnt_u64(input_num);
|
|
// but the generated code differs and might be less efficient?
|
|
////////
|
|
return __builtin_ctzll(input_num);
|
|
#endif // SIMDJSON_REGULAR_VISUAL_STUDIO
|
|
}
|
|
|
|
/* result might be undefined when input_num is zero */
|
|
simdjson_inline uint64_t clear_lowest_bit(uint64_t input_num) {
|
|
return _blsr_u64(input_num);
|
|
}
|
|
|
|
/* result might be undefined when input_num is zero */
|
|
simdjson_inline int leading_zeroes(uint64_t input_num) {
|
|
return int(_lzcnt_u64(input_num));
|
|
}
|
|
|
|
#if SIMDJSON_REGULAR_VISUAL_STUDIO
|
|
simdjson_inline unsigned __int64 count_ones(uint64_t input_num) {
|
|
// note: we do not support legacy 32-bit Windows
|
|
return __popcnt64(input_num);// Visual Studio wants two underscores
|
|
}
|
|
#else
|
|
simdjson_inline long long int count_ones(uint64_t input_num) {
|
|
return _popcnt64(input_num);
|
|
}
|
|
#endif
|
|
|
|
/**
 * Add two unsigned 64-bit integers.
 *
 * @param value1 First operand.
 * @param value2 Second operand.
 * @param result Receives the sum written by the underlying intrinsic.
 * @return The carry/overflow flag reported by the underlying intrinsic, as a bool.
 */
simdjson_inline bool add_overflow(uint64_t value1, uint64_t value2,
                                  uint64_t *result) {
#if SIMDJSON_REGULAR_VISUAL_STUDIO
  unsigned __int64 *const sum = reinterpret_cast<unsigned __int64 *>(result);
  return _addcarry_u64(0, value1, value2, sum);
#else
  unsigned long long *const sum = reinterpret_cast<unsigned long long *>(result);
  return __builtin_uaddll_overflow(value1, value2, sum);
#endif
}
|
|
|
|
} // unnamed namespace
|
|
} // namespace icelake
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_ICELAKE_BITMANIPULATION_H
|
|
/* end file simdjson/icelake/bitmanipulation.h */
|
|
/* including simdjson/icelake/bitmask.h: #include "simdjson/icelake/bitmask.h" */
|
|
/* begin file simdjson/icelake/bitmask.h */
|
|
#ifndef SIMDJSON_ICELAKE_BITMASK_H
|
|
#define SIMDJSON_ICELAKE_BITMASK_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/icelake/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/icelake/intrinsics.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace icelake {
|
|
namespace {
|
|
|
|
//
|
|
// Perform a "cumulative bitwise xor," flipping bits each time a 1 is encountered.
|
|
//
|
|
// For example, prefix_xor(00100100) == 00011100
|
|
//
|
|
simdjson_inline uint64_t prefix_xor(const uint64_t bitmask) {
|
|
// There should be no such thing with a processor supporting avx2
|
|
// but not clmul.
|
|
__m128i all_ones = _mm_set1_epi8('\xFF');
|
|
__m128i result = _mm_clmulepi64_si128(_mm_set_epi64x(0ULL, bitmask), all_ones, 0);
|
|
return _mm_cvtsi128_si64(result);
|
|
}
|
|
|
|
} // unnamed namespace
|
|
} // namespace icelake
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_ICELAKE_BITMASK_H
|
|
/* end file simdjson/icelake/bitmask.h */
|
|
/* including simdjson/icelake/simd.h: #include "simdjson/icelake/simd.h" */
|
|
/* begin file simdjson/icelake/simd.h */
|
|
#ifndef SIMDJSON_ICELAKE_SIMD_H
|
|
#define SIMDJSON_ICELAKE_SIMD_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/icelake/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/icelake/intrinsics.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/icelake/bitmanipulation.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/simdprune_tables.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
#if defined(__GNUC__) && !defined(__clang__)
|
|
#if __GNUC__ == 8
|
|
#define SIMDJSON_GCC8 1
|
|
#endif // __GNUC__ == 8
|
|
#endif // defined(__GNUC__) && !defined(__clang__)
|
|
|
|
#if SIMDJSON_GCC8
/**
 * GCC 8 fails to provide _mm512_set_epi8. We roll our own on top of
 * _mm512_set_epi64: each run of eight consecutive byte arguments is packed into one
 * 64-bit argument, the first byte of the run being the most significant.
 */
inline __m512i _mm512_set_epi8(
    uint8_t a0, uint8_t a1, uint8_t a2, uint8_t a3, uint8_t a4, uint8_t a5, uint8_t a6, uint8_t a7,
    uint8_t a8, uint8_t a9, uint8_t a10, uint8_t a11, uint8_t a12, uint8_t a13, uint8_t a14, uint8_t a15,
    uint8_t a16, uint8_t a17, uint8_t a18, uint8_t a19, uint8_t a20, uint8_t a21, uint8_t a22, uint8_t a23,
    uint8_t a24, uint8_t a25, uint8_t a26, uint8_t a27, uint8_t a28, uint8_t a29, uint8_t a30, uint8_t a31,
    uint8_t a32, uint8_t a33, uint8_t a34, uint8_t a35, uint8_t a36, uint8_t a37, uint8_t a38, uint8_t a39,
    uint8_t a40, uint8_t a41, uint8_t a42, uint8_t a43, uint8_t a44, uint8_t a45, uint8_t a46, uint8_t a47,
    uint8_t a48, uint8_t a49, uint8_t a50, uint8_t a51, uint8_t a52, uint8_t a53, uint8_t a54, uint8_t a55,
    uint8_t a56, uint8_t a57, uint8_t a58, uint8_t a59, uint8_t a60, uint8_t a61, uint8_t a62, uint8_t a63) {
  // Pack eight bytes into a 64-bit word; b0 ends up in the most significant byte.
  const auto pack = [](uint8_t b0, uint8_t b1, uint8_t b2, uint8_t b3,
                       uint8_t b4, uint8_t b5, uint8_t b6, uint8_t b7) -> uint64_t {
    return (uint64_t(b0) << 56) + (uint64_t(b1) << 48) + (uint64_t(b2) << 40) + (uint64_t(b3) << 32)
         + (uint64_t(b4) << 24) + (uint64_t(b5) << 16) + (uint64_t(b6) << 8) + uint64_t(b7);
  };
  return _mm512_set_epi64(pack(a0, a1, a2, a3, a4, a5, a6, a7),
                          pack(a8, a9, a10, a11, a12, a13, a14, a15),
                          pack(a16, a17, a18, a19, a20, a21, a22, a23),
                          pack(a24, a25, a26, a27, a28, a29, a30, a31),
                          pack(a32, a33, a34, a35, a36, a37, a38, a39),
                          pack(a40, a41, a42, a43, a44, a45, a46, a47),
                          pack(a48, a49, a50, a51, a52, a53, a54, a55),
                          pack(a56, a57, a58, a59, a60, a61, a62, a63));
}
#endif // SIMDJSON_GCC8
|
|
|
|
|
|
|
|
namespace simdjson {
|
|
namespace icelake {
|
|
namespace {
|
|
namespace simd {
|
|
|
|
// Forward-declared so they can be used by splat and friends.
|
|
template<typename Child>
|
|
struct base {
|
|
__m512i value;
|
|
|
|
// Zero constructor
|
|
simdjson_inline base() : value{__m512i()} {}
|
|
|
|
// Conversion from SIMD register
|
|
simdjson_inline base(const __m512i _value) : value(_value) {}
|
|
|
|
// Conversion to SIMD register
|
|
simdjson_inline operator const __m512i&() const { return this->value; }
|
|
simdjson_inline operator __m512i&() { return this->value; }
|
|
|
|
// Bit operations
|
|
simdjson_inline Child operator|(const Child other) const { return _mm512_or_si512(*this, other); }
|
|
simdjson_inline Child operator&(const Child other) const { return _mm512_and_si512(*this, other); }
|
|
simdjson_inline Child operator^(const Child other) const { return _mm512_xor_si512(*this, other); }
|
|
simdjson_inline Child bit_andnot(const Child other) const { return _mm512_andnot_si512(other, *this); }
|
|
simdjson_inline Child& operator|=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast | other; return *this_cast; }
|
|
simdjson_inline Child& operator&=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast & other; return *this_cast; }
|
|
simdjson_inline Child& operator^=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast ^ other; return *this_cast; }
|
|
};
|
|
|
|
// Forward-declared so they can be used by splat and friends.
|
|
template<typename T>
|
|
struct simd8;
|
|
|
|
template<typename T, typename Mask=simd8<bool>>
|
|
struct base8: base<simd8<T>> {
|
|
typedef uint32_t bitmask_t;
|
|
typedef uint64_t bitmask2_t;
|
|
|
|
simdjson_inline base8() : base<simd8<T>>() {}
|
|
simdjson_inline base8(const __m512i _value) : base<simd8<T>>(_value) {}
|
|
|
|
friend simdjson_really_inline uint64_t operator==(const simd8<T> lhs, const simd8<T> rhs) {
|
|
return _mm512_cmpeq_epi8_mask(lhs, rhs);
|
|
}
|
|
|
|
static const int SIZE = sizeof(base<T>::value);
|
|
|
|
template<int N=1>
|
|
simdjson_inline simd8<T> prev(const simd8<T> prev_chunk) const {
|
|
// workaround for compilers unable to figure out that 16 - N is a constant (GCC 8)
|
|
constexpr int shift = 16 - N;
|
|
return _mm512_alignr_epi8(*this, _mm512_permutex2var_epi64(prev_chunk, _mm512_set_epi64(13, 12, 11, 10, 9, 8, 7, 6), *this), shift);
|
|
}
|
|
};
|
|
|
|
// SIMD byte mask type (returned by things like eq and gt)
|
|
template<>
|
|
struct simd8<bool>: base8<bool> {
|
|
static simdjson_inline simd8<bool> splat(bool _value) { return _mm512_set1_epi8(uint8_t(-(!!_value))); }
|
|
|
|
simdjson_inline simd8() : base8() {}
|
|
simdjson_inline simd8(const __m512i _value) : base8<bool>(_value) {}
|
|
// Splat constructor
|
|
simdjson_inline simd8(bool _value) : base8<bool>(splat(_value)) {}
|
|
simdjson_inline bool any() const { return !!_mm512_test_epi8_mask (*this, *this); }
|
|
simdjson_inline simd8<bool> operator~() const { return *this ^ true; }
|
|
};
|
|
|
|
template<typename T>
|
|
struct base8_numeric: base8<T> {
|
|
static simdjson_inline simd8<T> splat(T _value) { return _mm512_set1_epi8(_value); }
|
|
static simdjson_inline simd8<T> zero() { return _mm512_setzero_si512(); }
|
|
static simdjson_inline simd8<T> load(const T values[64]) {
|
|
return _mm512_loadu_si512(reinterpret_cast<const __m512i *>(values));
|
|
}
|
|
// Repeat 16 values as many times as necessary (usually for lookup tables)
|
|
static simdjson_inline simd8<T> repeat_16(
|
|
T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7,
|
|
T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15
|
|
) {
|
|
return simd8<T>(
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15,
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15,
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15,
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15
|
|
);
|
|
}
|
|
|
|
simdjson_inline base8_numeric() : base8<T>() {}
|
|
simdjson_inline base8_numeric(const __m512i _value) : base8<T>(_value) {}
|
|
|
|
// Store to array
|
|
simdjson_inline void store(T dst[64]) const { return _mm512_storeu_si512(reinterpret_cast<__m512i *>(dst), *this); }
|
|
|
|
// Addition/subtraction are the same for signed and unsigned
|
|
simdjson_inline simd8<T> operator+(const simd8<T> other) const { return _mm512_add_epi8(*this, other); }
|
|
simdjson_inline simd8<T> operator-(const simd8<T> other) const { return _mm512_sub_epi8(*this, other); }
|
|
simdjson_inline simd8<T>& operator+=(const simd8<T> other) { *this = *this + other; return *static_cast<simd8<T>*>(this); }
|
|
simdjson_inline simd8<T>& operator-=(const simd8<T> other) { *this = *this - other; return *static_cast<simd8<T>*>(this); }
|
|
|
|
// Override to distinguish from bool version
|
|
simdjson_inline simd8<T> operator~() const { return *this ^ 0xFFu; }
|
|
|
|
// Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values)
|
|
template<typename L>
|
|
simdjson_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
|
|
return _mm512_shuffle_epi8(lookup_table, *this);
|
|
}
|
|
|
|
// Copies to 'output" all bytes corresponding to a 0 in the mask (interpreted as a bitset).
|
|
// Passing a 0 value for mask would be equivalent to writing out every byte to output.
|
|
// Only the first 32 - count_ones(mask) bytes of the result are significant but 32 bytes
|
|
// get written.
|
|
// Design consideration: it seems like a function with the
|
|
// signature simd8<L> compress(uint32_t mask) would be
|
|
// sensible, but the AVX ISA makes this kind of approach difficult.
|
|
template<typename L>
|
|
simdjson_inline void compress(uint64_t mask, L * output) const {
|
|
_mm512_mask_compressstoreu_epi8 (output,~mask,*this);
|
|
}
|
|
|
|
template<typename L>
|
|
simdjson_inline simd8<L> lookup_16(
|
|
L replace0, L replace1, L replace2, L replace3,
|
|
L replace4, L replace5, L replace6, L replace7,
|
|
L replace8, L replace9, L replace10, L replace11,
|
|
L replace12, L replace13, L replace14, L replace15) const {
|
|
return lookup_16(simd8<L>::repeat_16(
|
|
replace0, replace1, replace2, replace3,
|
|
replace4, replace5, replace6, replace7,
|
|
replace8, replace9, replace10, replace11,
|
|
replace12, replace13, replace14, replace15
|
|
));
|
|
}
|
|
};
|
|
|
|
// Signed bytes
|
|
template<>
|
|
struct simd8<int8_t> : base8_numeric<int8_t> {
|
|
simdjson_inline simd8() : base8_numeric<int8_t>() {}
|
|
simdjson_inline simd8(const __m512i _value) : base8_numeric<int8_t>(_value) {}
|
|
// Splat constructor
|
|
simdjson_inline simd8(int8_t _value) : simd8(splat(_value)) {}
|
|
// Array constructor
|
|
simdjson_inline simd8(const int8_t values[64]) : simd8(load(values)) {}
|
|
// Member-by-member initialization
|
|
simdjson_inline simd8(
|
|
int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
|
|
int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15,
|
|
int8_t v16, int8_t v17, int8_t v18, int8_t v19, int8_t v20, int8_t v21, int8_t v22, int8_t v23,
|
|
int8_t v24, int8_t v25, int8_t v26, int8_t v27, int8_t v28, int8_t v29, int8_t v30, int8_t v31,
|
|
int8_t v32, int8_t v33, int8_t v34, int8_t v35, int8_t v36, int8_t v37, int8_t v38, int8_t v39,
|
|
int8_t v40, int8_t v41, int8_t v42, int8_t v43, int8_t v44, int8_t v45, int8_t v46, int8_t v47,
|
|
int8_t v48, int8_t v49, int8_t v50, int8_t v51, int8_t v52, int8_t v53, int8_t v54, int8_t v55,
|
|
int8_t v56, int8_t v57, int8_t v58, int8_t v59, int8_t v60, int8_t v61, int8_t v62, int8_t v63
|
|
) : simd8(_mm512_set_epi8(
|
|
v63, v62, v61, v60, v59, v58, v57, v56,
|
|
v55, v54, v53, v52, v51, v50, v49, v48,
|
|
v47, v46, v45, v44, v43, v42, v41, v40,
|
|
v39, v38, v37, v36, v35, v34, v33, v32,
|
|
v31, v30, v29, v28, v27, v26, v25, v24,
|
|
v23, v22, v21, v20, v19, v18, v17, v16,
|
|
v15, v14, v13, v12, v11, v10, v9, v8,
|
|
v7, v6, v5, v4, v3, v2, v1, v0
|
|
)) {}
|
|
|
|
// Repeat 16 values as many times as necessary (usually for lookup tables)
|
|
simdjson_inline static simd8<int8_t> repeat_16(
|
|
int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
|
|
int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
|
|
) {
|
|
return simd8<int8_t>(
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15,
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15,
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15,
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15
|
|
);
|
|
}
|
|
|
|
// Order-sensitive comparisons
|
|
simdjson_inline simd8<int8_t> max_val(const simd8<int8_t> other) const { return _mm512_max_epi8(*this, other); }
|
|
simdjson_inline simd8<int8_t> min_val(const simd8<int8_t> other) const { return _mm512_min_epi8(*this, other); }
|
|
|
|
simdjson_inline simd8<bool> operator>(const simd8<int8_t> other) const { return _mm512_maskz_abs_epi8(_mm512_cmpgt_epi8_mask(*this, other),_mm512_set1_epi8(uint8_t(0x80))); }
|
|
simdjson_inline simd8<bool> operator<(const simd8<int8_t> other) const { return _mm512_maskz_abs_epi8(_mm512_cmpgt_epi8_mask(other, *this),_mm512_set1_epi8(uint8_t(0x80))); }
|
|
};
|
|
|
|
// Unsigned bytes
|
|
template<>
|
|
struct simd8<uint8_t>: base8_numeric<uint8_t> {
|
|
simdjson_inline simd8() : base8_numeric<uint8_t>() {}
|
|
simdjson_inline simd8(const __m512i _value) : base8_numeric<uint8_t>(_value) {}
|
|
// Splat constructor
|
|
simdjson_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
|
|
// Array constructor
|
|
simdjson_inline simd8(const uint8_t values[64]) : simd8(load(values)) {}
|
|
// Member-by-member initialization
|
|
simdjson_inline simd8(
|
|
uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
|
|
uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15,
|
|
uint8_t v16, uint8_t v17, uint8_t v18, uint8_t v19, uint8_t v20, uint8_t v21, uint8_t v22, uint8_t v23,
|
|
uint8_t v24, uint8_t v25, uint8_t v26, uint8_t v27, uint8_t v28, uint8_t v29, uint8_t v30, uint8_t v31,
|
|
uint8_t v32, uint8_t v33, uint8_t v34, uint8_t v35, uint8_t v36, uint8_t v37, uint8_t v38, uint8_t v39,
|
|
uint8_t v40, uint8_t v41, uint8_t v42, uint8_t v43, uint8_t v44, uint8_t v45, uint8_t v46, uint8_t v47,
|
|
uint8_t v48, uint8_t v49, uint8_t v50, uint8_t v51, uint8_t v52, uint8_t v53, uint8_t v54, uint8_t v55,
|
|
uint8_t v56, uint8_t v57, uint8_t v58, uint8_t v59, uint8_t v60, uint8_t v61, uint8_t v62, uint8_t v63
|
|
) : simd8(_mm512_set_epi8(
|
|
v63, v62, v61, v60, v59, v58, v57, v56,
|
|
v55, v54, v53, v52, v51, v50, v49, v48,
|
|
v47, v46, v45, v44, v43, v42, v41, v40,
|
|
v39, v38, v37, v36, v35, v34, v33, v32,
|
|
v31, v30, v29, v28, v27, v26, v25, v24,
|
|
v23, v22, v21, v20, v19, v18, v17, v16,
|
|
v15, v14, v13, v12, v11, v10, v9, v8,
|
|
v7, v6, v5, v4, v3, v2, v1, v0
|
|
)) {}
|
|
|
|
// Repeat 16 values as many times as necessary (usually for lookup tables)
|
|
simdjson_inline static simd8<uint8_t> repeat_16(
|
|
uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
|
|
uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
|
|
) {
|
|
return simd8<uint8_t>(
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15,
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15,
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15,
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15
|
|
);
|
|
}
|
|
|
|
// Saturated math
|
|
simdjson_inline simd8<uint8_t> saturating_add(const simd8<uint8_t> other) const { return _mm512_adds_epu8(*this, other); }
|
|
simdjson_inline simd8<uint8_t> saturating_sub(const simd8<uint8_t> other) const { return _mm512_subs_epu8(*this, other); }
|
|
|
|
// Order-specific operations
|
|
simdjson_inline simd8<uint8_t> max_val(const simd8<uint8_t> other) const { return _mm512_max_epu8(*this, other); }
|
|
simdjson_inline simd8<uint8_t> min_val(const simd8<uint8_t> other) const { return _mm512_min_epu8(other, *this); }
|
|
// Same as >, but only guarantees true is nonzero (< guarantees true = -1)
|
|
simdjson_inline simd8<uint8_t> gt_bits(const simd8<uint8_t> other) const { return this->saturating_sub(other); }
|
|
// Same as <, but only guarantees true is nonzero (< guarantees true = -1)
|
|
simdjson_inline simd8<uint8_t> lt_bits(const simd8<uint8_t> other) const { return other.saturating_sub(*this); }
|
|
simdjson_inline uint64_t operator<=(const simd8<uint8_t> other) const { return other.max_val(*this) == other; }
|
|
simdjson_inline uint64_t operator>=(const simd8<uint8_t> other) const { return other.min_val(*this) == other; }
|
|
simdjson_inline simd8<bool> operator>(const simd8<uint8_t> other) const { return this->gt_bits(other).any_bits_set(); }
|
|
simdjson_inline simd8<bool> operator<(const simd8<uint8_t> other) const { return this->lt_bits(other).any_bits_set(); }
|
|
|
|
// Bit-specific operations
|
|
simdjson_inline simd8<bool> bits_not_set() const { return _mm512_mask_blend_epi8(*this == uint8_t(0), _mm512_set1_epi8(0), _mm512_set1_epi8(-1)); }
|
|
simdjson_inline simd8<bool> bits_not_set(simd8<uint8_t> bits) const { return (*this & bits).bits_not_set(); }
|
|
simdjson_inline simd8<bool> any_bits_set() const { return ~this->bits_not_set(); }
|
|
simdjson_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const { return ~this->bits_not_set(bits); }
|
|
|
|
simdjson_inline bool is_ascii() const { return _mm512_movepi8_mask(*this) == 0; }
|
|
simdjson_inline bool bits_not_set_anywhere() const {
|
|
return !_mm512_test_epi8_mask(*this, *this);
|
|
}
|
|
simdjson_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); }
|
|
simdjson_inline bool bits_not_set_anywhere(simd8<uint8_t> bits) const { return !_mm512_test_epi8_mask(*this, bits); }
|
|
simdjson_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const { return !bits_not_set_anywhere(bits); }
|
|
template<int N>
|
|
simdjson_inline simd8<uint8_t> shr() const { return simd8<uint8_t>(_mm512_srli_epi16(*this, N)) & uint8_t(0xFFu >> N); }
|
|
template<int N>
|
|
simdjson_inline simd8<uint8_t> shl() const { return simd8<uint8_t>(_mm512_slli_epi16(*this, N)) & uint8_t(0xFFu << N); }
|
|
// Get one of the bits and make a bitmask out of it.
|
|
// e.g. value.get_bit<7>() gets the high bit
|
|
template<int N>
|
|
simdjson_inline uint64_t get_bit() const { return _mm512_movepi8_mask(_mm512_slli_epi16(*this, 7-N)); }
|
|
};
|
|
|
|
template<typename T>
|
|
struct simd8x64 {
|
|
static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
|
|
static_assert(NUM_CHUNKS == 1, "Icelake kernel should use one register per 64-byte block.");
|
|
const simd8<T> chunks[NUM_CHUNKS];
|
|
|
|
simd8x64(const simd8x64<T>& o) = delete; // no copy allowed
|
|
simd8x64<T>& operator=(const simd8<T>& other) = delete; // no assignment allowed
|
|
simd8x64() = delete; // no default constructor allowed
|
|
|
|
simdjson_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1) : chunks{chunk0, chunk1} {}
|
|
simdjson_inline simd8x64(const simd8<T> chunk0) : chunks{chunk0} {}
|
|
simdjson_inline simd8x64(const T ptr[64]) : chunks{simd8<T>::load(ptr)} {}
|
|
|
|
simdjson_inline uint64_t compress(uint64_t mask, T * output) const {
|
|
this->chunks[0].compress(mask, output);
|
|
return 64 - count_ones(mask);
|
|
}
|
|
|
|
simdjson_inline void store(T ptr[64]) const {
|
|
this->chunks[0].store(ptr+sizeof(simd8<T>)*0);
|
|
}
|
|
|
|
simdjson_inline simd8<T> reduce_or() const {
|
|
return this->chunks[0];
|
|
}
|
|
|
|
simdjson_inline simd8x64<T> bit_or(const T m) const {
|
|
const simd8<T> mask = simd8<T>::splat(m);
|
|
return simd8x64<T>(
|
|
this->chunks[0] | mask
|
|
);
|
|
}
|
|
|
|
simdjson_inline uint64_t eq(const T m) const {
|
|
const simd8<T> mask = simd8<T>::splat(m);
|
|
return this->chunks[0] == mask;
|
|
}
|
|
|
|
simdjson_inline uint64_t eq(const simd8x64<uint8_t> &other) const {
|
|
return this->chunks[0] == other.chunks[0];
|
|
}
|
|
|
|
simdjson_inline uint64_t lteq(const T m) const {
|
|
const simd8<T> mask = simd8<T>::splat(m);
|
|
return this->chunks[0] <= mask;
|
|
}
|
|
}; // struct simd8x64<T>
|
|
|
|
} // namespace simd
|
|
|
|
} // unnamed namespace
|
|
} // namespace icelake
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_ICELAKE_SIMD_H
|
|
/* end file simdjson/icelake/simd.h */
|
|
/* including simdjson/icelake/stringparsing_defs.h: #include "simdjson/icelake/stringparsing_defs.h" */
|
|
/* begin file simdjson/icelake/stringparsing_defs.h */
|
|
#ifndef SIMDJSON_ICELAKE_STRINGPARSING_DEFS_H
|
|
#define SIMDJSON_ICELAKE_STRINGPARSING_DEFS_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/icelake/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/icelake/simd.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/icelake/bitmanipulation.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace icelake {
|
|
namespace {
|
|
|
|
using namespace simd;
|
|
|
|
// Holds backslashes and quotes locations.
|
|
struct backslash_and_quote {
|
|
public:
|
|
static constexpr uint32_t BYTES_PROCESSED = 64;
|
|
simdjson_inline static backslash_and_quote copy_and_find(const uint8_t *src, uint8_t *dst);
|
|
|
|
simdjson_inline bool has_quote_first() { return ((bs_bits - 1) & quote_bits) != 0; }
|
|
simdjson_inline bool has_backslash() { return ((quote_bits - 1) & bs_bits) != 0; }
|
|
simdjson_inline int quote_index() { return trailing_zeroes(quote_bits); }
|
|
simdjson_inline int backslash_index() { return trailing_zeroes(bs_bits); }
|
|
|
|
uint64_t bs_bits;
|
|
uint64_t quote_bits;
|
|
}; // struct backslash_and_quote
|
|
|
|
simdjson_inline backslash_and_quote backslash_and_quote::copy_and_find(const uint8_t *src, uint8_t *dst) {
  // A full BYTES_PROCESSED-byte vector is loaded from src, so up to BYTES_PROCESSED - 1
  // bytes beyond the logical end of the input may be read; SIMDJSON_PADDING must cover that.
  static_assert(SIMDJSON_PADDING >= (BYTES_PROCESSED - 1), "backslash and quote finder must process fewer than SIMDJSON_PADDING bytes");
  simd8<uint8_t> v(src);
  // Copy everything to dst up front; the caller can overwrite unwanted bytes afterwards.
  v.store(dst);
  const uint64_t backslashes = static_cast<uint64_t>(v == '\\');
  const uint64_t quotes = static_cast<uint64_t>(v == '"');
  return {
    backslashes, // bs_bits
    quotes,      // quote_bits
  };
}
|
|
|
|
} // unnamed namespace
|
|
} // namespace icelake
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_ICELAKE_STRINGPARSING_DEFS_H
|
|
/* end file simdjson/icelake/stringparsing_defs.h */
|
|
/* including simdjson/icelake/numberparsing_defs.h: #include "simdjson/icelake/numberparsing_defs.h" */
|
|
/* begin file simdjson/icelake/numberparsing_defs.h */
|
|
#ifndef SIMDJSON_ICELAKE_NUMBERPARSING_DEFS_H
|
|
#define SIMDJSON_ICELAKE_NUMBERPARSING_DEFS_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/icelake/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/icelake/intrinsics.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/numberparsing_tables.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace icelake {
|
|
namespace numberparsing {
|
|
|
|
// Converts the eight ASCII digits at chars[0..7] into a number with SSE multiply-adds.
// Note that a full 16 bytes are loaded (unaligned) from chars, so twice as much work
// as needed is done; only the lane holding the first eight digits is returned.
static simdjson_inline uint32_t parse_eight_digits_unrolled(const uint8_t *chars) {
  const __m128i zero_char = _mm_set1_epi8('0');
  const __m128i weights_10 =
      _mm_setr_epi8(10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1);
  const __m128i weights_100 = _mm_setr_epi16(100, 1, 100, 1, 100, 1, 100, 1);
  const __m128i weights_10000 =
      _mm_setr_epi16(10000, 1, 10000, 1, 10000, 1, 10000, 1);

  // ASCII -> digit values
  const __m128i raw = _mm_loadu_si128(reinterpret_cast<const __m128i *>(chars));
  const __m128i digits = _mm_sub_epi8(raw, zero_char);
  // Combine digits pairwise: 1 digit -> 2 digits -> 4 digits -> 8 digits
  const __m128i pairs = _mm_maddubs_epi16(digits, weights_10);
  const __m128i quads = _mm_madd_epi16(pairs, weights_100);
  const __m128i packed = _mm_packus_epi32(quads, quads);
  const __m128i octets = _mm_madd_epi16(packed, weights_10000);
  // The low 32 bits hold the value of the first 8 digits; the rest is dropped.
  return _mm_cvtsi128_si32(octets);
}
|
|
|
|
/** @private */
|
|
simdjson_inline internal::value128 full_multiplication(uint64_t value1, uint64_t value2) {
|
|
internal::value128 answer;
|
|
#if SIMDJSON_REGULAR_VISUAL_STUDIO || SIMDJSON_IS_32BITS
|
|
#if SIMDJSON_IS_ARM64
|
|
// ARM64 has native support for 64-bit multiplications, no need to emultate
|
|
answer.high = __umulh(value1, value2);
|
|
answer.low = value1 * value2;
|
|
#else
|
|
answer.low = _umul128(value1, value2, &answer.high); // _umul128 not available on ARM64
|
|
#endif // SIMDJSON_IS_ARM64
|
|
#else // SIMDJSON_REGULAR_VISUAL_STUDIO || SIMDJSON_IS_32BITS
|
|
__uint128_t r = (static_cast<__uint128_t>(value1)) * value2;
|
|
answer.low = uint64_t(r);
|
|
answer.high = uint64_t(r >> 64);
|
|
#endif
|
|
return answer;
|
|
}
|
|
|
|
} // namespace numberparsing
|
|
} // namespace icelake
|
|
} // namespace simdjson
|
|
|
|
#define SIMDJSON_SWAR_NUMBER_PARSING 1
|
|
|
|
#endif // SIMDJSON_ICELAKE_NUMBERPARSING_DEFS_H
|
|
/* end file simdjson/icelake/numberparsing_defs.h */
|
|
/* end file simdjson/icelake/begin.h */
|
|
/* including generic/amalgamated.h for icelake: #include <generic/amalgamated.h> */
|
|
/* begin file generic/amalgamated.h for icelake */
|
|
#if defined(SIMDJSON_CONDITIONAL_INCLUDE) && !defined(SIMDJSON_SRC_GENERIC_DEPENDENCIES_H)
|
|
#error generic/dependencies.h must be included before generic/amalgamated.h!
|
|
#endif
|
|
|
|
/* including generic/base.h for icelake: #include <generic/base.h> */
|
|
/* begin file generic/base.h for icelake */
|
|
#ifndef SIMDJSON_SRC_GENERIC_BASE_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_BASE_H */
|
|
/* amalgamation skipped (editor-only): #include <base.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/generic/base.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace icelake {
|
|
namespace {
|
|
|
|
// Forward declaration only; the struct itself is defined further down in this file.
struct json_character_block;
|
|
|
|
} // unnamed namespace
|
|
} // namespace icelake
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_BASE_H
|
|
/* end file generic/base.h for icelake */
|
|
/* including generic/dom_parser_implementation.h for icelake: #include <generic/dom_parser_implementation.h> */
|
|
/* begin file generic/dom_parser_implementation.h for icelake */
|
|
#ifndef SIMDJSON_SRC_GENERIC_DOM_PARSER_IMPLEMENTATION_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_DOM_PARSER_IMPLEMENTATION_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/generic/dom_parser_implementation.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
// Interface a dom parser implementation must fulfill
|
|
namespace simdjson {
|
|
namespace icelake {
|
|
namespace {
|
|
|
|
// Declaration only; each kernel is expected to supply the definition.
// NOTE(review): going by the name, this flags bytes that must be the 2nd/3rd continuation byte
// of a UTF-8 sequence given the input shifted by 2 and 3 bytes -- confirm against the definition.
simdjson_inline simd8<uint8_t> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3);
|
|
// Declaration only; each kernel is expected to supply the definition.
// NOTE(review): presumably true when every byte of the 64-byte block is ASCII -- confirm.
simdjson_inline bool is_ascii(const simd8x64<uint8_t>& input);
|
|
|
|
} // unnamed namespace
|
|
} // namespace icelake
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_DOM_PARSER_IMPLEMENTATION_H
|
|
/* end file generic/dom_parser_implementation.h for icelake */
|
|
/* including generic/json_character_block.h for icelake: #include <generic/json_character_block.h> */
|
|
/* begin file generic/json_character_block.h for icelake */
|
|
#ifndef SIMDJSON_SRC_GENERIC_JSON_CHARACTER_BLOCK_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_JSON_CHARACTER_BLOCK_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/base.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace icelake {
|
|
namespace {
|
|
|
|
struct json_character_block {
|
|
static simdjson_inline json_character_block classify(const simd::simd8x64<uint8_t>& in);
|
|
|
|
simdjson_inline uint64_t whitespace() const noexcept { return _whitespace; }
|
|
simdjson_inline uint64_t op() const noexcept { return _op; }
|
|
simdjson_inline uint64_t scalar() const noexcept { return ~(op() | whitespace()); }
|
|
|
|
uint64_t _whitespace;
|
|
uint64_t _op;
|
|
};
|
|
|
|
} // unnamed namespace
|
|
} // namespace icelake
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_JSON_CHARACTER_BLOCK_H
|
|
/* end file generic/json_character_block.h for icelake */
|
|
/* end file generic/amalgamated.h for icelake */
|
|
/* including generic/stage1/amalgamated.h for icelake: #include <generic/stage1/amalgamated.h> */
|
|
/* begin file generic/stage1/amalgamated.h for icelake */
|
|
// Stuff other things depend on
|
|
/* including generic/stage1/base.h for icelake: #include <generic/stage1/base.h> */
|
|
/* begin file generic/stage1/base.h for icelake */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_BASE_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_BASE_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/base.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace icelake {
|
|
namespace {
|
|
namespace stage1 {
|
|
|
|
// Forward declarations of the stage 1 types, so they can refer to one another.
class bit_indexer;
template <size_t STEP_SIZE> struct buf_block_reader;
struct json_block;
class json_minifier;
class json_scanner;
struct json_string_block;
class json_string_scanner;
class json_structural_indexer;
|
|
|
|
} // namespace stage1
|
|
|
|
namespace utf8_validation {
|
|
// Forward declaration only; the definition is not part of this header section.
struct utf8_checker;
|
|
} // namespace utf8_validation
|
|
|
|
// Make utf8_checker usable without the utf8_validation:: qualifier in the enclosing namespace.
using utf8_validation::utf8_checker;
|
|
|
|
} // unnamed namespace
|
|
} // namespace icelake
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE1_BASE_H
|
|
/* end file generic/stage1/base.h for icelake */
|
|
/* including generic/stage1/buf_block_reader.h for icelake: #include <generic/stage1/buf_block_reader.h> */
|
|
/* begin file generic/stage1/buf_block_reader.h for icelake */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_BUF_BLOCK_READER_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_BUF_BLOCK_READER_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/base.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
#include <cstring>
|
|
|
|
namespace simdjson {
|
|
namespace icelake {
|
|
namespace {
|
|
namespace stage1 {
|
|
|
|
// Walks through a buffer in block-sized increments, loading the last part with spaces
|
|
template<size_t STEP_SIZE>
|
|
struct buf_block_reader {
|
|
public:
|
|
simdjson_inline buf_block_reader(const uint8_t *_buf, size_t _len);
|
|
simdjson_inline size_t block_index();
|
|
simdjson_inline bool has_full_block() const;
|
|
simdjson_inline const uint8_t *full_block() const;
|
|
/**
|
|
* Get the last block, padded with spaces.
|
|
*
|
|
* There will always be a last block, with at least 1 byte, unless len == 0 (in which case this
|
|
* function fills the buffer with spaces and returns 0. In particular, if len == STEP_SIZE there
|
|
* will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding.
|
|
*
|
|
* @return the number of effective characters in the last block.
|
|
*/
|
|
simdjson_inline size_t get_remainder(uint8_t *dst) const;
|
|
simdjson_inline void advance();
|
|
private:
|
|
const uint8_t *buf;
|
|
const size_t len;
|
|
const size_t lenminusstep;
|
|
size_t idx;
|
|
};
|
|
|
|
// Routines to print masks and text for debugging bitmask operations
|
|
simdjson_unused static char * format_input_text_64(const uint8_t *text) {
|
|
static char buf[sizeof(simd8x64<uint8_t>) + 1];
|
|
for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
|
|
buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]);
|
|
}
|
|
buf[sizeof(simd8x64<uint8_t>)] = '\0';
|
|
return buf;
|
|
}
|
|
|
|
// Routines to print masks and text for debugging bitmask operations
|
|
simdjson_unused static char * format_input_text(const simd8x64<uint8_t>& in) {
|
|
static char buf[sizeof(simd8x64<uint8_t>) + 1];
|
|
in.store(reinterpret_cast<uint8_t*>(buf));
|
|
for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
|
|
if (buf[i] < ' ') { buf[i] = '_'; }
|
|
}
|
|
buf[sizeof(simd8x64<uint8_t>)] = '\0';
|
|
return buf;
|
|
}
|
|
|
|
// Debug helper: like format_input_text(in), but only the positions whose bit is set in
// mask are shown; all other positions are blanked with ' '. Among the shown positions,
// chars comparing <= ' ' (including the space itself) are rendered as '_' so they stay
// distinguishable from blanked positions.
// The returned pointer refers to a function-local static buffer that the next call overwrites.
simdjson_unused static char * format_input_text(const simd8x64<uint8_t>& in, uint64_t mask) {
  static char buf[sizeof(simd8x64<uint8_t>) + 1];
  in.store(reinterpret_cast<uint8_t*>(buf));
  for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
    if (buf[i] <= ' ') { buf[i] = '_'; }
    // Shift a 64-bit one: i goes up to 63, and shifting a size_t would be undefined
    // wherever size_t is only 32 bits wide.
    if (!(mask & (uint64_t(1) << i))) { buf[i] = ' '; }
  }
  buf[sizeof(simd8x64<uint8_t>)] = '\0';
  return buf;
}
|
|
|
|
// Debug helper: renders a 64-bit mask as 64 characters, 'X' for a set bit and ' ' for a
// clear one, with bit 0 first. The returned pointer refers to a function-local static
// buffer that the next call overwrites.
simdjson_unused static char * format_mask(uint64_t mask) {
  static char buf[sizeof(simd8x64<uint8_t>) + 1];
  for (size_t i=0; i<64; i++) {
    // Shift a 64-bit one: i goes up to 63, and shifting a size_t would be undefined
    // wherever size_t is only 32 bits wide.
    buf[i] = (mask & (uint64_t(1) << i)) ? 'X' : ' ';
  }
  buf[64] = '\0';
  return buf;
}
|
|
|
|
template<size_t STEP_SIZE>
|
|
simdjson_inline buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {}
|
|
|
|
template<size_t STEP_SIZE>
|
|
simdjson_inline size_t buf_block_reader<STEP_SIZE>::block_index() { return idx; }
|
|
|
|
template<size_t STEP_SIZE>
|
|
simdjson_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const {
|
|
return idx < lenminusstep;
|
|
}
|
|
|
|
template<size_t STEP_SIZE>
|
|
simdjson_inline const uint8_t *buf_block_reader<STEP_SIZE>::full_block() const {
|
|
return &buf[idx];
|
|
}
|
|
|
|
template<size_t STEP_SIZE>
|
|
simdjson_inline size_t buf_block_reader<STEP_SIZE>::get_remainder(uint8_t *dst) const {
|
|
if(len == idx) { return 0; } // memcpy(dst, null, 0) will trigger an error with some sanitizers
|
|
std::memset(dst, 0x20, STEP_SIZE); // std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once.
|
|
std::memcpy(dst, buf + idx, len - idx);
|
|
return len - idx;
|
|
}
|
|
|
|
template<size_t STEP_SIZE>
|
|
simdjson_inline void buf_block_reader<STEP_SIZE>::advance() {
|
|
idx += STEP_SIZE;
|
|
}
|
|
|
|
} // namespace stage1
|
|
} // unnamed namespace
|
|
} // namespace icelake
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE1_BUF_BLOCK_READER_H
|
|
/* end file generic/stage1/buf_block_reader.h for icelake */
|
|
/* including generic/stage1/json_escape_scanner.h for icelake: #include <generic/stage1/json_escape_scanner.h> */
|
|
/* begin file generic/stage1/json_escape_scanner.h for icelake */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_JSON_ESCAPE_SCANNER_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_JSON_ESCAPE_SCANNER_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/buf_block_reader.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace icelake {
|
|
namespace {
|
|
namespace stage1 {
|
|
|
|
/**
|
|
* Scans for escape characters in JSON, taking care with multiple backslashes (\\n vs. \n).
|
|
*/
|
|
struct json_escape_scanner {
|
|
/** The actual escape characters (the backslashes themselves). */
|
|
uint64_t next_is_escaped = 0ULL;
|
|
|
|
struct escaped_and_escape {
|
|
/**
|
|
* Mask of escaped characters.
|
|
*
|
|
* ```
|
|
* \n \\n \\\n \\\\n \
|
|
* 0100100010100101000
|
|
* n \ \ n \ \
|
|
* ```
|
|
*/
|
|
uint64_t escaped;
|
|
/**
|
|
* Mask of escape characters.
|
|
*
|
|
* ```
|
|
* \n \\n \\\n \\\\n \
|
|
* 1001000101001010001
|
|
* \ \ \ \ \ \ \
|
|
* ```
|
|
*/
|
|
uint64_t escape;
|
|
};
|
|
|
|
/**
|
|
* Get a mask of both escape and escaped characters (the characters following a backslash).
|
|
*
|
|
* @param potential_escape A mask of the character that can escape others (but could be
|
|
* escaped itself). e.g. block.eq('\\')
|
|
*/
|
|
simdjson_really_inline escaped_and_escape next(uint64_t backslash) noexcept {
|
|
|
|
#if !SIMDJSON_SKIP_BACKSLASH_SHORT_CIRCUIT
|
|
if (!backslash) { return {next_escaped_without_backslashes(), 0}; }
|
|
#endif
|
|
|
|
// | | Mask (shows characters instead of 1's) | Depth | Instructions |
|
|
// |--------------------------------|----------------------------------------|-------|---------------------|
|
|
// | string | `\\n_\\\n___\\\n___\\\\___\\\\__\\\` | | |
|
|
// | | ` even odd even odd odd` | | |
|
|
// | potential_escape | ` \ \\\ \\\ \\\\ \\\\ \\\` | 1 | 1 (backslash & ~first_is_escaped)
|
|
// | escape_and_terminal_code | ` \n \ \n \ \n \ \ \ \ \ \` | 5 | 5 (next_escape_and_terminal_code())
|
|
// | escaped | `\ \ n \ n \ \ \ \ \ ` X | 6 | 7 (escape_and_terminal_code ^ (potential_escape | first_is_escaped))
|
|
// | escape | ` \ \ \ \ \ \ \ \ \ \` | 6 | 8 (escape_and_terminal_code & backslash)
|
|
// | first_is_escaped | `\ ` | 7 (*) | 9 (escape >> 63) ()
|
|
// (*) this is not needed until the next iteration
|
|
uint64_t escape_and_terminal_code = next_escape_and_terminal_code(backslash & ~this->next_is_escaped);
|
|
uint64_t escaped = escape_and_terminal_code ^ (backslash | this->next_is_escaped);
|
|
uint64_t escape = escape_and_terminal_code & backslash;
|
|
this->next_is_escaped = escape >> 63;
|
|
return {escaped, escape};
|
|
}
|
|
|
|
private:
|
|
static constexpr const uint64_t ODD_BITS = 0xAAAAAAAAAAAAAAAAULL;
|
|
|
|
simdjson_really_inline uint64_t next_escaped_without_backslashes() noexcept {
|
|
uint64_t escaped = this->next_is_escaped;
|
|
this->next_is_escaped = 0;
|
|
return escaped;
|
|
}
|
|
|
|
/**
|
|
* Returns a mask of the next escape characters (masking out escaped backslashes), along with
|
|
* any non-backslash escape codes.
|
|
*
|
|
* \n \\n \\\n \\\\n returns:
|
|
* \n \ \ \n \ \
|
|
* 11 100 1011 10100
|
|
*
|
|
* You are expected to mask out the first bit yourself if the previous block had a trailing
|
|
* escape.
|
|
*
|
|
* & the result with potential_escape to get just the escape characters.
|
|
* ^ the result with (potential_escape | first_is_escaped) to get escaped characters.
|
|
*/
|
|
static simdjson_really_inline uint64_t next_escape_and_terminal_code(uint64_t potential_escape) noexcept {
|
|
// If we were to just shift and mask out any odd bits, we'd actually get a *half* right answer:
|
|
// any even-aligned backslash runs would be correct! Odd-aligned backslash runs would be
|
|
// inverted (\\\ would be 010 instead of 101).
|
|
//
|
|
// ```
|
|
// string: | ____\\\\_\\\\_____ |
|
|
// maybe_escaped | ODD | \ \ \ \ |
|
|
// even-aligned ^^^ ^^^^ odd-aligned
|
|
// ```
|
|
//
|
|
// Taking that into account, our basic strategy is:
|
|
//
|
|
// 1. Use subtraction to produce a mask with 1's for even-aligned runs and 0's for
|
|
// odd-aligned runs.
|
|
// 2. XOR all odd bits, which masks out the odd bits in even-aligned runs, and brings IN the
|
|
// odd bits in odd-aligned runs.
|
|
// 3. & with backslash to clean up any stray bits.
|
|
// runs are set to 0, and then XORing with "odd":
|
|
//
|
|
// | | Mask (shows characters instead of 1's) | Instructions |
|
|
// |--------------------------------|----------------------------------------|---------------------|
|
|
// | string | `\\n_\\\n___\\\n___\\\\___\\\\__\\\` |
|
|
// | | ` even odd even odd odd` |
|
|
// | maybe_escaped | ` n \\n \\n \\\_ \\\_ \\` X | 1 (potential_escape << 1)
|
|
// | maybe_escaped_and_odd | ` \n_ \\n _ \\\n_ _ \\\__ _\\\_ \\\` | 1 (maybe_escaped | odd)
|
|
// | even_series_codes_and_odd | ` n_\\\ _ n_ _\\\\ _ _ ` | 1 (maybe_escaped_and_odd - potential_escape)
|
|
// | escape_and_terminal_code | ` \n \ \n \ \n \ \ \ \ \ \` | 1 (^ odd)
|
|
//
|
|
|
|
// Escaped characters are characters following an escape.
|
|
uint64_t maybe_escaped = potential_escape << 1;
|
|
|
|
// To distinguish odd from even escape sequences, therefore, we turn on any *starting*
|
|
// escapes that are on an odd byte. (We actually bring in all odd bits, for speed.)
|
|
// - Odd runs of backslashes are 0000, and the code at the end ("n" in \n or \\n) is 1.
|
|
// - Odd runs of backslashes are 1111, and the code at the end ("n" in \n or \\n) is 0.
|
|
// - All other odd bytes are 1, and even bytes are 0.
|
|
uint64_t maybe_escaped_and_odd_bits = maybe_escaped | ODD_BITS;
|
|
uint64_t even_series_codes_and_odd_bits = maybe_escaped_and_odd_bits - potential_escape;
|
|
|
|
// Now we flip all odd bytes back with xor. This:
|
|
// - Makes odd runs of backslashes go from 0000 to 1010
|
|
// - Makes even runs of backslashes go from 1111 to 1010
|
|
// - Sets actually-escaped codes to 1 (the n in \n and \\n: \n = 11, \\n = 100)
|
|
// - Resets all other bytes to 0
|
|
return even_series_codes_and_odd_bits ^ ODD_BITS;
|
|
}
|
|
};
|
|
|
|
} // namespace stage1
|
|
} // unnamed namespace
|
|
} // namespace icelake
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE1_JSON_ESCAPE_SCANNER_H
|
|
/* end file generic/stage1/json_escape_scanner.h for icelake */
|
|
/* including generic/stage1/json_string_scanner.h for icelake: #include <generic/stage1/json_string_scanner.h> */
|
|
/* begin file generic/stage1/json_string_scanner.h for icelake */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_JSON_STRING_SCANNER_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_JSON_STRING_SCANNER_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/json_escape_scanner.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace icelake {
|
|
namespace {
|
|
namespace stage1 {
|
|
|
|
struct json_string_block {
|
|
// We spell out the constructors in the hope of resolving inlining issues with Visual Studio 2017
|
|
simdjson_really_inline json_string_block(uint64_t escaped, uint64_t quote, uint64_t in_string) :
|
|
_escaped(escaped), _quote(quote), _in_string(in_string) {}
|
|
|
|
// Escaped characters (characters following an escape() character)
|
|
simdjson_really_inline uint64_t escaped() const { return _escaped; }
|
|
// Real (non-backslashed) quotes
|
|
simdjson_really_inline uint64_t quote() const { return _quote; }
|
|
// Only characters inside the string (not including the quotes)
|
|
simdjson_really_inline uint64_t string_content() const { return _in_string & ~_quote; }
|
|
// Return a mask of whether the given characters are inside a string (only works on non-quotes)
|
|
simdjson_really_inline uint64_t non_quote_inside_string(uint64_t mask) const { return mask & _in_string; }
|
|
// Return a mask of whether the given characters are inside a string (only works on non-quotes)
|
|
simdjson_really_inline uint64_t non_quote_outside_string(uint64_t mask) const { return mask & ~_in_string; }
|
|
// Tail of string (everything except the start quote)
|
|
simdjson_really_inline uint64_t string_tail() const { return _in_string ^ _quote; }
|
|
|
|
// escaped characters (backslashed--does not include the hex characters after \u)
|
|
uint64_t _escaped;
|
|
// real quotes (non-escaped ones)
|
|
uint64_t _quote;
|
|
// string characters (includes start quote but not end quote)
|
|
uint64_t _in_string;
|
|
};
|
|
|
|
// Scans blocks for string characters, storing the state necessary to do so
|
|
class json_string_scanner {
|
|
public:
|
|
simdjson_really_inline json_string_block next(const simd::simd8x64<uint8_t>& in);
|
|
// Returns either UNCLOSED_STRING or SUCCESS
|
|
simdjson_really_inline error_code finish();
|
|
|
|
private:
|
|
// Scans for escape characters
|
|
json_escape_scanner escape_scanner{};
|
|
// Whether the last iteration was still inside a string (all 1's = true, all 0's = false).
|
|
uint64_t prev_in_string = 0ULL;
|
|
};
|
|
|
|
//
|
|
// Return a mask of all string characters plus end quotes.
|
|
//
|
|
// prev_escaped is overflow saying whether the next character is escaped.
|
|
// prev_in_string is overflow saying whether we're still in a string.
|
|
//
|
|
// Backslash sequences outside of quotes will be detected in stage 2.
|
|
//
|
|
simdjson_really_inline json_string_block json_string_scanner::next(const simd::simd8x64<uint8_t>& in) {
|
|
const uint64_t backslash = in.eq('\\');
|
|
const uint64_t escaped = escape_scanner.next(backslash).escaped;
|
|
const uint64_t quote = in.eq('"') & ~escaped;
|
|
|
|
//
|
|
// prefix_xor flips on bits inside the string (and flips off the end quote).
|
|
//
|
|
// Then we xor with prev_in_string: if we were in a string already, its effect is flipped
|
|
// (characters inside strings are outside, and characters outside strings are inside).
|
|
//
|
|
const uint64_t in_string = prefix_xor(quote) ^ prev_in_string;
|
|
|
|
//
|
|
// Check if we're still in a string at the end of the box so the next block will know
|
|
//
|
|
prev_in_string = uint64_t(static_cast<int64_t>(in_string) >> 63);
|
|
|
|
// Use ^ to turn the beginning quote off, and the end quote on.
|
|
|
|
// We are returning a function-local object so either we get a move constructor
|
|
// or we get copy elision.
|
|
return json_string_block(escaped, quote, in_string);
|
|
}
|
|
|
|
// Reports UNCLOSED_STRING if the last scanned block ended inside a string, SUCCESS otherwise.
simdjson_really_inline error_code json_string_scanner::finish() {
  return prev_in_string ? UNCLOSED_STRING : SUCCESS;
}
|
|
|
|
} // namespace stage1
|
|
} // unnamed namespace
|
|
} // namespace icelake
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE1_JSON_STRING_SCANNER_H
|
|
/* end file generic/stage1/json_string_scanner.h for icelake */
|
|
/* including generic/stage1/utf8_lookup4_algorithm.h for icelake: #include <generic/stage1/utf8_lookup4_algorithm.h> */
|
|
/* begin file generic/stage1/utf8_lookup4_algorithm.h for icelake */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_UTF8_LOOKUP4_ALGORITHM_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_UTF8_LOOKUP4_ALGORITHM_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/dom_parser_implementation.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace icelake {
|
|
namespace {
|
|
namespace utf8_validation {
|
|
|
|
using namespace simd;
|
|
|
|
//
// Classifies every (previous byte, current byte) pair with three 16-entry table lookups:
// the high nibble of the previous byte, the low nibble of the previous byte, and the high
// nibble of the current byte. Each lookup yields a set of error bits that are still possible
// given that nibble; ANDing the three leaves only the errors all three agree on.
//
// Error bits:
//   Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
//   Bit 1 = Too Long (ASCII followed by continuation)
//   Bit 2 = Overlong 3-byte
//   Bit 3 = Too Large
//   Bit 4 = Surrogate
//   Bit 5 = Overlong 2-byte
//   Bit 6 = Too Large (second byte 1000____) / Overlong 4-byte (shared bit)
//   Bit 7 = Two Continuations
//
simdjson_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
  constexpr const uint8_t TOO_SHORT      = 1<<0; // 11______ 0_______
                                                 // 11______ 11______
  constexpr const uint8_t TOO_LONG       = 1<<1; // 0_______ 10______
  constexpr const uint8_t OVERLONG_3     = 1<<2; // 11100000 100_____
  constexpr const uint8_t TOO_LARGE      = 1<<3; // 11110100 1001____
                                                 // 11110100 101_____
                                                 // 11110101 1001____
                                                 // 11110101 101_____
                                                 // 1111011_ 1001____
                                                 // 1111011_ 101_____
                                                 // 11111___ 1001____
                                                 // 11111___ 101_____
  constexpr const uint8_t SURROGATE      = 1<<4; // 11101101 101_____
  constexpr const uint8_t OVERLONG_2     = 1<<5; // 1100000_ 10______
  constexpr const uint8_t TOO_LARGE_1000 = 1<<6; // 11110101 1000____
                                                 // 1111011_ 1000____
                                                 // 11111___ 1000____
  constexpr const uint8_t OVERLONG_4     = 1<<6; // 11110000 1000____
  constexpr const uint8_t TWO_CONTS      = 1<<7; // 10______ 10______

  // Errors that do not depend on the low nibble of byte 1 (____ in byte 1).
  constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS;
  // Most common entry of the byte-1 low-nibble table.
  constexpr const uint8_t CARRY_LARGE = CARRY | TOO_LARGE | TOO_LARGE_1000;
  // Errors every continuation byte in position 2 can take part in.
  constexpr const uint8_t CONT_BASE = TOO_LONG | OVERLONG_2 | TWO_CONTS;

  // Lookup on the high nibble of byte 1.
  const simd8<uint8_t> prev_high_nibble = prev1.shr<4>().lookup_16<uint8_t>(
    // 0_______ ________ <ASCII in byte 1>
    TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
    TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
    // 10______ ________ <continuation in byte 1>
    TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
    // 1100____ ________ <two byte lead in byte 1>
    TOO_SHORT | OVERLONG_2,
    // 1101____ ________ <two byte lead in byte 1>
    TOO_SHORT,
    // 1110____ ________ <three byte lead in byte 1>
    TOO_SHORT | OVERLONG_3 | SURROGATE,
    // 1111____ ________ <four+ byte lead in byte 1>
    TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
  );

  // Lookup on the low nibble of byte 1.
  const simd8<uint8_t> prev_low_nibble = (prev1 & 0x0F).lookup_16<uint8_t>(
    // ____0000 ________
    CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
    // ____0001 ________
    CARRY | OVERLONG_2,
    // ____001_ ________
    CARRY,
    CARRY,

    // ____0100 ________
    CARRY | TOO_LARGE,
    // ____0101 ________
    CARRY_LARGE,
    // ____011_ ________
    CARRY_LARGE,
    CARRY_LARGE,

    // ____1___ ________
    CARRY_LARGE,
    CARRY_LARGE,
    CARRY_LARGE,
    CARRY_LARGE,
    CARRY_LARGE,
    // ____1101 ________
    CARRY_LARGE | SURROGATE,
    CARRY_LARGE,
    CARRY_LARGE
  );

  // Lookup on the high nibble of byte 2.
  const simd8<uint8_t> cur_high_nibble = input.shr<4>().lookup_16<uint8_t>(
    // ________ 0_______ <ASCII in byte 2>
    TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
    TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,

    // ________ 1000____
    CONT_BASE | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
    // ________ 1001____
    CONT_BASE | OVERLONG_3 | TOO_LARGE,
    // ________ 101_____
    CONT_BASE | SURROGATE | TOO_LARGE,
    CONT_BASE | SURROGATE | TOO_LARGE,

    // ________ 11______
    TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
  );

  // Only errors that all three nibbles allow survive.
  return (prev_high_nibble & prev_low_nibble & cur_high_nibble);
}
|
|
//
// Combines the special-case result `sc` with the positions where a 2nd/3rd continuation
// byte is required (as reported by must_be_2_3_continuation on the bytes two and three
// positions back). Only the high bit (0x80) of that requirement is kept and XORed with `sc`.
// The caller ORs the returned value into its error accumulator.
//
simdjson_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
    const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
  const simd8<uint8_t> two_back = input.prev<2>(prev_input);
  const simd8<uint8_t> three_back = input.prev<3>(prev_input);
  const simd8<uint8_t> continuation_required = must_be_2_3_continuation(two_back, three_back);
  const simd8<uint8_t> continuation_required_80 = continuation_required & uint8_t(0x80);
  return continuation_required_80 ^ sc;
}
|
|
|
|
//
|
|
// Return nonzero if there are incomplete multibyte characters at the end of the block:
|
|
// e.g. if there is a 4-byte character, but it's 3 bytes from the end.
|
|
//
|
|
simdjson_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input) {
|
|
// If the previous input's last 3 bytes match this, they're too short (they ended at EOF):
|
|
// ... 1111____ 111_____ 11______
|
|
#if SIMDJSON_IMPLEMENTATION_ICELAKE
|
|
static const uint8_t max_array[64] = {
|
|
255, 255, 255, 255, 255, 255, 255, 255,
|
|
255, 255, 255, 255, 255, 255, 255, 255,
|
|
255, 255, 255, 255, 255, 255, 255, 255,
|
|
255, 255, 255, 255, 255, 255, 255, 255,
|
|
255, 255, 255, 255, 255, 255, 255, 255,
|
|
255, 255, 255, 255, 255, 255, 255, 255,
|
|
255, 255, 255, 255, 255, 255, 255, 255,
|
|
255, 255, 255, 255, 255, 0xf0u-1, 0xe0u-1, 0xc0u-1
|
|
};
|
|
#else
|
|
static const uint8_t max_array[32] = {
|
|
255, 255, 255, 255, 255, 255, 255, 255,
|
|
255, 255, 255, 255, 255, 255, 255, 255,
|
|
255, 255, 255, 255, 255, 255, 255, 255,
|
|
255, 255, 255, 255, 255, 0xf0u-1, 0xe0u-1, 0xc0u-1
|
|
};
|
|
#endif
|
|
const simd8<uint8_t> max_value(&max_array[sizeof(max_array)-sizeof(simd8<uint8_t>)]);
|
|
return input.gt_bits(max_value);
|
|
}
|
|
|
|
struct utf8_checker {
|
|
// If this is nonzero, there has been a UTF-8 error.
|
|
simd8<uint8_t> error;
|
|
// The last input we received
|
|
simd8<uint8_t> prev_input_block;
|
|
// Whether the last input we received was incomplete (used for ASCII fast path)
|
|
simd8<uint8_t> prev_incomplete;
|
|
|
|
//
|
|
// Check whether the current bytes are valid UTF-8.
|
|
//
|
|
simdjson_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
|
|
// Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
|
|
// (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
|
|
simd8<uint8_t> prev1 = input.prev<1>(prev_input);
|
|
simd8<uint8_t> sc = check_special_cases(input, prev1);
|
|
this->error |= check_multibyte_lengths(input, prev_input, sc);
|
|
}
|
|
|
|
// The only problem that can happen at EOF is that a multibyte character is too short
|
|
// or a byte value too large in the last bytes: check_special_cases only checks for bytes
|
|
// too large in the first of two bytes.
|
|
simdjson_inline void check_eof() {
|
|
// If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
|
|
// possibly finish them.
|
|
this->error |= this->prev_incomplete;
|
|
}
|
|
|
|
simdjson_inline void check_next_input(const simd8x64<uint8_t>& input) {
|
|
if(simdjson_likely(is_ascii(input))) {
|
|
this->error |= this->prev_incomplete;
|
|
} else {
|
|
// you might think that a for-loop would work, but under Visual Studio, it is not good enough.
|
|
static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 1)
|
|
||(simd8x64<uint8_t>::NUM_CHUNKS == 2)
|
|
|| (simd8x64<uint8_t>::NUM_CHUNKS == 4),
|
|
"We support one, two or four chunks per 64-byte block.");
|
|
SIMDJSON_IF_CONSTEXPR (simd8x64<uint8_t>::NUM_CHUNKS == 1) {
|
|
this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
|
|
} else SIMDJSON_IF_CONSTEXPR (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
|
|
this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
|
|
this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
|
|
} else SIMDJSON_IF_CONSTEXPR (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
|
|
this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
|
|
this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
|
|
this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
|
|
this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
|
|
}
|
|
this->prev_incomplete = is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1]);
|
|
this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1];
|
|
}
|
|
}
|
|
// do not forget to call check_eof!
|
|
simdjson_inline error_code errors() {
|
|
return this->error.any_bits_set_anywhere() ? error_code::UTF8_ERROR : error_code::SUCCESS;
|
|
}
|
|
|
|
}; // struct utf8_checker
|
|
} // namespace utf8_validation
|
|
|
|
} // unnamed namespace
|
|
} // namespace icelake
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE1_UTF8_LOOKUP4_ALGORITHM_H
|
|
/* end file generic/stage1/utf8_lookup4_algorithm.h for icelake */
|
|
/* including generic/stage1/json_scanner.h for icelake: #include <generic/stage1/json_scanner.h> */
|
|
/* begin file generic/stage1/json_scanner.h for icelake */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_JSON_SCANNER_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_JSON_SCANNER_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/json_character_block.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/json_string_scanner.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace icelake {
|
|
namespace {
|
|
namespace stage1 {
|
|
|
|
/**
|
|
* A block of scanned json, with information on operators and scalars.
|
|
*
|
|
* We seek to identify pseudo-structural characters. Anything that is inside
|
|
* a string must be omitted (hence & ~_string.string_tail()).
|
|
* Otherwise, pseudo-structural characters come in two forms.
|
|
* 1. We have the structural characters ([,],{,},:, comma). The
|
|
* term 'structural character' is from the JSON RFC.
|
|
* 2. We have the 'scalar pseudo-structural characters'.
|
|
* Scalars are quotes, and any character except structural characters and white space.
|
|
*
|
|
* To identify the scalar pseudo-structural characters, we must look at what comes
|
|
* before them: it must be a space, a quote or a structural characters.
|
|
* Starting with simdjson v0.3, we identify them by
|
|
* negation: we identify everything that is followed by a non-quote scalar,
|
|
* and we negate that. Whatever remains must be a 'scalar pseudo-structural character'.
|
|
*/
|
|
struct json_block {
|
|
public:
|
|
// We spell out the constructors in the hope of resolving inlining issues with Visual Studio 2017
|
|
simdjson_inline json_block(json_string_block&& string, json_character_block characters, uint64_t follows_potential_nonquote_scalar) :
|
|
_string(std::move(string)), _characters(characters), _follows_potential_nonquote_scalar(follows_potential_nonquote_scalar) {}
|
|
simdjson_inline json_block(json_string_block string, json_character_block characters, uint64_t follows_potential_nonquote_scalar) :
|
|
_string(string), _characters(characters), _follows_potential_nonquote_scalar(follows_potential_nonquote_scalar) {}
|
|
|
|
/**
|
|
* The start of structurals.
|
|
* In simdjson prior to v0.3, these were called the pseudo-structural characters.
|
|
**/
|
|
simdjson_inline uint64_t structural_start() const noexcept { return potential_structural_start() & ~_string.string_tail(); }
|
|
/** All JSON whitespace (i.e. not in a string) */
|
|
simdjson_inline uint64_t whitespace() const noexcept { return non_quote_outside_string(_characters.whitespace()); }
|
|
|
|
// Helpers
|
|
|
|
/** Whether the given characters are inside a string (only works on non-quotes) */
|
|
simdjson_inline uint64_t non_quote_inside_string(uint64_t mask) const noexcept { return _string.non_quote_inside_string(mask); }
|
|
/** Whether the given characters are outside a string (only works on non-quotes) */
|
|
simdjson_inline uint64_t non_quote_outside_string(uint64_t mask) const noexcept { return _string.non_quote_outside_string(mask); }
|
|
|
|
// string and escape characters
|
|
json_string_block _string;
|
|
// whitespace, structural characters ('operators'), scalars
|
|
json_character_block _characters;
|
|
// whether the previous character was a scalar
|
|
uint64_t _follows_potential_nonquote_scalar;
|
|
private:
|
|
// Potential structurals (i.e. disregarding strings)
|
|
|
|
/**
|
|
* structural elements ([,],{,},:, comma) plus scalar starts like 123, true and "abc".
|
|
* They may reside inside a string.
|
|
**/
|
|
simdjson_inline uint64_t potential_structural_start() const noexcept { return _characters.op() | potential_scalar_start(); }
|
|
/**
|
|
* The start of non-operator runs, like 123, true and "abc".
|
|
* It main reside inside a string.
|
|
**/
|
|
simdjson_inline uint64_t potential_scalar_start() const noexcept {
|
|
// The term "scalar" refers to anything except structural characters and white space
|
|
// (so letters, numbers, quotes).
|
|
// Whenever it is preceded by something that is not a structural element ({,},[,],:, ") nor a white-space
|
|
// then we know that it is irrelevant structurally.
|
|
return _characters.scalar() & ~follows_potential_scalar();
|
|
}
|
|
/**
|
|
* Whether the given character is immediately after a non-operator like 123, true.
|
|
* The characters following a quote are not included.
|
|
*/
|
|
simdjson_inline uint64_t follows_potential_scalar() const noexcept {
|
|
// _follows_potential_nonquote_scalar: is defined as marking any character that follows a character
|
|
// that is not a structural element ({,},[,],:, comma) nor a quote (") and that is not a
|
|
// white space.
|
|
// It is understood that within quoted region, anything at all could be marked (irrelevant).
|
|
return _follows_potential_nonquote_scalar;
|
|
}
|
|
};
|
|
|
|
/**
|
|
* Scans JSON for important bits: structural characters or 'operators', strings, and scalars.
|
|
*
|
|
* The scanner starts by calculating two distinct things:
|
|
* - string characters (taking \" into account)
|
|
* - structural characters or 'operators' ([]{},:, comma)
|
|
* and scalars (runs of non-operators like 123, true and "abc")
|
|
*
|
|
* To minimize data dependency (a key component of the scanner's speed), it finds these in parallel:
|
|
* in particular, the operator/scalar bit will find plenty of things that are actually part of
|
|
* strings. When we're done, json_block will fuse the two together by masking out tokens that are
|
|
* part of a string.
|
|
*/
|
|
class json_scanner {
|
|
public:
|
|
json_scanner() = default;
|
|
simdjson_inline json_block next(const simd::simd8x64<uint8_t>& in);
|
|
// Returns either UNCLOSED_STRING or SUCCESS
|
|
simdjson_inline error_code finish();
|
|
|
|
private:
|
|
// Whether the last character of the previous iteration is part of a scalar token
|
|
// (anything except whitespace or a structural character/'operator').
|
|
uint64_t prev_scalar = 0ULL;
|
|
json_string_scanner string_scanner{};
|
|
};
|
|
|
|
|
|
//
|
|
// Check if the current character immediately follows a matching character.
|
|
//
|
|
// For example, this checks for quotes with backslashes in front of them:
|
|
//
|
|
// const uint64_t backslashed_quote = in.eq('"') & immediately_follows(in.eq('\'), prev_backslash);
|
|
//
|
|
simdjson_inline uint64_t follows(const uint64_t match, uint64_t &overflow) {
|
|
const uint64_t result = match << 1 | overflow;
|
|
overflow = match >> 63;
|
|
return result;
|
|
}
|
|
|
|
// Scans one block: string masks, character classes, and the "follows a non-quote scalar" mask.
simdjson_inline json_block json_scanner::next(const simd::simd8x64<uint8_t>& in) {
  json_string_block string_info = string_scanner.next(in);
  // Whitespace and structural characters.
  json_character_block char_info = json_character_block::classify(in);

  // "Scalar" means anything except structural characters and white space (so letters,
  // numbers, quotes). The mask built here marks anything that follows a *non-quote* scalar
  // (so letters and numbers).
  //
  // A terminal quote should be followed by a structural character (comma, brace, bracket,
  // colon) or nothing. Even so, ' "a string"true ' should flag the 't' of 'true' as a
  // potential pseudo-structural character, exactly as ' "a string" true ' would; otherwise
  // string parsing might need an extra check. Excluding quotes from the mask achieves that.
  //
  // Performance: there are many ways to skin this cat.
  const uint64_t scalar_not_quote = char_info.scalar() & ~string_info.quote();
  uint64_t after_scalar = follows(scalar_not_quote, prev_scalar);

  // Returned by value: the compiler either elides the copy or moves.
  return json_block(string_info, char_info, after_scalar);
}
|
|
|
|
// Forwards to the string scanner, which reports either UNCLOSED_STRING or SUCCESS.
simdjson_inline error_code json_scanner::finish() {
  const error_code string_status = string_scanner.finish();
  return string_status;
}
|
|
|
|
} // namespace stage1
|
|
} // unnamed namespace
|
|
} // namespace icelake
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE1_JSON_SCANNER_H
|
|
/* end file generic/stage1/json_scanner.h for icelake */
|
|
|
|
// All other declarations
|
|
/* including generic/stage1/find_next_document_index.h for icelake: #include <generic/stage1/find_next_document_index.h> */
|
|
/* begin file generic/stage1/find_next_document_index.h for icelake */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_FIND_NEXT_DOCUMENT_INDEX_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_FIND_NEXT_DOCUMENT_INDEX_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/generic/dom_parser_implementation.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace icelake {
|
|
namespace {
|
|
namespace stage1 {
|
|
|
|
/**
|
|
* This algorithm is used to quickly identify the last structural position that
|
|
* makes up a complete document.
|
|
*
|
|
* It does this by going backwards and finding the last *document boundary* (a
|
|
* place where one value follows another without a comma between them). If the
|
|
* last document (the characters after the boundary) has an equal number of
|
|
* start and end brackets, it is considered complete.
|
|
*
|
|
* Simply put, we iterate over the structural characters, starting from
|
|
* the end. We consider that we found the end of a JSON document when the
|
|
* first element of the pair is NOT one of these characters: '{' '[' ':' ','
|
|
* and when the second element is NOT one of these characters: '}' ']' ':' ','.
|
|
*
|
|
* This simple comparison works most of the time, but it does not cover cases
|
|
* where the batch's structural indexes contain a perfect amount of documents.
|
|
* In such a case, we do not have access to the structural index which follows
|
|
* the last document, therefore, we do not have access to the second element in
|
|
* the pair, and that means we cannot identify the last document. To fix this
|
|
* issue, we keep a count of the open and closed curly/square braces we found
|
|
* while searching for the pair. When we find a pair AND the count of open and
|
|
* closed curly/square braces is the same, we know that we just passed a
|
|
* complete document, therefore the last json buffer location is the end of the
|
|
* batch.
|
|
*/
|
|
simdjson_inline uint32_t find_next_document_index(dom_parser_implementation &parser) {
|
|
// Variant: do not count separately, just figure out depth
|
|
if(parser.n_structural_indexes == 0) { return 0; }
|
|
auto arr_cnt = 0;
|
|
auto obj_cnt = 0;
|
|
for (auto i = parser.n_structural_indexes - 1; i > 0; i--) {
|
|
auto idxb = parser.structural_indexes[i];
|
|
switch (parser.buf[idxb]) {
|
|
case ':':
|
|
case ',':
|
|
continue;
|
|
case '}':
|
|
obj_cnt--;
|
|
continue;
|
|
case ']':
|
|
arr_cnt--;
|
|
continue;
|
|
case '{':
|
|
obj_cnt++;
|
|
break;
|
|
case '[':
|
|
arr_cnt++;
|
|
break;
|
|
}
|
|
auto idxa = parser.structural_indexes[i - 1];
|
|
switch (parser.buf[idxa]) {
|
|
case '{':
|
|
case '[':
|
|
case ':':
|
|
case ',':
|
|
continue;
|
|
}
|
|
// Last document is complete, so the next document will appear after!
|
|
if (!arr_cnt && !obj_cnt) {
|
|
return parser.n_structural_indexes;
|
|
}
|
|
// Last document is incomplete; mark the document at i + 1 as the next one
|
|
return i;
|
|
}
|
|
// If we made it to the end, we want to finish counting to see if we have a full document.
|
|
switch (parser.buf[parser.structural_indexes[0]]) {
|
|
case '}':
|
|
obj_cnt--;
|
|
break;
|
|
case ']':
|
|
arr_cnt--;
|
|
break;
|
|
case '{':
|
|
obj_cnt++;
|
|
break;
|
|
case '[':
|
|
arr_cnt++;
|
|
break;
|
|
}
|
|
if (!arr_cnt && !obj_cnt) {
|
|
// We have a complete document.
|
|
return parser.n_structural_indexes;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
} // namespace stage1
|
|
} // unnamed namespace
|
|
} // namespace icelake
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE1_FIND_NEXT_DOCUMENT_INDEX_H
|
|
/* end file generic/stage1/find_next_document_index.h for icelake */
|
|
/* including generic/stage1/json_minifier.h for icelake: #include <generic/stage1/json_minifier.h> */
|
|
/* begin file generic/stage1/json_minifier.h for icelake */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_JSON_MINIFIER_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_JSON_MINIFIER_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/json_scanner.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/buf_block_reader.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
// This file contains the common code every implementation uses in stage1
|
|
// It is intended to be included multiple times and compiled multiple times
|
|
// We assume the file in which it is included already includes
|
|
// "simdjson/stage1.h" (this simplifies amalgamation)
|
|
|
|
namespace simdjson {
|
|
namespace icelake {
|
|
namespace {
|
|
namespace stage1 {
|
|
|
|
class json_minifier {
|
|
public:
|
|
template<size_t STEP_SIZE>
|
|
static error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) noexcept;
|
|
|
|
private:
|
|
simdjson_inline json_minifier(uint8_t *_dst)
|
|
: dst{_dst}
|
|
{}
|
|
template<size_t STEP_SIZE>
|
|
simdjson_inline void step(const uint8_t *block_buf, buf_block_reader<STEP_SIZE> &reader) noexcept;
|
|
simdjson_inline void next(const simd::simd8x64<uint8_t>& in, const json_block& block);
|
|
simdjson_inline error_code finish(uint8_t *dst_start, size_t &dst_len);
|
|
json_scanner scanner{};
|
|
uint8_t *dst;
|
|
};
|
|
|
|
// Passes the block's whitespace mask to compress(), which writes to dst, and advances dst
// by the value compress() returns.
simdjson_inline void json_minifier::next(const simd::simd8x64<uint8_t>& in, const json_block& block) {
  const uint64_t whitespace_mask = block.whitespace();
  dst += in.compress(whitespace_mask, dst);
}
|
|
|
|
// Finalizes minification. On a scanner error, dst_len is set to 0 and the error is
// returned; otherwise dst_len receives the number of bytes written since dst_start and
// SUCCESS is returned.
simdjson_inline error_code json_minifier::finish(uint8_t *dst_start, size_t &dst_len) {
  const error_code scan_status = scanner.finish();
  if (!scan_status) {
    dst_len = dst - dst_start;
    return SUCCESS;
  }
  dst_len = 0;
  return scan_status;
}
|
|
|
|
template<>
|
|
simdjson_inline void json_minifier::step<128>(const uint8_t *block_buf, buf_block_reader<128> &reader) noexcept {
|
|
simd::simd8x64<uint8_t> in_1(block_buf);
|
|
simd::simd8x64<uint8_t> in_2(block_buf+64);
|
|
json_block block_1 = scanner.next(in_1);
|
|
json_block block_2 = scanner.next(in_2);
|
|
this->next(in_1, block_1);
|
|
this->next(in_2, block_2);
|
|
reader.advance();
|
|
}
|
|
|
|
template<>
|
|
simdjson_inline void json_minifier::step<64>(const uint8_t *block_buf, buf_block_reader<64> &reader) noexcept {
|
|
simd::simd8x64<uint8_t> in_1(block_buf);
|
|
json_block block_1 = scanner.next(in_1);
|
|
this->next(block_buf, block_1);
|
|
reader.advance();
|
|
}
|
|
|
|
template<size_t STEP_SIZE>
|
|
error_code json_minifier::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) noexcept {
|
|
buf_block_reader<STEP_SIZE> reader(buf, len);
|
|
json_minifier minifier(dst);
|
|
|
|
// Index the first n-1 blocks
|
|
while (reader.has_full_block()) {
|
|
minifier.step<STEP_SIZE>(reader.full_block(), reader);
|
|
}
|
|
|
|
// Index the last (remainder) block, padded with spaces
|
|
uint8_t block[STEP_SIZE];
|
|
size_t remaining_bytes = reader.get_remainder(block);
|
|
if (remaining_bytes > 0) {
|
|
// We do not want to write directly to the output stream. Rather, we write
|
|
// to a local buffer (for safety).
|
|
uint8_t out_block[STEP_SIZE];
|
|
uint8_t * const guarded_dst{minifier.dst};
|
|
minifier.dst = out_block;
|
|
minifier.step<STEP_SIZE>(block, reader);
|
|
size_t to_write = minifier.dst - out_block;
|
|
// In some cases, we could be enticed to consider the padded spaces
|
|
// as part of the string. This is fine as long as we do not write more
|
|
// than we consumed.
|
|
if(to_write > remaining_bytes) { to_write = remaining_bytes; }
|
|
memcpy(guarded_dst, out_block, to_write);
|
|
minifier.dst = guarded_dst + to_write;
|
|
}
|
|
return minifier.finish(dst, dst_len);
|
|
}
|
|
|
|
} // namespace stage1
|
|
} // unnamed namespace
|
|
} // namespace icelake
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE1_JSON_MINIFIER_H
|
|
/* end file generic/stage1/json_minifier.h for icelake */
|
|
/* including generic/stage1/json_structural_indexer.h for icelake: #include <generic/stage1/json_structural_indexer.h> */
|
|
/* begin file generic/stage1/json_structural_indexer.h for icelake */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_JSON_STRUCTURAL_INDEXER_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_JSON_STRUCTURAL_INDEXER_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/utf8_lookup4_algorithm.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/buf_block_reader.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/json_string_scanner.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/json_scanner.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/json_minifier.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/find_next_document_index.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
// This file contains the common code every implementation uses in stage1
|
|
// It is intended to be included multiple times and compiled multiple times
|
|
// We assume the file in which it is included already includes
|
|
// "simdjson/stage1.h" (this simplifies amalgamation)
|
|
|
|
namespace simdjson {
|
|
namespace icelake {
|
|
namespace {
|
|
namespace stage1 {
|
|
|
|
class bit_indexer {
|
|
public:
|
|
uint32_t *tail;
|
|
|
|
simdjson_inline bit_indexer(uint32_t *index_buf) : tail(index_buf) {}
|
|
|
|
#if SIMDJSON_PREFER_REVERSE_BITS
|
|
/**
|
|
* ARM lacks a fast trailing zero instruction, but it has a fast
|
|
* bit reversal instruction and a fast leading zero instruction.
|
|
* Thus it may be profitable to reverse the bits (once) and then
|
|
* to rely on a sequence of instructions that call the leading
|
|
* zero instruction.
|
|
*
|
|
* Performance notes:
|
|
* The chosen routine is not optimal in terms of data dependency
|
|
* since zero_leading_bit might require two instructions. However,
|
|
* it tends to minimize the total number of instructions which is
|
|
* beneficial.
|
|
*/
|
|
simdjson_inline void write_index(uint32_t idx, uint64_t& rev_bits, int i) {
|
|
int lz = leading_zeroes(rev_bits);
|
|
this->tail[i] = static_cast<uint32_t>(idx) + lz;
|
|
rev_bits = zero_leading_bit(rev_bits, lz);
|
|
}
|
|
#else
|
|
/**
|
|
* Under recent x64 systems, we often have both a fast trailing zero
|
|
* instruction and a fast 'clear-lower-bit' instruction so the following
|
|
* algorithm can be competitive.
|
|
*/
|
|
|
|
simdjson_inline void write_index(uint32_t idx, uint64_t& bits, int i) {
|
|
this->tail[i] = idx + trailing_zeroes(bits);
|
|
bits = clear_lowest_bit(bits);
|
|
}
|
|
#endif // SIMDJSON_PREFER_REVERSE_BITS
|
|
|
|
template <int START, int N>
|
|
simdjson_inline int write_indexes(uint32_t idx, uint64_t& bits) {
|
|
write_index(idx, bits, START);
|
|
SIMDJSON_IF_CONSTEXPR (N > 1) {
|
|
write_indexes<(N-1>0?START+1:START), (N-1>=0?N-1:1)>(idx, bits);
|
|
}
|
|
return START+N;
|
|
}
|
|
|
|
template <int START, int END, int STEP>
|
|
simdjson_inline int write_indexes_stepped(uint32_t idx, uint64_t& bits, int cnt) {
|
|
write_indexes<START, STEP>(idx, bits);
|
|
SIMDJSON_IF_CONSTEXPR ((START+STEP) < END) {
|
|
if (simdjson_unlikely((START+STEP) < cnt)) {
|
|
write_indexes_stepped<(START+STEP<END?START+STEP:END), END, STEP>(idx, bits, cnt);
|
|
}
|
|
}
|
|
return ((END-START) % STEP) == 0 ? END : (END-START) - ((END-START) % STEP) + STEP;
|
|
}
|
|
|
|
// flatten out values in 'bits' assuming that they are are to have values of idx
|
|
// plus their position in the bitvector, and store these indexes at
|
|
// base_ptr[base] incrementing base as we go
|
|
// will potentially store extra values beyond end of valid bits, so base_ptr
|
|
// needs to be large enough to handle this
|
|
//
|
|
// If the kernel sets SIMDJSON_GENERIC_JSON_STRUCTURAL_INDEXER_CUSTOM_BIT_INDEXER, then it
|
|
// will provide its own version of the code.
|
|
#ifdef SIMDJSON_GENERIC_JSON_STRUCTURAL_INDEXER_CUSTOM_BIT_INDEXER
|
|
simdjson_inline void write(uint32_t idx, uint64_t bits);
|
|
#else
|
|
simdjson_inline void write(uint32_t idx, uint64_t bits) {
|
|
// In some instances, the next branch is expensive because it is mispredicted.
|
|
// Unfortunately, in other cases,
|
|
// it helps tremendously.
|
|
if (bits == 0)
|
|
return;
|
|
|
|
int cnt = static_cast<int>(count_ones(bits));
|
|
|
|
#if SIMDJSON_PREFER_REVERSE_BITS
|
|
bits = reverse_bits(bits);
|
|
#endif
|
|
#ifdef SIMDJSON_STRUCTURAL_INDEXER_STEP
|
|
static constexpr const int STEP = SIMDJSON_STRUCTURAL_INDEXER_STEP;
|
|
#else
|
|
static constexpr const int STEP = 4;
|
|
#endif
|
|
static constexpr const int STEP_UNTIL = 24;
|
|
|
|
write_indexes_stepped<0, STEP_UNTIL, STEP>(idx, bits, cnt);
|
|
SIMDJSON_IF_CONSTEXPR (STEP_UNTIL < 64) {
|
|
if (simdjson_unlikely(STEP_UNTIL < cnt)) {
|
|
for (int i=STEP_UNTIL; i<cnt; i++) {
|
|
write_index(idx, bits, i);
|
|
}
|
|
}
|
|
}
|
|
|
|
this->tail += cnt;
|
|
}
|
|
#endif // SIMDJSON_GENERIC_JSON_STRUCTURAL_INDEXER_CUSTOM_BIT_INDEXER
|
|
|
|
};
|
|
|
|
class json_structural_indexer {
|
|
public:
|
|
/**
|
|
* Find the important bits of JSON in a 128-byte chunk, and add them to structural_indexes.
|
|
*
|
|
* @param partial Setting the partial parameter to true allows the find_structural_bits to
|
|
* tolerate unclosed strings. The caller should still ensure that the input is valid UTF-8. If
|
|
* you are processing substrings, you may want to call on a function like trimmed_length_safe_utf8.
|
|
*/
|
|
template<size_t STEP_SIZE>
|
|
static error_code index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, stage1_mode partial) noexcept;
|
|
|
|
private:
|
|
simdjson_inline json_structural_indexer(uint32_t *structural_indexes);
|
|
template<size_t STEP_SIZE>
|
|
simdjson_inline void step(const uint8_t *block, buf_block_reader<STEP_SIZE> &reader) noexcept;
|
|
simdjson_inline void next(const simd::simd8x64<uint8_t>& in, const json_block& block, size_t idx);
|
|
simdjson_inline error_code finish(dom_parser_implementation &parser, size_t idx, size_t len, stage1_mode partial);
|
|
|
|
json_scanner scanner{};
|
|
utf8_checker checker{};
|
|
bit_indexer indexer;
|
|
uint64_t prev_structurals = 0;
|
|
uint64_t unescaped_chars_error = 0;
|
|
};
|
|
|
|
// Points the bit indexer at the start of the supplied index buffer; all other
// members keep their in-class default initializers.
simdjson_inline json_structural_indexer::json_structural_indexer(uint32_t *structural_indexes)
  : indexer(structural_indexes) {}
|
|
|
|
// Skip the last character if it is partial
|
|
simdjson_inline size_t trim_partial_utf8(const uint8_t *buf, size_t len) {
|
|
if (simdjson_unlikely(len < 3)) {
|
|
switch (len) {
|
|
case 2:
|
|
if (buf[len-1] >= 0xc0) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left
|
|
if (buf[len-2] >= 0xe0) { return len-2; } // 3- and 4-byte characters with only 2 bytes left
|
|
return len;
|
|
case 1:
|
|
if (buf[len-1] >= 0xc0) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left
|
|
return len;
|
|
case 0:
|
|
return len;
|
|
}
|
|
}
|
|
if (buf[len-1] >= 0xc0) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left
|
|
if (buf[len-2] >= 0xe0) { return len-2; } // 3- and 4-byte characters with only 1 byte left
|
|
if (buf[len-3] >= 0xf0) { return len-3; } // 4-byte characters with only 3 bytes left
|
|
return len;
|
|
}
|
|
|
|
//
|
|
// PERF NOTES:
|
|
// We pipe 2 inputs through these stages:
|
|
// 1. Load JSON into registers. This takes a long time and is highly parallelizable, so we load
|
|
// 2 inputs' worth at once so that by the time step 2 is looking for the input, it's available.
|
|
// 2. Scan the JSON for critical data: strings, scalars and operators. This is the critical path.
|
|
// The output of step 1 depends entirely on this information. These functions don't quite use
|
|
// up enough CPU: the second half of the functions is highly serial, only using 1 execution core
|
|
// at a time. The second input's scans has some dependency on the first ones finishing it, but
|
|
// they can make a lot of progress before they need that information.
|
|
// 3. Step 1 does not use enough capacity, so we run some extra stuff while we're waiting for that
|
|
// to finish: utf-8 checks and generating the output from the last iteration.
|
|
//
|
|
// The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all
|
|
// available capacity with just one input. Running 2 at a time seems to give the CPU a good enough
|
|
// workout.
|
|
//
|
|
template<size_t STEP_SIZE>
|
|
error_code json_structural_indexer::index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, stage1_mode partial) noexcept {
|
|
if (simdjson_unlikely(len > parser.capacity())) { return CAPACITY; }
|
|
// We guard the rest of the code so that we can assume that len > 0 throughout.
|
|
if (len == 0) { return EMPTY; }
|
|
if (is_streaming(partial)) {
|
|
len = trim_partial_utf8(buf, len);
|
|
// If you end up with an empty window after trimming
|
|
// the partial UTF-8 bytes, then chances are good that you
|
|
// have an UTF-8 formatting error.
|
|
if(len == 0) { return UTF8_ERROR; }
|
|
}
|
|
buf_block_reader<STEP_SIZE> reader(buf, len);
|
|
json_structural_indexer indexer(parser.structural_indexes.get());
|
|
|
|
// Read all but the last block
|
|
while (reader.has_full_block()) {
|
|
indexer.step<STEP_SIZE>(reader.full_block(), reader);
|
|
}
|
|
// Take care of the last block (will always be there unless file is empty which is
|
|
// not supposed to happen.)
|
|
uint8_t block[STEP_SIZE];
|
|
if (simdjson_unlikely(reader.get_remainder(block) == 0)) { return UNEXPECTED_ERROR; }
|
|
indexer.step<STEP_SIZE>(block, reader);
|
|
return indexer.finish(parser, reader.block_index(), len, partial);
|
|
}
|
|
|
|
template<>
|
|
simdjson_inline void json_structural_indexer::step<128>(const uint8_t *block, buf_block_reader<128> &reader) noexcept {
|
|
simd::simd8x64<uint8_t> in_1(block);
|
|
simd::simd8x64<uint8_t> in_2(block+64);
|
|
json_block block_1 = scanner.next(in_1);
|
|
json_block block_2 = scanner.next(in_2);
|
|
this->next(in_1, block_1, reader.block_index());
|
|
this->next(in_2, block_2, reader.block_index()+64);
|
|
reader.advance();
|
|
}
|
|
|
|
template<>
|
|
simdjson_inline void json_structural_indexer::step<64>(const uint8_t *block, buf_block_reader<64> &reader) noexcept {
|
|
simd::simd8x64<uint8_t> in_1(block);
|
|
json_block block_1 = scanner.next(in_1);
|
|
this->next(in_1, block_1, reader.block_index());
|
|
reader.advance();
|
|
}
|
|
|
|
// Handles one scanned 64-byte chunk located at offset `idx`:
//  - feeds it to the UTF-8 checker (when SIMDJSON_UTF8VALIDATION is enabled),
//  - writes out the structurals of the *previous* chunk (hence idx-64),
//  - remembers this chunk's structurals for the next call,
//  - accumulates bytes <= 0x1F that the block reports as inside a string.
simdjson_inline void json_structural_indexer::next(const simd::simd8x64<uint8_t>& in, const json_block& block, size_t idx) {
  const uint64_t control_chars = in.lteq(0x1F);
#if SIMDJSON_UTF8VALIDATION
  checker.check_next_input(in);
#endif
  const uint32_t previous_offset = uint32_t(idx-64);
  indexer.write(previous_offset, prev_structurals);
  prev_structurals = block.structural_start();
  unescaped_chars_error |= block.non_quote_inside_string(control_chars);
}
|
|
|
|
// Completes stage 1: flushes the last chunk's structurals, checks the scanner
// and unescaped-character state, fills in the parser's index count and
// padding entries, applies the streaming truncation rules, and finally
// returns the UTF-8 checker's verdict.
simdjson_inline error_code json_structural_indexer::finish(dom_parser_implementation &parser, size_t idx, size_t len, stage1_mode partial) {
  // next() writes structurals one chunk late, so the final chunk's are still pending.
  indexer.write(uint32_t(idx-64), prev_structurals);

  const error_code scan_status = scanner.finish();
  const bool have_unclosed_string = (scan_status == UNCLOSED_STRING);
  // Any scanner error is fatal, except that the streaming modes tolerate
  // UNCLOSED_STRING.
  bool must_abort = (scan_status != SUCCESS);
  if (is_streaming(partial) && have_unclosed_string) { must_abort = false; }
  if (simdjson_unlikely(must_abort)) { return scan_status; }

  if (unescaped_chars_error) {
    return UNESCAPED_CHARS;
  }

  parser.n_structural_indexes = uint32_t(indexer.tail - parser.structural_indexes.get());
  /***
   * The On Demand API requires special padding.
   *
   * This is related to https://github.com/simdjson/simdjson/issues/906
   * Basically, we want to make sure that if the parsing continues beyond the last (valid)
   * structural character, it quickly stops.
   * Only three structural characters can be repeated without triggering an error in JSON: [,] and }.
   * We repeat the padding character (at 'len'). We don't know what it is, but if the parsing
   * continues, then it must be [,] or }.
   * Suppose it is ] or }. We backtrack to the first character, what could it be that would
   * not trigger an error? It could be ] or } but no, because you can't start a document that way.
   * It can't be a comma, a colon or any simple value. So the only way we could continue is
   * if the repeated character is [. But if so, the document must start with [. But if the document
   * starts with [, it should end with ]. If we enforce that rule, then we would get
   * ][[ which is invalid.
   *
   * This is illustrated with the test array_iterate_unclosed_error() on the following input:
   * R"({ "a": [,,)"
   **/
  parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len); // used later in partial == stage1_mode::streaming_final
  parser.structural_indexes[parser.n_structural_indexes + 1] = uint32_t(len);
  parser.structural_indexes[parser.n_structural_indexes + 2] = 0;
  parser.next_structural_index = 0;

  // A valid JSON file cannot have zero structural indexes - we should have found something.
  if (simdjson_unlikely(parser.n_structural_indexes == 0u)) {
    return EMPTY;
  }
  if (simdjson_unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) {
    return UNEXPECTED_ERROR;
  }

  if (partial == stage1_mode::streaming_partial) {
    // With an unclosed string, the last structural is its opening quote;
    // make sure to omit it.
    if (have_unclosed_string) {
      parser.n_structural_indexes--;
      // A valid JSON file cannot have zero structural indexes - we should have found something.
      if (simdjson_unlikely(parser.n_structural_indexes == 0u)) { return CAPACITY; }
    }
    // Truncate the input to the end of the last complete document (or zero).
    auto complete_count = find_next_document_index(parser);
    if (complete_count == 0 && parser.n_structural_indexes > 0) {
      if (parser.structural_indexes[0] == 0) {
        // The buffer is partial and we started at index 0, yet the document
        // is incomplete: it is too big to parse.
        return CAPACITY;
      }
      // It is possible that the document could be parsed, we just had a lot
      // of white space.
      parser.n_structural_indexes = 0;
      return EMPTY;
    }

    parser.n_structural_indexes = complete_count;
  } else if (partial == stage1_mode::streaming_final) {
    if (have_unclosed_string) { parser.n_structural_indexes--; }
    // Truncate the input to the end of the last complete document (or zero).
    // Because partial == stage1_mode::streaming_final, it means that we may
    // silently ignore trailing garbage. Though it sounds bad, we do it
    // deliberately because many people who have streams of JSON documents
    // will truncate them for processing. E.g., imagine that you are uncompressing
    // the data from a size file or receiving it in chunks from the network. You
    // may not know where exactly the last document will be. Meanwhile the
    // document_stream instances allow people to know the JSON documents they are
    // parsing (see the iterator.source() method).
    parser.n_structural_indexes = find_next_document_index(parser);
    // We store the initial n_structural_indexes so that the client can see
    // whether we used truncation. If initial_n_structural_indexes == parser.n_structural_indexes,
    // then this will query parser.structural_indexes[parser.n_structural_indexes] which is len,
    // otherwise, it will copy some prior index.
    parser.structural_indexes[parser.n_structural_indexes + 1] = parser.structural_indexes[parser.n_structural_indexes];
    // This next line is critical, do not change it unless you understand what you are
    // doing.
    parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len);
    if (simdjson_unlikely(parser.n_structural_indexes == 0u)) {
      // We tolerate an unclosed string at the very end of the stream. Indeed, users
      // often load their data in bulk without being careful and they want us to ignore
      // the trailing garbage.
      return EMPTY;
    }
  }

  checker.check_eof();
  return checker.errors();
}
|
|
|
|
} // namespace stage1
|
|
} // unnamed namespace
|
|
} // namespace icelake
|
|
} // namespace simdjson
|
|
|
|
// Clear CUSTOM_BIT_INDEXER so other implementations can set it if they need to.
|
|
#undef SIMDJSON_GENERIC_JSON_STRUCTURAL_INDEXER_CUSTOM_BIT_INDEXER
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE1_JSON_STRUCTURAL_INDEXER_H
|
|
/* end file generic/stage1/json_structural_indexer.h for icelake */
|
|
/* including generic/stage1/utf8_validator.h for icelake: #include <generic/stage1/utf8_validator.h> */
|
|
/* begin file generic/stage1/utf8_validator.h for icelake */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_UTF8_VALIDATOR_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_UTF8_VALIDATOR_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/buf_block_reader.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/utf8_lookup4_algorithm.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace icelake {
|
|
namespace {
|
|
namespace stage1 {
|
|
|
|
/**
|
|
* Validates that the string is actual UTF-8.
|
|
*/
|
|
template<class checker>
|
|
bool generic_validate_utf8(const uint8_t * input, size_t length) {
|
|
checker c{};
|
|
buf_block_reader<64> reader(input, length);
|
|
while (reader.has_full_block()) {
|
|
simd::simd8x64<uint8_t> in(reader.full_block());
|
|
c.check_next_input(in);
|
|
reader.advance();
|
|
}
|
|
uint8_t block[64]{};
|
|
reader.get_remainder(block);
|
|
simd::simd8x64<uint8_t> in(block);
|
|
c.check_next_input(in);
|
|
reader.advance();
|
|
c.check_eof();
|
|
return c.errors() == error_code::SUCCESS;
|
|
}
|
|
|
|
bool generic_validate_utf8(const char * input, size_t length) {
|
|
return generic_validate_utf8<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
|
|
}
|
|
|
|
} // namespace stage1
|
|
} // unnamed namespace
|
|
} // namespace icelake
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE1_UTF8_VALIDATOR_H
|
|
/* end file generic/stage1/utf8_validator.h for icelake */
|
|
/* end file generic/stage1/amalgamated.h for icelake */
|
|
/* including generic/stage2/amalgamated.h for icelake: #include <generic/stage2/amalgamated.h> */
|
|
/* begin file generic/stage2/amalgamated.h for icelake */
|
|
// Stuff other things depend on
|
|
/* including generic/stage2/base.h for icelake: #include <generic/stage2/base.h> */
|
|
/* begin file generic/stage2/base.h for icelake */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE2_BASE_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE2_BASE_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/base.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace icelake {
|
|
namespace {
|
|
namespace stage2 {
|
|
|
|
// Forward declarations of the stage 2 types, so that headers can refer to
// them before their definitions have been seen.
class json_iterator;
class structural_iterator;
struct tape_builder;
struct tape_writer;
|
|
|
|
} // namespace stage2
|
|
} // unnamed namespace
|
|
} // namespace icelake
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE2_BASE_H
|
|
/* end file generic/stage2/base.h for icelake */
|
|
/* including generic/stage2/tape_writer.h for icelake: #include <generic/stage2/tape_writer.h> */
|
|
/* begin file generic/stage2/tape_writer.h for icelake */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE2_TAPE_WRITER_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE2_TAPE_WRITER_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/internal/tape_type.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
#include <cstring>
|
|
|
|
namespace simdjson {
|
|
namespace icelake {
|
|
namespace {
|
|
namespace stage2 {
|
|
|
|
struct tape_writer {
|
|
/** The next place to write to tape */
|
|
uint64_t *next_tape_loc;
|
|
|
|
/** Write a signed 64-bit value to tape. */
|
|
simdjson_inline void append_s64(int64_t value) noexcept;
|
|
|
|
/** Write an unsigned 64-bit value to tape. */
|
|
simdjson_inline void append_u64(uint64_t value) noexcept;
|
|
|
|
/** Write a double value to tape. */
|
|
simdjson_inline void append_double(double value) noexcept;
|
|
|
|
/**
|
|
* Append a tape entry (an 8-bit type,and 56 bits worth of value).
|
|
*/
|
|
simdjson_inline void append(uint64_t val, internal::tape_type t) noexcept;
|
|
|
|
/**
|
|
* Skip the current tape entry without writing.
|
|
*
|
|
* Used to skip the start of the container, since we'll come back later to fill it in when the
|
|
* container ends.
|
|
*/
|
|
simdjson_inline void skip() noexcept;
|
|
|
|
/**
|
|
* Skip the number of tape entries necessary to write a large u64 or i64.
|
|
*/
|
|
simdjson_inline void skip_large_integer() noexcept;
|
|
|
|
/**
|
|
* Skip the number of tape entries necessary to write a double.
|
|
*/
|
|
simdjson_inline void skip_double() noexcept;
|
|
|
|
/**
|
|
* Write a value to a known location on tape.
|
|
*
|
|
* Used to go back and write out the start of a container after the container ends.
|
|
*/
|
|
simdjson_inline static void write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept;
|
|
|
|
private:
|
|
/**
|
|
* Append both the tape entry, and a supplementary value following it. Used for types that need
|
|
* all 64 bits, such as double and uint64_t.
|
|
*/
|
|
template<typename T>
|
|
simdjson_inline void append2(uint64_t val, T val2, internal::tape_type t) noexcept;
|
|
}; // struct tape_writer
|
|
|
|
// Two tape slots: an INT64 type entry with a zero payload, then the raw value.
simdjson_inline void tape_writer::append_s64(int64_t value) noexcept {
  this->append2<int64_t>(0, value, internal::tape_type::INT64);
}
|
|
|
|
// Two tape slots: a UINT64 type entry with a zero payload, then the raw value.
// Expressed through append2() like the signed and double variants; the stored
// bytes are the same as writing `value` directly into the second slot.
simdjson_inline void tape_writer::append_u64(uint64_t value) noexcept {
  this->append2<uint64_t>(0, value, internal::tape_type::UINT64);
}
|
|
|
|
/** Write a double value to tape. */
|
|
simdjson_inline void tape_writer::append_double(double value) noexcept {
|
|
append2(0, value, internal::tape_type::DOUBLE);
|
|
}
|
|
|
|
// Leaves one tape slot unwritten and moves past it.
simdjson_inline void tape_writer::skip() noexcept {
  ++next_tape_loc;
}
|
|
|
|
// A large integer occupies two tape slots (type entry + value); skip both.
simdjson_inline void tape_writer::skip_large_integer() noexcept {
  next_tape_loc = next_tape_loc + 2;
}
|
|
|
|
// A double occupies two tape slots (type entry + value); skip both.
simdjson_inline void tape_writer::skip_double() noexcept {
  next_tape_loc = next_tape_loc + 2;
}
|
|
|
|
// Writes one tape entry: the type character goes into the top 8 bits and is
// OR-ed with `val`, then the cursor moves to the next slot.
simdjson_inline void tape_writer::append(uint64_t val, internal::tape_type t) noexcept {
  const uint64_t type_bits = (uint64_t(char(t))) << 56;
  *next_tape_loc = val | type_bits;
  ++next_tape_loc;
}
|
|
|
|
template<typename T>
|
|
simdjson_inline void tape_writer::append2(uint64_t val, T val2, internal::tape_type t) noexcept {
|
|
append(val, t);
|
|
static_assert(sizeof(val2) == sizeof(*next_tape_loc), "Type is not 64 bits!");
|
|
memcpy(next_tape_loc, &val2, sizeof(val2));
|
|
next_tape_loc++;
|
|
}
|
|
|
|
// Overwrites an existing tape slot with `val` combined with the type
// character in the top 8 bits. Does not touch the append cursor.
simdjson_inline void tape_writer::write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept {
  const uint64_t type_bits = (uint64_t(char(t))) << 56;
  tape_loc = val | type_bits;
}
|
|
|
|
} // namespace stage2
|
|
} // unnamed namespace
|
|
} // namespace icelake
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE2_TAPE_WRITER_H
|
|
/* end file generic/stage2/tape_writer.h for icelake */
|
|
/* including generic/stage2/logger.h for icelake: #include <generic/stage2/logger.h> */
|
|
/* begin file generic/stage2/logger.h for icelake */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE2_LOGGER_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE2_LOGGER_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/base.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
#include <cstring>
|
|
|
|
|
|
// This is for an internal-only stage 2 specific logger.
|
|
// Set LOG_ENABLED = true to log what stage 2 is doing!
|
|
namespace simdjson {
|
|
namespace icelake {
|
|
namespace {
|
|
namespace logger {
|
|
|
|
// Source of '-' characters for the table rule; printed with "%.*s" so only a
// prefix of it is ever used.
static constexpr const char * DASHES = "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------";

// Logging is compiled in only when SIMDJSON_VERBOSE_LOGGING is set.
#if SIMDJSON_VERBOSE_LOGGING
static constexpr const bool LOG_ENABLED = true;
#else
static constexpr const bool LOG_ENABLED = false;
#endif

// Column widths of the log table.
static constexpr const int LOG_EVENT_LEN = 20;
static constexpr const int LOG_BUFFER_LEN = 30;
static constexpr const int LOG_SMALL_BUFFER_LEN = 10;
static constexpr const int LOG_INDEX_LEN = 5;

static int log_depth; // Not threadsafe. Log only.
|
|
|
|
// Helper to turn unprintable or newline characters into spaces
|
|
static simdjson_inline char printable_char(char c) {
|
|
if (c >= 0x20) {
|
|
return c;
|
|
} else {
|
|
return ' ';
|
|
}
|
|
}
|
|
|
|
// Print the header and set up log_start
|
|
static simdjson_inline void log_start() {
|
|
if (LOG_ENABLED) {
|
|
log_depth = 0;
|
|
printf("\n");
|
|
printf("| %-*s | %-*s | %-*s | %-*s | Detail |\n", LOG_EVENT_LEN, "Event", LOG_BUFFER_LEN, "Buffer", LOG_SMALL_BUFFER_LEN, "Next", 5, "Next#");
|
|
printf("|%.*s|%.*s|%.*s|%.*s|--------|\n", LOG_EVENT_LEN+2, DASHES, LOG_BUFFER_LEN+2, DASHES, LOG_SMALL_BUFFER_LEN+2, DASHES, 5+2, DASHES);
|
|
}
|
|
}
|
|
|
|
// Prints `message` on its own line when logging is enabled.
simdjson_unused static simdjson_inline void log_string(const char *message) {
  if (!LOG_ENABLED) { return; }
  printf("%s\n", message);
}
|
|
|
|
// Logs a single line from the stage 2 DOM parser
|
|
template<typename S>
|
|
static simdjson_inline void log_line(S &structurals, const char *title_prefix, const char *title, const char *detail) {
|
|
if (LOG_ENABLED) {
|
|
printf("| %*s%s%-*s ", log_depth*2, "", title_prefix, LOG_EVENT_LEN - log_depth*2 - int(strlen(title_prefix)), title);
|
|
auto current_index = structurals.at_beginning() ? nullptr : structurals.next_structural-1;
|
|
auto next_index = structurals.next_structural;
|
|
auto current = current_index ? &structurals.buf[*current_index] : reinterpret_cast<const uint8_t*>(" ");
|
|
auto next = &structurals.buf[*next_index];
|
|
{
|
|
// Print the next N characters in the buffer.
|
|
printf("| ");
|
|
// Otherwise, print the characters starting from the buffer position.
|
|
// Print spaces for unprintable or newline characters.
|
|
for (int i=0;i<LOG_BUFFER_LEN;i++) {
|
|
printf("%c", printable_char(current[i]));
|
|
}
|
|
printf(" ");
|
|
// Print the next N characters in the buffer.
|
|
printf("| ");
|
|
// Otherwise, print the characters starting from the buffer position.
|
|
// Print spaces for unprintable or newline characters.
|
|
for (int i=0;i<LOG_SMALL_BUFFER_LEN;i++) {
|
|
printf("%c", printable_char(next[i]));
|
|
}
|
|
printf(" ");
|
|
}
|
|
if (current_index) {
|
|
printf("| %*u ", LOG_INDEX_LEN, *current_index);
|
|
} else {
|
|
printf("| %-*s ", LOG_INDEX_LEN, "");
|
|
}
|
|
// printf("| %*u ", LOG_INDEX_LEN, structurals.next_tape_index());
|
|
printf("| %-s ", detail);
|
|
printf("|\n");
|
|
}
|
|
}
|
|
|
|
} // namespace logger
|
|
} // unnamed namespace
|
|
} // namespace icelake
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE2_LOGGER_H
|
|
/* end file generic/stage2/logger.h for icelake */
|
|
|
|
// All other declarations
|
|
/* including generic/stage2/json_iterator.h for icelake: #include <generic/stage2/json_iterator.h> */
|
|
/* begin file generic/stage2/json_iterator.h for icelake */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE2_JSON_ITERATOR_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE2_JSON_ITERATOR_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/logger.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/generic/dom_parser_implementation.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace icelake {
|
|
namespace {
|
|
namespace stage2 {
|
|
|
|
class json_iterator {
|
|
public:
|
|
const uint8_t* const buf;
|
|
uint32_t *next_structural;
|
|
dom_parser_implementation &dom_parser;
|
|
uint32_t depth{0};
|
|
|
|
/**
|
|
* Walk the JSON document.
|
|
*
|
|
* The visitor receives callbacks when values are encountered. All callbacks pass the iterator as
|
|
* the first parameter; some callbacks have other parameters as well:
|
|
*
|
|
* - visit_document_start() - at the beginning.
|
|
* - visit_document_end() - at the end (if things were successful).
|
|
*
|
|
* - visit_array_start() - at the start `[` of a non-empty array.
|
|
* - visit_array_end() - at the end `]` of a non-empty array.
|
|
* - visit_empty_array() - when an empty array is encountered.
|
|
*
|
|
* - visit_object_start() - at the start `{` of a non-empty object.
|

* - visit_object_end() - at the end `}` of a non-empty object.
|
|
* - visit_empty_object() - when an empty object is encountered.
|
|
* - visit_key(const uint8_t *key) - when a key in an object field is encountered. key is
|
|
* guaranteed to point at the first quote of the string (`"key"`).
|
|
* - visit_primitive(const uint8_t *value) - when a value is a string, number, boolean or null.
|
|
* - visit_root_primitive(iter, uint8_t *value) - when the top-level value is a string, number, boolean or null.
|
|
*
|
|
* - increment_count(iter) - each time a value is found in an array or object.
|
|
*/
|
|
template<bool STREAMING, typename V>
|
|
simdjson_warn_unused simdjson_inline error_code walk_document(V &visitor) noexcept;
|
|
|
|
/**
|
|
* Create an iterator capable of walking a JSON document.
|
|
*
|
|
* The document must have already passed through stage 1.
|
|
*/
|
|
simdjson_inline json_iterator(dom_parser_implementation &_dom_parser, size_t start_structural_index);
|
|
|
|
/**
|
|
* Look at the next token.
|
|
*
|
|
* Tokens can be strings, numbers, booleans, null, or operators (`[{]},:`)).
|
|
*
|
|
* They may include invalid JSON as well (such as `1.2.3` or `ture`).
|
|
*/
|
|
simdjson_inline const uint8_t *peek() const noexcept;
|
|
/**
|
|
* Advance to the next token.
|
|
*
|
|
* Tokens can be strings, numbers, booleans, null, or operators (`[{]},:`)).
|
|
*
|
|
* They may include invalid JSON as well (such as `1.2.3` or `ture`).
|
|
*/
|
|
simdjson_inline const uint8_t *advance() noexcept;
|
|
/**
|
|
* Get the remaining length of the document, from the start of the current token.
|
|
*/
|
|
simdjson_inline size_t remaining_len() const noexcept;
|
|
/**
|
|
* Check if we are at the end of the document.
|
|
*
|
|
* If this is true, there are no more tokens.
|
|
*/
|
|
simdjson_inline bool at_eof() const noexcept;
|
|
/**
|
|
* Check if we are at the beginning of the document.
|
|
*/
|
|
simdjson_inline bool at_beginning() const noexcept;
|
|
simdjson_inline uint8_t last_structural() const noexcept;
|
|
|
|
/**
|
|
* Log that a value has been found.
|
|
*
|
|
* Set LOG_ENABLED=true in logger.h to see logging.
|
|
*/
|
|
simdjson_inline void log_value(const char *type) const noexcept;
|
|
/**
|
|
* Log the start of a multipart value.
|
|
*
|
|
* Set LOG_ENABLED=true in logger.h to see logging.
|
|
*/
|
|
simdjson_inline void log_start_value(const char *type) const noexcept;
|
|
/**
|
|
* Log the end of a multipart value.
|
|
*
|
|
* Set LOG_ENABLED=true in logger.h to see logging.
|
|
*/
|
|
simdjson_inline void log_end_value(const char *type) const noexcept;
|
|
/**
|
|
* Log an error.
|
|
*
|
|
* Set LOG_ENABLED=true in logger.h to see logging.
|
|
*/
|
|
simdjson_inline void log_error(const char *error) const noexcept;
|
|
|
|
template<typename V>
|
|
simdjson_warn_unused simdjson_inline error_code visit_root_primitive(V &visitor, const uint8_t *value) noexcept;
|
|
template<typename V>
|
|
simdjson_warn_unused simdjson_inline error_code visit_primitive(V &visitor, const uint8_t *value) noexcept;
|
|
};
|
|
|
|
template<bool STREAMING, typename V>
|
|
simdjson_warn_unused simdjson_inline error_code json_iterator::walk_document(V &visitor) noexcept {
|
|
logger::log_start();
|
|
|
|
//
|
|
// Start the document
|
|
//
|
|
if (at_eof()) { return EMPTY; }
|
|
log_start_value("document");
|
|
SIMDJSON_TRY( visitor.visit_document_start(*this) );
|
|
|
|
//
|
|
// Read first value
|
|
//
|
|
{
|
|
auto value = advance();
|
|
|
|
// Make sure the outer object or array is closed before continuing; otherwise, there are ways we
|
|
// could get into memory corruption. See https://github.com/simdjson/simdjson/issues/906
|
|
if (!STREAMING) {
|
|
switch (*value) {
|
|
case '{': if (last_structural() != '}') { log_value("starting brace unmatched"); return TAPE_ERROR; }; break;
|
|
case '[': if (last_structural() != ']') { log_value("starting bracket unmatched"); return TAPE_ERROR; }; break;
|
|
}
|
|
}
|
|
|
|
switch (*value) {
|
|
case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin;
|
|
case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin;
|
|
default: SIMDJSON_TRY( visitor.visit_root_primitive(*this, value) ); break;
|
|
}
|
|
}
|
|
goto document_end;
|
|
|
|
//
|
|
// Object parser states
|
|
//
|
|
object_begin:
|
|
log_start_value("object");
|
|
depth++;
|
|
if (depth >= dom_parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; }
|
|
dom_parser.is_array[depth] = false;
|
|
SIMDJSON_TRY( visitor.visit_object_start(*this) );
|
|
|
|
{
|
|
auto key = advance();
|
|
if (*key != '"') { log_error("Object does not start with a key"); return TAPE_ERROR; }
|
|
SIMDJSON_TRY( visitor.increment_count(*this) );
|
|
SIMDJSON_TRY( visitor.visit_key(*this, key) );
|
|
}
|
|
|
|
object_field:
|
|
if (simdjson_unlikely( *advance() != ':' )) { log_error("Missing colon after key in object"); return TAPE_ERROR; }
|
|
{
|
|
auto value = advance();
|
|
switch (*value) {
|
|
case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin;
|
|
case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin;
|
|
default: SIMDJSON_TRY( visitor.visit_primitive(*this, value) ); break;
|
|
}
|
|
}
|
|
|
|
object_continue:
|
|
switch (*advance()) {
|
|
case ',':
|
|
SIMDJSON_TRY( visitor.increment_count(*this) );
|
|
{
|
|
auto key = advance();
|
|
if (simdjson_unlikely( *key != '"' )) { log_error("Key string missing at beginning of field in object"); return TAPE_ERROR; }
|
|
SIMDJSON_TRY( visitor.visit_key(*this, key) );
|
|
}
|
|
goto object_field;
|
|
case '}': log_end_value("object"); SIMDJSON_TRY( visitor.visit_object_end(*this) ); goto scope_end;
|
|
default: log_error("No comma between object fields"); return TAPE_ERROR;
|
|
}
|
|
|
|
scope_end:
|
|
depth--;
|
|
if (depth == 0) { goto document_end; }
|
|
if (dom_parser.is_array[depth]) { goto array_continue; }
|
|
goto object_continue;
|
|
|
|
//
|
|
// Array parser states
|
|
//
|
|
array_begin:
|
|
log_start_value("array");
|
|
depth++;
|
|
if (depth >= dom_parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; }
|
|
dom_parser.is_array[depth] = true;
|
|
SIMDJSON_TRY( visitor.visit_array_start(*this) );
|
|
SIMDJSON_TRY( visitor.increment_count(*this) );
|
|
|
|
array_value:
|
|
{
|
|
auto value = advance();
|
|
switch (*value) {
|
|
case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin;
|
|
case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin;
|
|
default: SIMDJSON_TRY( visitor.visit_primitive(*this, value) ); break;
|
|
}
|
|
}
|
|
|
|
array_continue:
|
|
switch (*advance()) {
|
|
case ',': SIMDJSON_TRY( visitor.increment_count(*this) ); goto array_value;
|
|
case ']': log_end_value("array"); SIMDJSON_TRY( visitor.visit_array_end(*this) ); goto scope_end;
|
|
default: log_error("Missing comma between array values"); return TAPE_ERROR;
|
|
}
|
|
|
|
document_end:
|
|
log_end_value("document");
|
|
SIMDJSON_TRY( visitor.visit_document_end(*this) );
|
|
|
|
dom_parser.next_structural_index = uint32_t(next_structural - &dom_parser.structural_indexes[0]);
|
|
|
|
// If we didn't make it to the end, it's an error
|
|
if ( !STREAMING && dom_parser.next_structural_index != dom_parser.n_structural_indexes ) {
|
|
log_error("More than one JSON value at the root of the document, or extra characters at the end of the JSON!");
|
|
return TAPE_ERROR;
|
|
}
|
|
|
|
return SUCCESS;
|
|
|
|
} // walk_document()
|
|
|
|
// Bind the iterator to a parser and position it at the given structural index.
simdjson_inline json_iterator::json_iterator(dom_parser_implementation &_dom_parser, size_t start_structural_index)
  : buf(_dom_parser.buf),
    next_structural(_dom_parser.structural_indexes.get() + start_structural_index),
    dom_parser(_dom_parser)
{}
|
|
|
|
// Pointer to the next token in buf; the iterator position is left untouched.
simdjson_inline const uint8_t *json_iterator::peek() const noexcept {
  const uint32_t offset = *next_structural;
  return buf + offset;
}
|
|
// Pointer to the next token in buf; moves the iterator one structural forward.
simdjson_inline const uint8_t *json_iterator::advance() noexcept {
  const uint32_t offset = *next_structural;
  ++next_structural;
  return buf + offset;
}
|
|
// Bytes between the most recently consumed structural offset and dom_parser.len.
simdjson_inline size_t json_iterator::remaining_len() const noexcept {
  const uint32_t current_offset = next_structural[-1];
  return dom_parser.len - current_offset;
}
|
|
|
|
// True once next_structural has reached one past the last recorded structural index.
simdjson_inline bool json_iterator::at_eof() const noexcept {
  const uint32_t *end_of_structurals = &dom_parser.structural_indexes[dom_parser.n_structural_indexes];
  return next_structural == end_of_structurals;
}
|
|
// True while next_structural still points at the first structural index.
simdjson_inline bool json_iterator::at_beginning() const noexcept {
  const uint32_t *first_structural = dom_parser.structural_indexes.get();
  return next_structural == first_structural;
}
|
|
// Byte located at the final recorded structural index.
// NOTE(review): indexes with n_structural_indexes - 1, so this presumably requires at
// least one structural; walk_document() only calls it after at_eof() returned false.
simdjson_inline uint8_t json_iterator::last_structural() const noexcept {
  const uint32_t last_offset = dom_parser.structural_indexes[dom_parser.n_structural_indexes - 1];
  return buf[last_offset];
}
|
|
|
|
// Emit a log line for a single value (empty title prefix, no detail text).
simdjson_inline void json_iterator::log_value(const char *type) const noexcept {
  const char *const no_prefix = "";
  const char *const no_detail = "";
  logger::log_line(*this, no_prefix, type, no_detail);
}
|
|
|
|
// Emit a "+" log line for the start of a multipart value, then increase the logger
// depth (only when logging is enabled).
simdjson_inline void json_iterator::log_start_value(const char *type) const noexcept {
  logger::log_line(*this, "+", type, "");
  if (logger::LOG_ENABLED) {
    ++logger::log_depth;
  }
}
|
|
|
|
// Decrease the logger depth (only when logging is enabled), then emit a "-" log line
// for the end of a multipart value.
simdjson_inline void json_iterator::log_end_value(const char *type) const noexcept {
  if (logger::LOG_ENABLED) {
    --logger::log_depth;
  }
  logger::log_line(*this, "-", type, "");
}
|
|
|
|
// Emit an "ERROR" log line carrying the given message as its detail text.
simdjson_inline void json_iterator::log_error(const char *error) const noexcept {
  const char *const no_prefix = "";
  logger::log_line(*this, no_prefix, "ERROR", error);
}
|
|
|
|
template<typename V>
|
|
simdjson_warn_unused simdjson_inline error_code json_iterator::visit_root_primitive(V &visitor, const uint8_t *value) noexcept {
|
|
switch (*value) {
|
|
case '"': return visitor.visit_root_string(*this, value);
|
|
case 't': return visitor.visit_root_true_atom(*this, value);
|
|
case 'f': return visitor.visit_root_false_atom(*this, value);
|
|
case 'n': return visitor.visit_root_null_atom(*this, value);
|
|
case '-':
|
|
case '0': case '1': case '2': case '3': case '4':
|
|
case '5': case '6': case '7': case '8': case '9':
|
|
return visitor.visit_root_number(*this, value);
|
|
default:
|
|
log_error("Document starts with a non-value character");
|
|
return TAPE_ERROR;
|
|
}
|
|
}
|
|
template<typename V>
|
|
simdjson_warn_unused simdjson_inline error_code json_iterator::visit_primitive(V &visitor, const uint8_t *value) noexcept {
|
|
// Use the fact that most scalars are going to be either strings or numbers.
|
|
if(*value == '"') {
|
|
return visitor.visit_string(*this, value);
|
|
} else if (((*value - '0') < 10) || (*value == '-')) {
|
|
return visitor.visit_number(*this, value);
|
|
}
|
|
// true, false, null are uncommon.
|
|
switch (*value) {
|
|
case 't': return visitor.visit_true_atom(*this, value);
|
|
case 'f': return visitor.visit_false_atom(*this, value);
|
|
case 'n': return visitor.visit_null_atom(*this, value);
|
|
default:
|
|
log_error("Non-value found when value was expected!");
|
|
return TAPE_ERROR;
|
|
}
|
|
}
|
|
|
|
} // namespace stage2
|
|
} // unnamed namespace
|
|
} // namespace icelake
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE2_JSON_ITERATOR_H
|
|
/* end file generic/stage2/json_iterator.h for icelake */
|
|
/* including generic/stage2/stringparsing.h for icelake: #include <generic/stage2/stringparsing.h> */
|
|
/* begin file generic/stage2/stringparsing.h for icelake */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE2_STRINGPARSING_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE2_STRINGPARSING_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/generic/jsoncharutils.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
// This file contains the common code every implementation uses
|
|
// It is intended to be included multiple times and compiled multiple times
|
|
|
|
namespace simdjson {
|
|
namespace icelake {
|
|
namespace {
|
|
/// @private
|
|
namespace stringparsing {
|
|
|
|
// begin copypasta
// Lookup table for single-character escapes, indexed by the byte after the backslash.
// A zero entry means "not a valid simple escape".
//   "  \  /                -> themselves
//   b -> backspace, f -> formfeed, n -> newline, r -> carriage return, t -> horizontal tab
// 'u' is deliberately absent: \uXXXX needs dedicated handling.
static const uint8_t escape_map[256] = {
    0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0,    // 0x0.
    0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0,    // 0x1.
    0, 0, 0x22, 0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0x2f, // 0x2.  '"' and '/'
    0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0,    // 0x3.

    0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0,    // 0x4.
    0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0x5c, 0, 0,    0,    // 0x5.  '\\'
    0, 0, 0x08, 0, 0,    0, 0x0c, 0, 0, 0, 0, 0, 0,    0, 0x0a, 0,    // 0x6.  'b', 'f', 'n'
    0, 0, 0x0d, 0, 0x09, 0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0,    // 0x7.  'r', 't'

    0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0,    // 0x8.
    0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0,    // 0x9.
    0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0,    // 0xa.
    0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0,    // 0xb.

    0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0,    // 0xc.
    0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0,    // 0xd.
    0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0,    // 0xe.
    0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0,    // 0xf.
};
|
|
|
|
// handle a unicode codepoint
|
|
// write appropriate values into dest
|
|
// src will advance 6 bytes or 12 bytes
|
|
// dest will advance a variable amount (return via pointer)
|
|
// return true if the unicode codepoint was valid
|
|
// We work in little-endian then swap at write time
|
|
simdjson_warn_unused
|
|
simdjson_inline bool handle_unicode_codepoint(const uint8_t **src_ptr,
|
|
uint8_t **dst_ptr, bool allow_replacement) {
|
|
// Use the default Unicode Character 'REPLACEMENT CHARACTER' (U+FFFD)
|
|
constexpr uint32_t substitution_code_point = 0xfffd;
|
|
// jsoncharutils::hex_to_u32_nocheck fills high 16 bits of the return value with 1s if the
|
|
// conversion is not valid; we defer the check for this to inside the
|
|
// multilingual plane check.
|
|
uint32_t code_point = jsoncharutils::hex_to_u32_nocheck(*src_ptr + 2);
|
|
*src_ptr += 6;
|
|
|
|
// If we found a high surrogate, we must
|
|
// check for low surrogate for characters
|
|
// outside the Basic
|
|
// Multilingual Plane.
|
|
if (code_point >= 0xd800 && code_point < 0xdc00) {
|
|
const uint8_t *src_data = *src_ptr;
|
|
/* Compiler optimizations convert this to a single 16-bit load and compare on most platforms */
|
|
if (((src_data[0] << 8) | src_data[1]) != ((static_cast<uint8_t> ('\\') << 8) | static_cast<uint8_t> ('u'))) {
|
|
if(!allow_replacement) { return false; }
|
|
code_point = substitution_code_point;
|
|
} else {
|
|
uint32_t code_point_2 = jsoncharutils::hex_to_u32_nocheck(src_data + 2);
|
|
|
|
// We have already checked that the high surrogate is valid and
|
|
// (code_point - 0xd800) < 1024.
|
|
//
|
|
// Check that code_point_2 is in the range 0xdc00..0xdfff
|
|
// and that code_point_2 was parsed from valid hex.
|
|
uint32_t low_bit = code_point_2 - 0xdc00;
|
|
if (low_bit >> 10) {
|
|
if(!allow_replacement) { return false; }
|
|
code_point = substitution_code_point;
|
|
} else {
|
|
code_point = (((code_point - 0xd800) << 10) | low_bit) + 0x10000;
|
|
*src_ptr += 6;
|
|
}
|
|
|
|
}
|
|
} else if (code_point >= 0xdc00 && code_point <= 0xdfff) {
|
|
// If we encounter a low surrogate (not preceded by a high surrogate)
|
|
// then we have an error.
|
|
if(!allow_replacement) { return false; }
|
|
code_point = substitution_code_point;
|
|
}
|
|
size_t offset = jsoncharutils::codepoint_to_utf8(code_point, *dst_ptr);
|
|
*dst_ptr += offset;
|
|
return offset > 0;
|
|
}
|
|
|
|
|
|
// handle a unicode codepoint using the wobbly convention
|
|
// https://simonsapin.github.io/wtf-8/
|
|
// write appropriate values into dest
|
|
// src will advance 6 bytes or 12 bytes
|
|
// dest will advance a variable amount (return via pointer)
|
|
// return true if the unicode codepoint was valid
|
|
// We work in little-endian then swap at write time
|
|
simdjson_warn_unused
|
|
simdjson_inline bool handle_unicode_codepoint_wobbly(const uint8_t **src_ptr,
|
|
uint8_t **dst_ptr) {
|
|
// It is not ideal that this function is nearly identical to handle_unicode_codepoint.
|
|
//
|
|
// jsoncharutils::hex_to_u32_nocheck fills high 16 bits of the return value with 1s if the
|
|
// conversion is not valid; we defer the check for this to inside the
|
|
// multilingual plane check.
|
|
uint32_t code_point = jsoncharutils::hex_to_u32_nocheck(*src_ptr + 2);
|
|
*src_ptr += 6;
|
|
// If we found a high surrogate, we must
|
|
// check for low surrogate for characters
|
|
// outside the Basic
|
|
// Multilingual Plane.
|
|
if (code_point >= 0xd800 && code_point < 0xdc00) {
|
|
const uint8_t *src_data = *src_ptr;
|
|
/* Compiler optimizations convert this to a single 16-bit load and compare on most platforms */
|
|
if (((src_data[0] << 8) | src_data[1]) == ((static_cast<uint8_t> ('\\') << 8) | static_cast<uint8_t> ('u'))) {
|
|
uint32_t code_point_2 = jsoncharutils::hex_to_u32_nocheck(src_data + 2);
|
|
uint32_t low_bit = code_point_2 - 0xdc00;
|
|
if ((low_bit >> 10) == 0) {
|
|
code_point =
|
|
(((code_point - 0xd800) << 10) | low_bit) + 0x10000;
|
|
*src_ptr += 6;
|
|
}
|
|
}
|
|
}
|
|
|
|
size_t offset = jsoncharutils::codepoint_to_utf8(code_point, *dst_ptr);
|
|
*dst_ptr += offset;
|
|
return offset > 0;
|
|
}
|
|
|
|
|
|
/**
|
|
* Unescape a valid UTF-8 string from src to dst, stopping at a final unescaped quote. There
|
|
* must be an unescaped quote terminating the string. It returns the final output
|
|
* position as pointer. In case of error (e.g., the string has bad escaped codes),
|
|
* then null_nullptrptr is returned. It is assumed that the output buffer is large
|
|
* enough. E.g., if src points at 'joe"', then dst needs to have four free bytes +
|
|
* SIMDJSON_PADDING bytes.
|
|
*/
|
|
simdjson_warn_unused simdjson_inline uint8_t *parse_string(const uint8_t *src, uint8_t *dst, bool allow_replacement) {
|
|
while (1) {
|
|
// Copy the next n bytes, and find the backslash and quote in them.
|
|
auto bs_quote = backslash_and_quote::copy_and_find(src, dst);
|
|
// If the next thing is the end quote, copy and return
|
|
if (bs_quote.has_quote_first()) {
|
|
// we encountered quotes first. Move dst to point to quotes and exit
|
|
return dst + bs_quote.quote_index();
|
|
}
|
|
if (bs_quote.has_backslash()) {
|
|
/* find out where the backspace is */
|
|
auto bs_dist = bs_quote.backslash_index();
|
|
uint8_t escape_char = src[bs_dist + 1];
|
|
/* we encountered backslash first. Handle backslash */
|
|
if (escape_char == 'u') {
|
|
/* move src/dst up to the start; they will be further adjusted
|
|
within the unicode codepoint handling code. */
|
|
src += bs_dist;
|
|
dst += bs_dist;
|
|
if (!handle_unicode_codepoint(&src, &dst, allow_replacement)) {
|
|
return nullptr;
|
|
}
|
|
} else {
|
|
/* simple 1:1 conversion. Will eat bs_dist+2 characters in input and
|
|
* write bs_dist+1 characters to output
|
|
* note this may reach beyond the part of the buffer we've actually
|
|
* seen. I think this is ok */
|
|
uint8_t escape_result = escape_map[escape_char];
|
|
if (escape_result == 0u) {
|
|
return nullptr; /* bogus escape value is an error */
|
|
}
|
|
dst[bs_dist] = escape_result;
|
|
src += bs_dist + 2;
|
|
dst += bs_dist + 1;
|
|
}
|
|
} else {
|
|
/* they are the same. Since they can't co-occur, it means we
|
|
* encountered neither. */
|
|
src += backslash_and_quote::BYTES_PROCESSED;
|
|
dst += backslash_and_quote::BYTES_PROCESSED;
|
|
}
|
|
}
|
|
}
|
|
|
|
/**
 * Same contract as parse_string, except that \u escapes are decoded with
 * handle_unicode_codepoint_wobbly. Returns the final output position, or nullptr
 * when an escape sequence is invalid.
 */
simdjson_warn_unused simdjson_inline uint8_t *parse_wobbly_string(const uint8_t *src, uint8_t *dst) {
  // It is not ideal that this function is nearly identical to parse_string.
  for (;;) {
    // Copy the next chunk of bytes and locate any backslash / quote inside it.
    auto chunk = backslash_and_quote::copy_and_find(src, dst);

    // A quote before any backslash terminates the string: report where it landed in dst.
    if (chunk.has_quote_first()) {
      return dst + chunk.quote_index();
    }

    // Neither a quote nor a backslash in this chunk: the bytes were copied verbatim.
    if (!chunk.has_backslash()) {
      src += backslash_and_quote::BYTES_PROCESSED;
      dst += backslash_and_quote::BYTES_PROCESSED;
      continue;
    }

    // A backslash came first: find out where it is and what follows it.
    const auto backslash_pos = chunk.backslash_index();
    const uint8_t escape_char = src[backslash_pos + 1];

    if (escape_char == 'u') {
      // Move src/dst up to the backslash; the code point handler adjusts them further.
      src += backslash_pos;
      dst += backslash_pos;
      if (!handle_unicode_codepoint_wobbly(&src, &dst)) {
        return nullptr;
      }
      continue;
    }

    // Simple 1:1 escape: consumes backslash_pos + 2 input bytes and produces
    // backslash_pos + 1 output bytes. Note that this may reach beyond the part of the
    // buffer we have actually seen (the original authors considered this acceptable).
    const uint8_t unescaped = escape_map[escape_char];
    if (unescaped == 0u) {
      return nullptr; /* bogus escape value is an error */
    }
    dst[backslash_pos] = unescaped;
    src += backslash_pos + 2;
    dst += backslash_pos + 1;
  }
}
|
|
|
|
} // namespace stringparsing
|
|
} // unnamed namespace
|
|
} // namespace icelake
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE2_STRINGPARSING_H
|
|
/* end file generic/stage2/stringparsing.h for icelake */
|
|
/* including generic/stage2/structural_iterator.h for icelake: #include <generic/stage2/structural_iterator.h> */
|
|
/* begin file generic/stage2/structural_iterator.h for icelake */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE2_STRUCTURAL_ITERATOR_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE2_STRUCTURAL_ITERATOR_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/generic/dom_parser_implementation.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace icelake {
|
|
namespace {
|
|
namespace stage2 {
|
|
|
|
class structural_iterator {
|
|
public:
|
|
const uint8_t* const buf;
|
|
uint32_t *next_structural;
|
|
dom_parser_implementation &dom_parser;
|
|
|
|
// Start a structural
|
|
simdjson_inline structural_iterator(dom_parser_implementation &_dom_parser, size_t start_structural_index)
|
|
: buf{_dom_parser.buf},
|
|
next_structural{&_dom_parser.structural_indexes[start_structural_index]},
|
|
dom_parser{_dom_parser} {
|
|
}
|
|
// Get the buffer position of the current structural character
|
|
simdjson_inline const uint8_t* current() {
|
|
return &buf[*(next_structural-1)];
|
|
}
|
|
// Get the current structural character
|
|
simdjson_inline char current_char() {
|
|
return buf[*(next_structural-1)];
|
|
}
|
|
// Get the next structural character without advancing
|
|
simdjson_inline char peek_next_char() {
|
|
return buf[*next_structural];
|
|
}
|
|
simdjson_inline const uint8_t* peek() {
|
|
return &buf[*next_structural];
|
|
}
|
|
simdjson_inline const uint8_t* advance() {
|
|
return &buf[*(next_structural++)];
|
|
}
|
|
simdjson_inline char advance_char() {
|
|
return buf[*(next_structural++)];
|
|
}
|
|
simdjson_inline size_t remaining_len() {
|
|
return dom_parser.len - *(next_structural-1);
|
|
}
|
|
|
|
simdjson_inline bool at_end() {
|
|
return next_structural == &dom_parser.structural_indexes[dom_parser.n_structural_indexes];
|
|
}
|
|
simdjson_inline bool at_beginning() {
|
|
return next_structural == dom_parser.structural_indexes.get();
|
|
}
|
|
};
|
|
|
|
} // namespace stage2
|
|
} // unnamed namespace
|
|
} // namespace icelake
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE2_STRUCTURAL_ITERATOR_H
|
|
/* end file generic/stage2/structural_iterator.h for icelake */
|
|
/* including generic/stage2/tape_builder.h for icelake: #include <generic/stage2/tape_builder.h> */
|
|
/* begin file generic/stage2/tape_builder.h for icelake */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE2_TAPE_BUILDER_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE2_TAPE_BUILDER_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/json_iterator.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/stringparsing.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/tape_writer.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/dom/document.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/generic/atomparsing.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/generic/dom_parser_implementation.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/generic/numberparsing.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
|
|
namespace simdjson {
|
|
namespace icelake {
|
|
namespace {
|
|
namespace stage2 {
|
|
|
|
struct tape_builder {
|
|
template<bool STREAMING>
|
|
simdjson_warn_unused static simdjson_inline error_code parse_document(
|
|
dom_parser_implementation &dom_parser,
|
|
dom::document &doc) noexcept;
|
|
|
|
/** Called when a non-empty document starts. */
|
|
simdjson_warn_unused simdjson_inline error_code visit_document_start(json_iterator &iter) noexcept;
|
|
/** Called when a non-empty document ends without error. */
|
|
simdjson_warn_unused simdjson_inline error_code visit_document_end(json_iterator &iter) noexcept;
|
|
|
|
/** Called when a non-empty array starts. */
|
|
simdjson_warn_unused simdjson_inline error_code visit_array_start(json_iterator &iter) noexcept;
|
|
/** Called when a non-empty array ends. */
|
|
simdjson_warn_unused simdjson_inline error_code visit_array_end(json_iterator &iter) noexcept;
|
|
/** Called when an empty array is found. */
|
|
simdjson_warn_unused simdjson_inline error_code visit_empty_array(json_iterator &iter) noexcept;
|
|
|
|
/** Called when a non-empty object starts. */
|
|
simdjson_warn_unused simdjson_inline error_code visit_object_start(json_iterator &iter) noexcept;
|
|
/**
|
|
* Called when a key in a field is encountered.
|
|
*
|
|
* primitive, visit_object_start, visit_empty_object, visit_array_start, or visit_empty_array
|
|
* will be called after this with the field value.
|
|
*/
|
|
simdjson_warn_unused simdjson_inline error_code visit_key(json_iterator &iter, const uint8_t *key) noexcept;
|
|
/** Called when a non-empty object ends. */
|
|
simdjson_warn_unused simdjson_inline error_code visit_object_end(json_iterator &iter) noexcept;
|
|
/** Called when an empty object is found. */
|
|
simdjson_warn_unused simdjson_inline error_code visit_empty_object(json_iterator &iter) noexcept;
|
|
|
|
/**
|
|
* Called when a string, number, boolean or null is found.
|
|
*/
|
|
simdjson_warn_unused simdjson_inline error_code visit_primitive(json_iterator &iter, const uint8_t *value) noexcept;
|
|
/**
|
|
* Called when a string, number, boolean or null is found at the top level of a document (i.e.
|
|
* when there is no array or object and the entire document is a single string, number, boolean or
|
|
* null.
|
|
*
|
|
* This is separate from primitive() because simdjson's normal primitive parsing routines assume
|
|
* there is at least one more token after the value, which is only true in an array or object.
|
|
*/
|
|
simdjson_warn_unused simdjson_inline error_code visit_root_primitive(json_iterator &iter, const uint8_t *value) noexcept;
|
|
|
|
simdjson_warn_unused simdjson_inline error_code visit_string(json_iterator &iter, const uint8_t *value, bool key = false) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code visit_number(json_iterator &iter, const uint8_t *value) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code visit_true_atom(json_iterator &iter, const uint8_t *value) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code visit_false_atom(json_iterator &iter, const uint8_t *value) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code visit_null_atom(json_iterator &iter, const uint8_t *value) noexcept;
|
|
|
|
simdjson_warn_unused simdjson_inline error_code visit_root_string(json_iterator &iter, const uint8_t *value) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code visit_root_number(json_iterator &iter, const uint8_t *value) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code visit_root_true_atom(json_iterator &iter, const uint8_t *value) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code visit_root_false_atom(json_iterator &iter, const uint8_t *value) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code visit_root_null_atom(json_iterator &iter, const uint8_t *value) noexcept;
|
|
|
|
/** Called each time a new field or element in an array or object is found. */
|
|
simdjson_warn_unused simdjson_inline error_code increment_count(json_iterator &iter) noexcept;
|
|
|
|
/** Next location to write to tape */
|
|
tape_writer tape;
|
|
private:
|
|
/** Next write location in the string buf for stage 2 parsing */
|
|
uint8_t *current_string_buf_loc;
|
|
|
|
simdjson_inline tape_builder(dom::document &doc) noexcept;
|
|
|
|
simdjson_inline uint32_t next_tape_index(json_iterator &iter) const noexcept;
|
|
simdjson_inline void start_container(json_iterator &iter) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code end_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code empty_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept;
|
|
simdjson_inline uint8_t *on_start_string(json_iterator &iter) noexcept;
|
|
simdjson_inline void on_end_string(uint8_t *dst) noexcept;
|
|
}; // struct tape_builder
|
|
|
|
template<bool STREAMING>
|
|
simdjson_warn_unused simdjson_inline error_code tape_builder::parse_document(
|
|
dom_parser_implementation &dom_parser,
|
|
dom::document &doc) noexcept {
|
|
dom_parser.doc = &doc;
|
|
json_iterator iter(dom_parser, STREAMING ? dom_parser.next_structural_index : 0);
|
|
tape_builder builder(doc);
|
|
return iter.walk_document<STREAMING>(builder);
|
|
}
|
|
|
|
/**
 * Visit a primitive that is the document root.
 * Delegates to json_iterator::visit_root_primitive, passing this builder as the visitor.
 */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_primitive(json_iterator &iter, const uint8_t *value) noexcept {
  tape_builder &self = *this;
  return iter.visit_root_primitive(self, value);
}
|
|
/**
 * Visit a primitive nested inside a container.
 * Delegates to json_iterator::visit_primitive, passing this builder as the visitor.
 */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_primitive(json_iterator &iter, const uint8_t *value) noexcept {
  tape_builder &self = *this;
  return iter.visit_primitive(self, value);
}
|
|
/** Record an empty object as a START_OBJECT / END_OBJECT pair via empty_container(). */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_empty_object(json_iterator &iter) noexcept {
  constexpr internal::tape_type opener = internal::tape_type::START_OBJECT;
  constexpr internal::tape_type closer = internal::tape_type::END_OBJECT;
  return empty_container(iter, opener, closer);
}
|
|
/** Record an empty array as a START_ARRAY / END_ARRAY pair via empty_container(). */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_empty_array(json_iterator &iter) noexcept {
  constexpr internal::tape_type opener = internal::tape_type::START_ARRAY;
  constexpr internal::tape_type closer = internal::tape_type::END_ARRAY;
  return empty_container(iter, opener, closer);
}
|
|
|
|
/**
 * Called when the document begins: opens a container slot for the root.
 * Always returns SUCCESS.
 */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_document_start(json_iterator &iter) noexcept {
  // The root is tracked like any other container; its tape word is filled in
  // by visit_document_end().
  start_container(iter);
  return SUCCESS;
}
|
|
/**
 * Called when a non-empty object begins: opens a container slot for it.
 * Always returns SUCCESS.
 */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_object_start(json_iterator &iter) noexcept {
  // The start word is only reserved here; end_container() writes it later.
  start_container(iter);
  return SUCCESS;
}
|
|
/**
 * Called when a non-empty array begins: opens a container slot for it.
 * Always returns SUCCESS.
 */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_array_start(json_iterator &iter) noexcept {
  // The start word is only reserved here; end_container() writes it later.
  start_container(iter);
  return SUCCESS;
}
|
|
|
|
/** Close the object at the current depth, writing its END_OBJECT and START_OBJECT tape words. */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_object_end(json_iterator &iter) noexcept {
  constexpr internal::tape_type opener = internal::tape_type::START_OBJECT;
  constexpr internal::tape_type closer = internal::tape_type::END_OBJECT;
  return end_container(iter, opener, closer);
}
|
|
/** Close the array at the current depth, writing its END_ARRAY and START_ARRAY tape words. */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_array_end(json_iterator &iter) noexcept {
  constexpr internal::tape_type opener = internal::tape_type::START_ARRAY;
  constexpr internal::tape_type closer = internal::tape_type::END_ARRAY;
  return end_container(iter, opener, closer);
}
|
|
/**
 * Called when the document ends: writes the two ROOT tape words.
 *
 * A trailing ROOT word carrying index 0 is appended, then tape slot 0 is
 * overwritten with a ROOT word carrying the next free tape index.
 * Always returns SUCCESS.
 */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_document_end(json_iterator &iter) noexcept {
  constexpr uint32_t root_slot = 0;

  // Trailing ROOT word, pointing back at the first tape slot.
  tape.append(root_slot, internal::tape_type::ROOT);

  // Leading ROOT word, pointing just past the trailing one.
  const uint32_t past_the_end = next_tape_index(iter);
  auto &first_word = iter.dom_parser.doc->tape[root_slot];
  tape_writer::write(first_word, past_the_end, internal::tape_type::ROOT);
  return SUCCESS;
}
|
|
/** Visit an object key: handled exactly like a string, with the `key` flag set. */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_key(json_iterator &iter, const uint8_t *key) noexcept {
  constexpr bool is_key = true;
  return visit_string(iter, key, is_key);
}
|
|
|
|
/**
 * Bump the element counter of the open container at the iterator's current depth.
 * Always returns SUCCESS.
 */
simdjson_warn_unused simdjson_inline error_code tape_builder::increment_count(json_iterator &iter) noexcept {
  auto &container = iter.dom_parser.open_containers[iter.depth];
  container.count++; // one more field/element seen in this container
  return SUCCESS;
}
|
|
|
|
/**
 * Start writing at the beginning of the document's tape and string buffer.
 */
simdjson_inline tape_builder::tape_builder(dom::document &doc) noexcept
  : tape{doc.tape.get()},
    current_string_buf_loc{doc.string_buf.get()} {
}
|
|
|
|
/**
 * Unescape a JSON string into the string buffer and record it on the tape.
 *
 * @param iter  iterator used for logging and to reach the document.
 * @param value points at the string in the input; parsing starts at value+1.
 * @param key   only affects the label used for logging ("key" vs "string").
 * @return STRING_ERROR when the string parser returns nullptr, SUCCESS otherwise.
 */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_string(json_iterator &iter, const uint8_t *value, bool key) noexcept {
  const char *label = key ? "key" : "string";
  iter.log_value(label);

  uint8_t *const write_begin = on_start_string(iter);
  // We do not allow replacement when the escape characters are invalid.
  uint8_t *const write_end = stringparsing::parse_string(value + 1, write_begin, false);
  if (write_end == nullptr) {
    iter.log_error("Invalid escape in string");
    return STRING_ERROR;
  }

  on_end_string(write_end);
  return SUCCESS;
}
|
|
|
|
/** A string at the document root is handled by visit_string() with its default `key` argument. */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_string(json_iterator &iter, const uint8_t *value) noexcept {
  const error_code status = visit_string(iter, value);
  return status;
}
|
|
|
|
/**
 * Visit a number: logs it, then hands the text and the tape writer to
 * numberparsing::parse_number and returns its result.
 */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_number(json_iterator &iter, const uint8_t *value) noexcept {
  iter.log_value("number");
  const error_code status = numberparsing::parse_number(value, tape);
  return status;
}
|
|
|
|
/**
 * Visit a number that is the entire document.
 *
 * The remaining input is copied into a temporary buffer followed by
 * SIMDJSON_PADDING space characters, and the copy is parsed instead.
 *
 * @return MEMALLOC if the temporary buffer cannot be allocated, otherwise
 *         the result of visit_number() on the copy.
 */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_number(json_iterator &iter, const uint8_t *value) noexcept {
  //
  // We need to make a copy to make sure that the string is space terminated.
  // This is not about padding the input, which should already be padded up
  // to len + SIMDJSON_PADDING. However, we have no control at this stage
  // over how the padding was done. What if the input string was padded with nulls?
  // It is quite common for an input string to have an extra null character (C string).
  // We do not want to allow 9\0 (where \0 is the null character) inside a JSON
  // document, but the string "9\0" by itself is fine. So we make a copy and
  // pad the input with spaces when we know that there is just one input element.
  // This copy is relatively expensive, but it will almost never be made in
  // practice unless you are in the strange scenario where you have many JSON
  // documents made of single atoms.
  //
  const size_t tail_len = iter.remaining_len();
  std::unique_ptr<uint8_t[]> padded(new (std::nothrow) uint8_t[tail_len + SIMDJSON_PADDING]);
  uint8_t *const scratch = padded.get();
  if (scratch == nullptr) { return MEMALLOC; }

  std::memcpy(scratch, value, tail_len);
  std::memset(scratch + tail_len, ' ', SIMDJSON_PADDING);
  return visit_number(iter, scratch);
}
|
|
|
|
/**
 * Visit a `true` atom inside a container.
 * @return T_ATOM_ERROR when atomparsing rejects the text, SUCCESS after appending TRUE_VALUE.
 */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_true_atom(json_iterator &iter, const uint8_t *value) noexcept {
  iter.log_value("true");
  const bool well_formed = atomparsing::is_valid_true_atom(value);
  if (!well_formed) {
    return T_ATOM_ERROR;
  }
  tape.append(0, internal::tape_type::TRUE_VALUE);
  return SUCCESS;
}
|
|
|
|
/**
 * Visit a `true` atom that is the document root.
 * Uses the length-aware validator, passing the remaining input length.
 * @return T_ATOM_ERROR when atomparsing rejects the text, SUCCESS after appending TRUE_VALUE.
 */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_true_atom(json_iterator &iter, const uint8_t *value) noexcept {
  iter.log_value("true");
  const bool well_formed = atomparsing::is_valid_true_atom(value, iter.remaining_len());
  if (!well_formed) {
    return T_ATOM_ERROR;
  }
  tape.append(0, internal::tape_type::TRUE_VALUE);
  return SUCCESS;
}
|
|
|
|
/**
 * Visit a `false` atom inside a container.
 * @return F_ATOM_ERROR when atomparsing rejects the text, SUCCESS after appending FALSE_VALUE.
 */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_false_atom(json_iterator &iter, const uint8_t *value) noexcept {
  iter.log_value("false");
  const bool well_formed = atomparsing::is_valid_false_atom(value);
  if (!well_formed) {
    return F_ATOM_ERROR;
  }
  tape.append(0, internal::tape_type::FALSE_VALUE);
  return SUCCESS;
}
|
|
|
|
/**
 * Visit a `false` atom that is the document root.
 * Uses the length-aware validator, passing the remaining input length.
 * @return F_ATOM_ERROR when atomparsing rejects the text, SUCCESS after appending FALSE_VALUE.
 */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_false_atom(json_iterator &iter, const uint8_t *value) noexcept {
  iter.log_value("false");
  const bool well_formed = atomparsing::is_valid_false_atom(value, iter.remaining_len());
  if (!well_formed) {
    return F_ATOM_ERROR;
  }
  tape.append(0, internal::tape_type::FALSE_VALUE);
  return SUCCESS;
}
|
|
|
|
/**
 * Visit a `null` atom inside a container.
 * @return N_ATOM_ERROR when atomparsing rejects the text, SUCCESS after appending NULL_VALUE.
 */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_null_atom(json_iterator &iter, const uint8_t *value) noexcept {
  iter.log_value("null");
  const bool well_formed = atomparsing::is_valid_null_atom(value);
  if (!well_formed) {
    return N_ATOM_ERROR;
  }
  tape.append(0, internal::tape_type::NULL_VALUE);
  return SUCCESS;
}
|
|
|
|
/**
 * Visit a `null` atom that is the document root.
 * Uses the length-aware validator, passing the remaining input length.
 * @return N_ATOM_ERROR when atomparsing rejects the text, SUCCESS after appending NULL_VALUE.
 */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_null_atom(json_iterator &iter, const uint8_t *value) noexcept {
  iter.log_value("null");
  const bool well_formed = atomparsing::is_valid_null_atom(value, iter.remaining_len());
  if (!well_formed) {
    return N_ATOM_ERROR;
  }
  tape.append(0, internal::tape_type::NULL_VALUE);
  return SUCCESS;
}
|
|
|
|
// private:
|
|
|
|
/**
 * Index of the next tape slot to be written, measured from the start of the
 * document's tape and truncated to 32 bits.
 */
simdjson_inline uint32_t tape_builder::next_tape_index(json_iterator &iter) const noexcept {
  const uint64_t *const tape_begin = iter.dom_parser.doc->tape.get();
  const auto slots_written = tape.next_tape_loc - tape_begin;
  return uint32_t(slots_written);
}
|
|
|
|
/**
 * Append the two tape words of an empty container.
 *
 * The `start` word carries the index just past the pair (current index + 2);
 * the `end` word carries the index of the `start` word. Always returns SUCCESS.
 */
simdjson_warn_unused simdjson_inline error_code tape_builder::empty_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept {
  const uint32_t open_index = next_tape_index(iter);
  const uint32_t past_close_index = open_index + 2;
  tape.append(past_close_index, start);
  tape.append(open_index, end);
  return SUCCESS;
}
|
|
|
|
/**
 * Open a container at the iterator's current depth: remember where its start
 * word belongs, reset its element count, and reserve that tape slot.
 */
simdjson_inline void tape_builder::start_container(json_iterator &iter) noexcept {
  auto &container = iter.dom_parser.open_containers[iter.depth];
  container.tape_index = next_tape_index(iter);
  container.count = 0;
  // We don't actually *write* the start element until the end.
  tape.skip();
}
|
|
|
|
/**
 * Close the container at the iterator's current depth.
 *
 * Appends the `end` word (pointing at the container's start slot) and then
 * fills in the reserved start slot with the index following the end word,
 * combined with the saturated element count shifted left by 32 bits.
 * Always returns SUCCESS.
 */
simdjson_warn_unused simdjson_inline error_code tape_builder::end_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept {
  const auto &container = iter.dom_parser.open_containers[iter.depth];

  // Ending tape element, pointing back at the start location.
  const uint32_t open_index = container.tape_index;
  tape.append(open_index, end);

  // The count is stored in 24 bits, so it saturates: by convention a stored
  // value of 0xFFFFFF means "0xFFFFFF or more" (undetermined).
  uint32_t saturated_count = container.count;
  if (saturated_count > 0xFFFFFF) {
    saturated_count = 0xFFFFFF;
  }

  // Starting tape element, pointing at the end location and carrying the count.
  const uint64_t payload = next_tape_index(iter) | (uint64_t(saturated_count) << 32);
  tape_writer::write(iter.dom_parser.doc->tape[open_index], payload, start);
  return SUCCESS;
}
|
|
|
|
/**
 * Begin a string: append a STRING tape word holding the current offset into
 * the document's string buffer.
 *
 * @return the address where the string's bytes should be written, i.e. the
 *         current string-buffer location plus sizeof(uint32_t), leaving room
 *         for the length that on_end_string() stores there.
 */
simdjson_inline uint8_t *tape_builder::on_start_string(json_iterator &iter) noexcept {
  const uint8_t *const buffer_begin = iter.dom_parser.doc->string_buf.get();
  const auto offset = current_string_buf_loc - buffer_begin;
  tape.append(offset, internal::tape_type::STRING);
  return current_string_buf_loc + sizeof(uint32_t);
}
|
|
|
|
/**
 * Finish a string whose bytes end at `dst`: store its 32-bit length in front
 * of the bytes, write a terminating zero at `dst`, and move the write cursor
 * to the byte after that terminator.
 */
simdjson_inline void tape_builder::on_end_string(uint8_t *dst) noexcept {
  uint8_t *const payload_begin = current_string_buf_loc + sizeof(uint32_t);
  // TODO check for overflow in case someone has a crazy string (>=4GB?)
  // But only add the overflow check when the document itself exceeds 4GB
  // Currently unneeded because we refuse to parse docs larger or equal to 4GB.
  const uint32_t str_length = uint32_t(dst - payload_begin);
  memcpy(current_string_buf_loc, &str_length, sizeof(uint32_t));

  // NULL termination is still handy if you expect all your strings to
  // be NULL terminated. It comes at a small cost.
  *dst = 0;
  current_string_buf_loc = dst + 1;
}
|
|
|
|
} // namespace stage2
|
|
} // unnamed namespace
|
|
} // namespace icelake
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE2_TAPE_BUILDER_H
|
|
/* end file generic/stage2/tape_builder.h for icelake */
|
|
/* end file generic/stage2/amalgamated.h for icelake */
|
|
|
|
#undef SIMDJSON_GENERIC_JSON_STRUCTURAL_INDEXER_CUSTOM_BIT_INDEXER
|
|
|
|
//
|
|
// Stage 1
|
|
//
|
|
|
|
namespace simdjson {
|
|
namespace icelake {
|
|
|
|
/**
 * Allocate an icelake dom_parser_implementation into `dst` and size it.
 *
 * @param capacity  forwarded to set_capacity().
 * @param max_depth forwarded to set_max_depth().
 * @param dst       receives the new parser (reset even when sizing fails).
 * @return MEMALLOC if allocation fails, the first error reported by
 *         set_capacity()/set_max_depth(), or SUCCESS.
 */
simdjson_warn_unused error_code implementation::create_dom_parser_implementation(
  size_t capacity,
  size_t max_depth,
  std::unique_ptr<internal::dom_parser_implementation>& dst
) const noexcept {
  dst.reset( new (std::nothrow) dom_parser_implementation() );
  if (!dst) {
    return MEMALLOC;
  }

  const auto capacity_status = dst->set_capacity(capacity);
  if (capacity_status) {
    return capacity_status;
  }

  const auto depth_status = dst->set_max_depth(max_depth);
  if (depth_status) {
    return depth_status;
  }

  return SUCCESS;
}
|
|
|
|
namespace {
|
|
|
|
using namespace simd;
|
|
|
|
// This identifies structural characters (comma, colon, braces, brackets),
|
|
// and ASCII white-space ('\r','\n','\t',' ').
|
|
simdjson_inline json_character_block json_character_block::classify(const simd::simd8x64<uint8_t>& in) {
|
|
// These lookups rely on the fact that anything < 127 will match the lower 4 bits, which is why
|
|
// we can't use the generic lookup_16.
|
|
const auto whitespace_table = simd8<uint8_t>::repeat_16(' ', 100, 100, 100, 17, 100, 113, 2, 100, '\t', '\n', 112, 100, '\r', 100, 100);
|
|
|
|
// The 6 operators (:,[]{}) have these values:
|
|
//
|
|
// , 2C
|
|
// : 3A
|
|
// [ 5B
|
|
// { 7B
|
|
// ] 5D
|
|
// } 7D
|
|
//
|
|
// If you use | 0x20 to turn [ and ] into { and }, the lower 4 bits of each character is unique.
|
|
// We exploit this, using a simd 4-bit lookup to tell us which character match against, and then
|
|
// match it (against | 0x20).
|
|
//
|
|
// To prevent recognizing other characters, everything else gets compared with 0, which cannot
|
|
// match due to the | 0x20.
|
|
//
|
|
// NOTE: Due to the | 0x20, this ALSO treats <FF> and <SUB> (control characters 0C and 1A) like ,
|
|
// and :. This gets caught in stage 2, which checks the actual character to ensure the right
|
|
// operators are in the right places.
|
|
const auto op_table = simd8<uint8_t>::repeat_16(
|
|
0, 0, 0, 0,
|
|
0, 0, 0, 0,
|
|
0, 0, ':', '{', // : = 3A, [ = 5B, { = 7B
|
|
',', '}', 0, 0 // , = 2C, ] = 5D, } = 7D
|
|
);
|
|
|
|
// We compute whitespace and op separately. If later code only uses one or the
|
|
// other, given the fact that all functions are aggressively inlined, we can
|
|
// hope that useless computations will be omitted. This is namely case when
|
|
// minifying (we only need whitespace).
|
|
|
|
const uint64_t whitespace = in.eq({
|
|
_mm512_shuffle_epi8(whitespace_table, in.chunks[0])
|
|
});
|
|
// Turn [ and ] into { and }
|
|
const simd8x64<uint8_t> curlified{
|
|
in.chunks[0] | 0x20
|
|
};
|
|
const uint64_t op = curlified.eq({
|
|
_mm512_shuffle_epi8(op_table, in.chunks[0])
|
|
});
|
|
|
|
return { whitespace, op };
|
|
}
|
|
|
|
// Reports whether the block is ASCII: the chunks are OR-reduced into one
// vector and that vector's is_ascii() result is returned.
simdjson_inline bool is_ascii(const simd8x64<uint8_t>& input) {
  auto combined = input.reduce_or();
  return combined.is_ascii();
}
|
|
|
|
// For each byte lane, reports whether the byte 1, 2 or 3 positions earlier is a
// UTF-8 lead byte that requires this lane to be a continuation byte.
simdjson_unused simdjson_inline simd8<bool> must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
  // Saturating subtraction leaves a non-zero value only for bytes at or above the threshold.
  simd8<uint8_t> after_2plus_lead = prev1.saturating_sub(0xc0u-1); // Only 11______ will be > 0
  simd8<uint8_t> after_3plus_lead = prev2.saturating_sub(0xe0u-1); // Only 111_____ will be > 0
  simd8<uint8_t> after_4_lead     = prev3.saturating_sub(0xf0u-1); // Only 1111____ will be > 0
  // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine.
  return simd8<int8_t>(after_2plus_lead | after_3plus_lead | after_4_lead) > int8_t(0);
}
|
|
|
|
// For each byte lane, yields a value with the high bit (0x80) set when the byte two
// positions earlier starts a 3+ byte sequence or the byte three positions earlier
// starts a 4 byte sequence.
simdjson_inline simd8<uint8_t> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
  simd8<uint8_t> after_3plus_lead = prev2.saturating_sub(0xe0u-0x80); // Only 111_____ will be >= 0x80
  simd8<uint8_t> after_4_lead     = prev3.saturating_sub(0xf0u-0x80); // Only 1111____ will be >= 0x80
  return after_3plus_lead | after_4_lead;
}
|
|
|
|
} // unnamed namespace
|
|
} // namespace icelake
|
|
} // namespace simdjson
|
|
|
|
/**
|
|
* We provide a custom version of bit_indexer::write using
|
|
* naked intrinsics.
|
|
* TODO: make this code more elegant.
|
|
*/
|
|
// Under GCC 12, the intrinsic _mm512_extracti32x4_epi32 may generate 'maybe uninitialized' warnings;
|
|
// as a workaround, we disable warnings within the following function.
|
|
SIMDJSON_PUSH_DISABLE_ALL_WARNINGS
|
|
namespace simdjson { namespace icelake { namespace { namespace stage1 {
|
|
// Appends idx + (position of each set bit in `bits`) to `tail`, then advances
// `tail` by the number of set bits.
//
// Stores are done 16 indexes (one 512-bit vector) at a time, so each store may
// write past the last valid index; only `tail += count` determines how many
// entries are kept. NOTE(review): this presumably relies on the destination
// having slack after `tail` — confirm against the allocation of the index array.
simdjson_inline void bit_indexer::write(uint32_t idx, uint64_t bits) {
  // In some instances, the next branch is expensive because it is mispredicted.
  // Unfortunately, in other cases,
  // it helps tremendously.
  if (bits == 0) { return; }

  // Byte lane k holds the value k (0x00 .. 0x3f).
  const __m512i lane_numbers = _mm512_set_epi32(
    0x3f3e3d3c, 0x3b3a3938, 0x37363534, 0x33323130,
    0x2f2e2d2c, 0x2b2a2928, 0x27262524, 0x23222120,
    0x1f1e1d1c, 0x1b1a1918, 0x17161514, 0x13121110,
    0x0f0e0d0c, 0x0b0a0908, 0x07060504, 0x03020100
  );
  // Keep only the lane numbers selected by `bits`, packed to the front.
  const __m512i packed_positions = _mm512_maskz_compress_epi8(bits, lane_numbers);
  const __m512i base_index = _mm512_set1_epi32(idx);

  const auto set_bit_count = count_ones(bits);

  // First 16 positions: widen bytes to 32 bits, add the base, store.
  const __m512i group0 = _mm512_cvtepu8_epi32(_mm512_castsi512_si128(packed_positions));
  _mm512_storeu_si512(this->tail, _mm512_add_epi32(group0, base_index));

  if (set_bit_count > 16) {
    const __m512i group1 = _mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(packed_positions, 1));
    _mm512_storeu_si512(this->tail + 16, _mm512_add_epi32(group1, base_index));
    if (set_bit_count > 32) {
      const __m512i group2 = _mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(packed_positions, 2));
      _mm512_storeu_si512(this->tail + 32, _mm512_add_epi32(group2, base_index));
      if (set_bit_count > 48) {
        const __m512i group3 = _mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(packed_positions, 3));
        _mm512_storeu_si512(this->tail + 48, _mm512_add_epi32(group3, base_index));
      }
    }
  }
  this->tail += set_bit_count;
}
|
|
}}}}
|
|
SIMDJSON_POP_DISABLE_WARNINGS
|
|
|
|
//
|
|
// Stage 2
|
|
//
|
|
|
|
//
|
|
// Implementation-specific overrides
|
|
//
|
|
namespace simdjson {
|
|
namespace icelake {
|
|
|
|
/** Minify `len` bytes of JSON from `buf` into `dst` using the icelake stage 1 minifier with a step size of 128. */
simdjson_warn_unused error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept {
  const error_code status = icelake::stage1::json_minifier::minify<128>(buf, len, dst, dst_len);
  return status;
}
|
|
|
|
/**
 * Run stage 1: remember the input buffer and its length on the parser, then
 * build the structural index with a step size of 128.
 */
simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, stage1_mode streaming) noexcept {
  this->len = _len;
  this->buf = _buf;
  const error_code status = icelake::stage1::json_structural_indexer::index<128>(_buf, _len, *this, streaming);
  return status;
}
|
|
|
|
/** Validate `len` bytes at `buf` as UTF-8 using the icelake generic validator. */
simdjson_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
  const bool is_valid = icelake::stage1::generic_validate_utf8(buf,len);
  return is_valid;
}
|
|
|
|
/** Run stage 2 over the whole input (non-streaming tape_builder::parse_document). */
simdjson_warn_unused error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept {
  constexpr bool streaming = false;
  return stage2::tape_builder::parse_document<streaming>(*this, _doc);
}
|
|
|
|
/** Run stage 2 for the next document of a stream (streaming tape_builder::parse_document). */
simdjson_warn_unused error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept {
  constexpr bool streaming = true;
  return stage2::tape_builder::parse_document<streaming>(*this, _doc);
}
|
|
|
|
/** Forward to the icelake string parser; returns whatever pointer it returns. */
simdjson_warn_unused uint8_t *dom_parser_implementation::parse_string(const uint8_t *src, uint8_t *dst, bool replacement_char) const noexcept {
  uint8_t *const end_of_output = icelake::stringparsing::parse_string(src, dst, replacement_char);
  return end_of_output;
}
|
|
|
|
/** Forward to the icelake "wobbly" string parser; returns whatever pointer it returns. */
simdjson_warn_unused uint8_t *dom_parser_implementation::parse_wobbly_string(const uint8_t *src, uint8_t *dst) const noexcept {
  uint8_t *const end_of_output = icelake::stringparsing::parse_wobbly_string(src, dst);
  return end_of_output;
}
|
|
|
|
/**
 * Parse a whole document: stage 1 in regular mode, then stage 2.
 * Stage 2 is skipped and the stage 1 error returned when stage 1 fails.
 */
simdjson_warn_unused error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept {
  const auto stage1_status = stage1(_buf, _len, stage1_mode::regular);
  if (!stage1_status) {
    return stage2(_doc);
  }
  return stage1_status;
}
|
|
|
|
} // namespace icelake
|
|
} // namespace simdjson
|
|
|
|
/* including simdjson/icelake/end.h: #include <simdjson/icelake/end.h> */
|
|
/* begin file simdjson/icelake/end.h */
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/icelake/base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
#if !SIMDJSON_CAN_ALWAYS_RUN_ICELAKE
|
|
SIMDJSON_UNTARGET_REGION
|
|
#endif
|
|
|
|
/* undefining SIMDJSON_IMPLEMENTATION from "icelake" */
|
|
#undef SIMDJSON_IMPLEMENTATION
|
|
/* end file simdjson/icelake/end.h */
|
|
|
|
#endif // SIMDJSON_SRC_ICELAKE_CPP
|
|
/* end file icelake.cpp */
|
|
#endif
|
|
#if SIMDJSON_IMPLEMENTATION_PPC64
|
|
/* including ppc64.cpp: #include <ppc64.cpp> */
|
|
/* begin file ppc64.cpp */
|
|
#ifndef SIMDJSON_SRC_PPC64_CPP
|
|
#define SIMDJSON_SRC_PPC64_CPP
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include <base.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
/* including simdjson/ppc64.h: #include <simdjson/ppc64.h> */
|
|
/* begin file simdjson/ppc64.h */
|
|
#ifndef SIMDJSON_PPC64_H
|
|
#define SIMDJSON_PPC64_H
|
|
|
|
/* including simdjson/ppc64/begin.h: #include "simdjson/ppc64/begin.h" */
|
|
/* begin file simdjson/ppc64/begin.h */
|
|
/* defining SIMDJSON_IMPLEMENTATION to "ppc64" */
|
|
#define SIMDJSON_IMPLEMENTATION ppc64
|
|
/* including simdjson/ppc64/base.h: #include "simdjson/ppc64/base.h" */
|
|
/* begin file simdjson/ppc64/base.h */
|
|
#ifndef SIMDJSON_PPC64_BASE_H
|
|
#define SIMDJSON_PPC64_BASE_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
/**
|
|
* Implementation for ALTIVEC (PPC64).
|
|
*/
|
|
namespace ppc64 {
|
|
|
|
class implementation;
|
|
|
|
namespace {
|
|
namespace simd {
|
|
template <typename T> struct simd8;
|
|
template <typename T> struct simd8x64;
|
|
} // namespace simd
|
|
} // unnamed namespace
|
|
|
|
} // namespace ppc64
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_PPC64_BASE_H
|
|
/* end file simdjson/ppc64/base.h */
|
|
/* including simdjson/ppc64/intrinsics.h: #include "simdjson/ppc64/intrinsics.h" */
|
|
/* begin file simdjson/ppc64/intrinsics.h */
|
|
#ifndef SIMDJSON_PPC64_INTRINSICS_H
|
|
#define SIMDJSON_PPC64_INTRINSICS_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/ppc64/base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
// This should be the correct header whether
|
|
// you use visual studio or other compilers.
|
|
#include <altivec.h>
|
|
|
|
// These are defined by altivec.h in GCC toolchain, it is safe to undef them.
|
|
#ifdef bool
|
|
#undef bool
|
|
#endif
|
|
|
|
#ifdef vector
|
|
#undef vector
|
|
#endif
|
|
|
|
static_assert(sizeof(__vector unsigned char) <= simdjson::SIMDJSON_PADDING, "insufficient padding for ppc64");
|
|
|
|
#endif // SIMDJSON_PPC64_INTRINSICS_H
|
|
/* end file simdjson/ppc64/intrinsics.h */
|
|
/* including simdjson/ppc64/bitmanipulation.h: #include "simdjson/ppc64/bitmanipulation.h" */
|
|
/* begin file simdjson/ppc64/bitmanipulation.h */
|
|
#ifndef SIMDJSON_PPC64_BITMANIPULATION_H
|
|
#define SIMDJSON_PPC64_BITMANIPULATION_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/ppc64/base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace ppc64 {
|
|
namespace {
|
|
|
|
// We sometimes call trailing_zero on inputs that are zero,
|
|
// but the algorithms do not end up using the returned value.
|
|
// Sadly, sanitizers are not smart enough to figure it out.
|
|
SIMDJSON_NO_SANITIZE_UNDEFINED
|
|
// This function can be used safely even if not all bytes have been
|
|
// initialized.
|
|
// See issue https://github.com/simdjson/simdjson/issues/1965
|
|
SIMDJSON_NO_SANITIZE_MEMORY
|
|
simdjson_inline int trailing_zeroes(uint64_t input_num) {
|
|
#if SIMDJSON_REGULAR_VISUAL_STUDIO
|
|
unsigned long ret;
|
|
// Search the mask data from least significant bit (LSB)
|
|
// to the most significant bit (MSB) for a set bit (1).
|
|
_BitScanForward64(&ret, input_num);
|
|
return (int)ret;
|
|
#else // SIMDJSON_REGULAR_VISUAL_STUDIO
|
|
return __builtin_ctzll(input_num);
|
|
#endif // SIMDJSON_REGULAR_VISUAL_STUDIO
|
|
}
|
|
|
|
/* result might be undefined when input_num is zero */
|
|
simdjson_inline uint64_t clear_lowest_bit(uint64_t input_num) {
|
|
return input_num & (input_num - 1);
|
|
}
|
|
|
|
/* result might be undefined when input_num is zero */
|
|
simdjson_inline int leading_zeroes(uint64_t input_num) {
|
|
#if SIMDJSON_REGULAR_VISUAL_STUDIO
|
|
unsigned long leading_zero = 0;
|
|
// Search the mask data from most significant bit (MSB)
|
|
// to least significant bit (LSB) for a set bit (1).
|
|
if (_BitScanReverse64(&leading_zero, input_num))
|
|
return (int)(63 - leading_zero);
|
|
else
|
|
return 64;
|
|
#else
|
|
return __builtin_clzll(input_num);
|
|
#endif // SIMDJSON_REGULAR_VISUAL_STUDIO
|
|
}
|
|
|
|
#if SIMDJSON_REGULAR_VISUAL_STUDIO
|
|
simdjson_inline int count_ones(uint64_t input_num) {
|
|
// note: we do not support legacy 32-bit Windows in this kernel
|
|
return __popcnt64(input_num); // Visual Studio wants two underscores
|
|
}
|
|
#else
|
|
simdjson_inline int count_ones(uint64_t input_num) {
|
|
return __builtin_popcountll(input_num);
|
|
}
|
|
#endif
|
|
|
|
/*
 * Stores value1 + value2 (wrapping) in *result and returns true when the
 * addition overflowed 64 bits.
 */
simdjson_inline bool add_overflow(uint64_t value1, uint64_t value2,
                                  uint64_t *result) {
#if SIMDJSON_REGULAR_VISUAL_STUDIO
  const uint64_t wrapped_sum = value1 + value2;
  *result = wrapped_sum;
  // An unsigned sum that wrapped around is smaller than either operand.
  return wrapped_sum < value1;
#else
  return __builtin_uaddll_overflow(value1, value2,
                                   reinterpret_cast<unsigned long long *>(result));
#endif
}
|
|
|
|
} // unnamed namespace
|
|
} // namespace ppc64
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_PPC64_BITMANIPULATION_H
|
|
/* end file simdjson/ppc64/bitmanipulation.h */
|
|
/* including simdjson/ppc64/bitmask.h: #include "simdjson/ppc64/bitmask.h" */
|
|
/* begin file simdjson/ppc64/bitmask.h */
|
|
#ifndef SIMDJSON_PPC64_BITMASK_H
|
|
#define SIMDJSON_PPC64_BITMASK_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/ppc64/base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace ppc64 {
|
|
namespace {
|
|
|
|
//
|
|
// Perform a "cumulative bitwise xor," flipping bits each time a 1 is
|
|
// encountered.
|
|
//
|
|
// For example, prefix_xor(00100100) == 00011100
|
|
//
|
|
simdjson_inline uint64_t prefix_xor(uint64_t bitmask) {
|
|
// You can use the version below, however gcc sometimes miscompiles
|
|
// vec_pmsum_be, it happens somewhere around between 8 and 9th version.
|
|
// The performance boost was not noticeable, falling back to a usual
|
|
// implementation.
|
|
// __vector unsigned long long all_ones = {~0ull, ~0ull};
|
|
// __vector unsigned long long mask = {bitmask, 0};
|
|
// // Clang and GCC return different values for pmsum for ull so cast it to one.
|
|
// // Generally it is not specified by ALTIVEC ISA what is returned by
|
|
// // vec_pmsum_be.
|
|
// #if defined(__LITTLE_ENDIAN__)
|
|
// return (uint64_t)(((__vector unsigned long long)vec_pmsum_be(all_ones, mask))[0]);
|
|
// #else
|
|
// return (uint64_t)(((__vector unsigned long long)vec_pmsum_be(all_ones, mask))[1]);
|
|
// #endif
|
|
bitmask ^= bitmask << 1;
|
|
bitmask ^= bitmask << 2;
|
|
bitmask ^= bitmask << 4;
|
|
bitmask ^= bitmask << 8;
|
|
bitmask ^= bitmask << 16;
|
|
bitmask ^= bitmask << 32;
|
|
return bitmask;
|
|
}
|
|
|
|
} // unnamed namespace
|
|
} // namespace ppc64
|
|
} // namespace simdjson
|
|
|
|
#endif
|
|
/* end file simdjson/ppc64/bitmask.h */
|
|
/* including simdjson/ppc64/numberparsing_defs.h: #include "simdjson/ppc64/numberparsing_defs.h" */
|
|
/* begin file simdjson/ppc64/numberparsing_defs.h */
|
|
#ifndef SIMDJSON_PPC64_NUMBERPARSING_DEFS_H
|
|
#define SIMDJSON_PPC64_NUMBERPARSING_DEFS_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/ppc64/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/ppc64/intrinsics.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/numberparsing_tables.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
#include <cstring>
|
|
|
|
#if defined(__linux__)
|
|
#include <byteswap.h>
|
|
#elif defined(__FreeBSD__)
|
|
#include <sys/endian.h>
|
|
#endif
|
|
|
|
namespace simdjson {
|
|
namespace ppc64 {
|
|
namespace numberparsing {
|
|
|
|
// we don't have appropriate instructions, so let us use a scalar function
|
|
// credit: https://johnnylee-sde.github.io/Fast-numeric-string-to-int/
|
|
/** @private */
|
|
static simdjson_inline uint32_t parse_eight_digits_unrolled(const uint8_t *chars) {
|
|
uint64_t val;
|
|
std::memcpy(&val, chars, sizeof(uint64_t));
|
|
#ifdef __BIG_ENDIAN__
|
|
#if defined(__linux__)
|
|
val = bswap_64(val);
|
|
#elif defined(__FreeBSD__)
|
|
val = bswap64(val);
|
|
#endif
|
|
#endif
|
|
val = (val & 0x0F0F0F0F0F0F0F0F) * 2561 >> 8;
|
|
val = (val & 0x00FF00FF00FF00FF) * 6553601 >> 16;
|
|
return uint32_t((val & 0x0000FFFF0000FFFF) * 42949672960001 >> 32);
|
|
}
|
|
|
|
/** @private */
|
|
simdjson_inline internal::value128 full_multiplication(uint64_t value1, uint64_t value2) {
|
|
internal::value128 answer;
|
|
#if SIMDJSON_REGULAR_VISUAL_STUDIO || SIMDJSON_IS_32BITS
|
|
#if SIMDJSON_IS_ARM64
|
|
// ARM64 has native support for 64-bit multiplications, no need to emultate
|
|
answer.high = __umulh(value1, value2);
|
|
answer.low = value1 * value2;
|
|
#else
|
|
answer.low = _umul128(value1, value2, &answer.high); // _umul128 not available on ARM64
|
|
#endif // SIMDJSON_IS_ARM64
|
|
#else // SIMDJSON_REGULAR_VISUAL_STUDIO || SIMDJSON_IS_32BITS
|
|
__uint128_t r = (static_cast<__uint128_t>(value1)) * value2;
|
|
answer.low = uint64_t(r);
|
|
answer.high = uint64_t(r >> 64);
|
|
#endif
|
|
return answer;
|
|
}
|
|
|
|
} // namespace numberparsing
|
|
} // namespace ppc64
|
|
} // namespace simdjson
|
|
|
|
#define SIMDJSON_SWAR_NUMBER_PARSING 1
|
|
|
|
#endif // SIMDJSON_PPC64_NUMBERPARSING_DEFS_H
|
|
/* end file simdjson/ppc64/numberparsing_defs.h */
|
|
/* including simdjson/ppc64/simd.h: #include "simdjson/ppc64/simd.h" */
|
|
/* begin file simdjson/ppc64/simd.h */
|
|
#ifndef SIMDJSON_PPC64_SIMD_H
|
|
#define SIMDJSON_PPC64_SIMD_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/ppc64/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/ppc64/bitmanipulation.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/simdprune_tables.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
#include <type_traits>
|
|
|
|
namespace simdjson {
|
|
namespace ppc64 {
|
|
namespace {
|
|
namespace simd {
|
|
|
|
using __m128i = __vector unsigned char;
|
|
|
|
template <typename Child> struct base {
|
|
__m128i value;
|
|
|
|
// Zero constructor
|
|
simdjson_inline base() : value{__m128i()} {}
|
|
|
|
// Conversion from SIMD register
|
|
simdjson_inline base(const __m128i _value) : value(_value) {}
|
|
|
|
// Conversion to SIMD register
|
|
simdjson_inline operator const __m128i &() const {
|
|
return this->value;
|
|
}
|
|
simdjson_inline operator __m128i &() { return this->value; }
|
|
|
|
// Bit operations
|
|
simdjson_inline Child operator|(const Child other) const {
|
|
return vec_or(this->value, (__m128i)other);
|
|
}
|
|
simdjson_inline Child operator&(const Child other) const {
|
|
return vec_and(this->value, (__m128i)other);
|
|
}
|
|
simdjson_inline Child operator^(const Child other) const {
|
|
return vec_xor(this->value, (__m128i)other);
|
|
}
|
|
simdjson_inline Child bit_andnot(const Child other) const {
|
|
return vec_andc(this->value, (__m128i)other);
|
|
}
|
|
simdjson_inline Child &operator|=(const Child other) {
|
|
auto this_cast = static_cast<Child*>(this);
|
|
*this_cast = *this_cast | other;
|
|
return *this_cast;
|
|
}
|
|
simdjson_inline Child &operator&=(const Child other) {
|
|
auto this_cast = static_cast<Child*>(this);
|
|
*this_cast = *this_cast & other;
|
|
return *this_cast;
|
|
}
|
|
simdjson_inline Child &operator^=(const Child other) {
|
|
auto this_cast = static_cast<Child*>(this);
|
|
*this_cast = *this_cast ^ other;
|
|
return *this_cast;
|
|
}
|
|
};
|
|
|
|
template <typename T, typename Mask = simd8<bool>>
|
|
struct base8 : base<simd8<T>> {
|
|
typedef uint16_t bitmask_t;
|
|
typedef uint32_t bitmask2_t;
|
|
|
|
simdjson_inline base8() : base<simd8<T>>() {}
|
|
simdjson_inline base8(const __m128i _value) : base<simd8<T>>(_value) {}
|
|
|
|
friend simdjson_inline Mask operator==(const simd8<T> lhs, const simd8<T> rhs) {
|
|
return (__m128i)vec_cmpeq(lhs.value, (__m128i)rhs);
|
|
}
|
|
|
|
static const int SIZE = sizeof(base<simd8<T>>::value);
|
|
|
|
template <int N = 1>
|
|
simdjson_inline simd8<T> prev(simd8<T> prev_chunk) const {
|
|
__m128i chunk = this->value;
|
|
#ifdef __LITTLE_ENDIAN__
|
|
chunk = (__m128i)vec_reve(this->value);
|
|
prev_chunk = (__m128i)vec_reve((__m128i)prev_chunk);
|
|
#endif
|
|
chunk = (__m128i)vec_sld((__m128i)prev_chunk, (__m128i)chunk, 16 - N);
|
|
#ifdef __LITTLE_ENDIAN__
|
|
chunk = (__m128i)vec_reve((__m128i)chunk);
|
|
#endif
|
|
return chunk;
|
|
}
|
|
};
|
|
|
|
// SIMD byte mask type (returned by things like eq and gt)
|
|
template <> struct simd8<bool> : base8<bool> {
|
|
static simdjson_inline simd8<bool> splat(bool _value) {
|
|
return (__m128i)vec_splats((unsigned char)(-(!!_value)));
|
|
}
|
|
|
|
simdjson_inline simd8() : base8<bool>() {}
|
|
simdjson_inline simd8(const __m128i _value)
|
|
: base8<bool>(_value) {}
|
|
// Splat constructor
|
|
simdjson_inline simd8(bool _value)
|
|
: base8<bool>(splat(_value)) {}
|
|
|
|
simdjson_inline int to_bitmask() const {
|
|
__vector unsigned long long result;
|
|
const __m128i perm_mask = {0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40,
|
|
0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00};
|
|
|
|
result = ((__vector unsigned long long)vec_vbpermq((__m128i)this->value,
|
|
(__m128i)perm_mask));
|
|
#ifdef __LITTLE_ENDIAN__
|
|
return static_cast<int>(result[1]);
|
|
#else
|
|
return static_cast<int>(result[0]);
|
|
#endif
|
|
}
|
|
simdjson_inline bool any() const {
|
|
return !vec_all_eq(this->value, (__m128i)vec_splats(0));
|
|
}
|
|
simdjson_inline simd8<bool> operator~() const {
|
|
return this->value ^ (__m128i)splat(true);
|
|
}
|
|
};
|
|
|
|
// Operations shared by the signed and unsigned byte vectors: broadcast,
// load/store, add/subtract, table lookup and mask-driven compression.
template <typename T> struct base8_numeric : base8<T> {
  // Broadcast one scalar into every lane.
  static simdjson_inline simd8<T> splat(T value) {
    // NOTE(review): value is used on the next line, so this cast looks
    // redundant; presumably a warning workaround -- confirm before removing.
    (void)value;
    return (__m128i)vec_splats(value);
  }

  // Vector with every lane set to zero.
  static simdjson_inline simd8<T> zero() { return splat(0); }

  // Load 16 elements starting at values.
  static simdjson_inline simd8<T> load(const T values[16]) {
    const uint8_t *bytes = reinterpret_cast<const uint8_t *>(values);
    return (__m128i)(vec_vsx_ld(0, bytes));
  }

  // Repeat 16 values as many times as necessary (usually for lookup tables).
  // A register holds exactly 16 bytes here, so this is a plain
  // member-by-member construction.
  static simdjson_inline simd8<T> repeat_16(T v0, T v1, T v2, T v3, T v4,
                                            T v5, T v6, T v7, T v8, T v9,
                                            T v10, T v11, T v12, T v13,
                                            T v14, T v15) {
    return simd8<T>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13,
                    v14, v15);
  }

  simdjson_inline base8_numeric() : base8<T>() {}
  simdjson_inline base8_numeric(const __m128i _value) : base8<T>(_value) {}

  // Store the 16 lanes to dst.
  simdjson_inline void store(T dst[16]) const {
    __m128i *target = reinterpret_cast<__m128i *>(dst);
    vec_vsx_st(this->value, 0, target);
  }

  // Bitwise complement; declared here so the numeric types do not pick up the
  // bool-mask version.
  simdjson_inline simd8<T> operator~() const { return *this ^ 0xFFu; }

  // Addition/subtraction are the same for signed and unsigned lanes.
  simdjson_inline simd8<T> operator+(const simd8<T> other) const {
    const __m128i rhs = (__m128i)other;
    return (__m128i)((__m128i)this->value + rhs);
  }
  simdjson_inline simd8<T> operator-(const simd8<T> other) const {
    const __m128i rhs = (__m128i)other;
    return (__m128i)((__m128i)this->value - rhs);
  }
  simdjson_inline simd8<T> &operator+=(const simd8<T> other) {
    simd8<T> &self = *static_cast<simd8<T> *>(this);
    self = self + other;
    return self;
  }
  simdjson_inline simd8<T> &operator-=(const simd8<T> other) {
    simd8<T> &self = *static_cast<simd8<T> *>(this);
    self = self - other;
    return self;
  }

  // Uses each lane of this vector as an index into lookup_table (vec_perm).
  // Lanes are expected to hold values below 16; behavior for out-of-range
  // values is undefined.
  template <typename L>
  simdjson_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
    const __m128i table = (__m128i)lookup_table;
    return (__m128i)vec_perm(table, table, this->value);
  }

  // Writes to output every byte whose bit in mask is 0 (mask is read as a
  // bitset). A mask of 0 therefore copies every byte. Only the first
  // 16 - count_ones(mask) bytes of the result are meaningful, yet 16 bytes are
  // always written. A signature returning simd8<L> might look more natural,
  // but the AVX ISA makes that approach difficult.
  template <typename L>
  simdjson_inline void compress(uint16_t mask, L *output) const {
    using internal::BitsSetTable256mul2;
    using internal::pshufb_combine_table;
    using internal::thintable_epi8;
    // This implementation was inspired by work done by @animetosho.
    // The 16 lanes are handled as two independent groups of 8 bytes.
    const uint8_t low_mask = uint8_t(mask);       // least significant 8 bits
    const uint8_t high_mask = uint8_t(mask >> 8); // most significant 8 bits
    // Place the two 64-bit table entries thintable_epi8[low_mask] and
    // thintable_epi8[high_mask] side by side in one register.
#ifdef __LITTLE_ENDIAN__
    __m128i shuffle = (__m128i)(__vector unsigned long long){
        thintable_epi8[low_mask], thintable_epi8[high_mask]};
#else
    __m128i shuffle = (__m128i)(__vector unsigned long long){
        thintable_epi8[high_mask], thintable_epi8[low_mask]};
    shuffle = (__m128i)vec_reve((__m128i)shuffle);
#endif
    // Add 0x08 to the indices of the second half so they address bytes 8..15.
    shuffle = ((__m128i)shuffle) +
              ((__m128i)(__vector int){0, 0, 0x08080808, 0x08080808});

    // "Nearly pruned": each half is compacted on its own.
    const __m128i pruned = vec_perm(this->value, this->value, shuffle);
    // The two halves still have to be joined. The table entry for the low
    // mask selects the combine mask, which keeps the leading bytes of the
    // first half, appends the bytes of the second half and pads the rest.
    const int pop1 = BitsSetTable256mul2[low_mask];
    const __m128i compactmask = vec_vsx_ld(
        0, reinterpret_cast<const uint8_t *>(pshufb_combine_table + pop1 * 8));
    const __m128i answer = vec_perm(pruned, (__m128i)vec_splats(0), compactmask);
    vec_vsx_st(answer, 0, reinterpret_cast<__m128i *>(output));
  }

  // Convenience overload: builds the table from 16 scalars and looks it up.
  template <typename L>
  simdjson_inline simd8<L>
  lookup_16(L replace0, L replace1, L replace2, L replace3, L replace4,
            L replace5, L replace6, L replace7, L replace8, L replace9,
            L replace10, L replace11, L replace12, L replace13, L replace14,
            L replace15) const {
    const simd8<L> table = simd8<L>::repeat_16(
        replace0, replace1, replace2, replace3, replace4, replace5, replace6,
        replace7, replace8, replace9, replace10, replace11, replace12,
        replace13, replace14, replace15);
    return lookup_16(table);
  }
};
|
|
|
|
// Signed bytes
|
|
template <> struct simd8<int8_t> : base8_numeric<int8_t> {
|
|
simdjson_inline simd8() : base8_numeric<int8_t>() {}
|
|
simdjson_inline simd8(const __m128i _value)
|
|
: base8_numeric<int8_t>(_value) {}
|
|
// Splat constructor
|
|
simdjson_inline simd8(int8_t _value) : simd8(splat(_value)) {}
|
|
// Array constructor
|
|
simdjson_inline simd8(const int8_t *values) : simd8(load(values)) {}
|
|
// Member-by-member initialization
|
|
simdjson_inline simd8(int8_t v0, int8_t v1, int8_t v2, int8_t v3,
|
|
int8_t v4, int8_t v5, int8_t v6, int8_t v7,
|
|
int8_t v8, int8_t v9, int8_t v10, int8_t v11,
|
|
int8_t v12, int8_t v13, int8_t v14, int8_t v15)
|
|
: simd8((__m128i)(__vector signed char){v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10, v11, v12, v13, v14,
|
|
v15}) {}
|
|
// Repeat 16 values as many times as necessary (usually for lookup tables)
|
|
simdjson_inline static simd8<int8_t>
|
|
repeat_16(int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5,
|
|
int8_t v6, int8_t v7, int8_t v8, int8_t v9, int8_t v10, int8_t v11,
|
|
int8_t v12, int8_t v13, int8_t v14, int8_t v15) {
|
|
return simd8<int8_t>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
|
|
v13, v14, v15);
|
|
}
|
|
|
|
// Order-sensitive comparisons
|
|
simdjson_inline simd8<int8_t>
|
|
max_val(const simd8<int8_t> other) const {
|
|
return (__m128i)vec_max((__vector signed char)this->value,
|
|
(__vector signed char)(__m128i)other);
|
|
}
|
|
simdjson_inline simd8<int8_t>
|
|
min_val(const simd8<int8_t> other) const {
|
|
return (__m128i)vec_min((__vector signed char)this->value,
|
|
(__vector signed char)(__m128i)other);
|
|
}
|
|
simdjson_inline simd8<bool>
|
|
operator>(const simd8<int8_t> other) const {
|
|
return (__m128i)vec_cmpgt((__vector signed char)this->value,
|
|
(__vector signed char)(__m128i)other);
|
|
}
|
|
simdjson_inline simd8<bool>
|
|
operator<(const simd8<int8_t> other) const {
|
|
return (__m128i)vec_cmplt((__vector signed char)this->value,
|
|
(__vector signed char)(__m128i)other);
|
|
}
|
|
};
|
|
|
|
// Unsigned bytes
|
|
template <> struct simd8<uint8_t> : base8_numeric<uint8_t> {
|
|
simdjson_inline simd8() : base8_numeric<uint8_t>() {}
|
|
simdjson_inline simd8(const __m128i _value)
|
|
: base8_numeric<uint8_t>(_value) {}
|
|
// Splat constructor
|
|
simdjson_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
|
|
// Array constructor
|
|
simdjson_inline simd8(const uint8_t *values) : simd8(load(values)) {}
|
|
// Member-by-member initialization
|
|
simdjson_inline
|
|
simd8(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5,
|
|
uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9, uint8_t v10,
|
|
uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15)
|
|
: simd8((__m128i){v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
|
|
v13, v14, v15}) {}
|
|
// Repeat 16 values as many times as necessary (usually for lookup tables)
|
|
simdjson_inline static simd8<uint8_t>
|
|
repeat_16(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4,
|
|
uint8_t v5, uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9,
|
|
uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14,
|
|
uint8_t v15) {
|
|
return simd8<uint8_t>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
|
|
v13, v14, v15);
|
|
}
|
|
|
|
// Saturated math
|
|
simdjson_inline simd8<uint8_t>
|
|
saturating_add(const simd8<uint8_t> other) const {
|
|
return (__m128i)vec_adds(this->value, (__m128i)other);
|
|
}
|
|
simdjson_inline simd8<uint8_t>
|
|
saturating_sub(const simd8<uint8_t> other) const {
|
|
return (__m128i)vec_subs(this->value, (__m128i)other);
|
|
}
|
|
|
|
// Order-specific operations
|
|
simdjson_inline simd8<uint8_t>
|
|
max_val(const simd8<uint8_t> other) const {
|
|
return (__m128i)vec_max(this->value, (__m128i)other);
|
|
}
|
|
simdjson_inline simd8<uint8_t>
|
|
min_val(const simd8<uint8_t> other) const {
|
|
return (__m128i)vec_min(this->value, (__m128i)other);
|
|
}
|
|
// Same as >, but only guarantees true is nonzero (< guarantees true = -1)
|
|
simdjson_inline simd8<uint8_t>
|
|
gt_bits(const simd8<uint8_t> other) const {
|
|
return this->saturating_sub(other);
|
|
}
|
|
// Same as <, but only guarantees true is nonzero (< guarantees true = -1)
|
|
simdjson_inline simd8<uint8_t>
|
|
lt_bits(const simd8<uint8_t> other) const {
|
|
return other.saturating_sub(*this);
|
|
}
|
|
simdjson_inline simd8<bool>
|
|
operator<=(const simd8<uint8_t> other) const {
|
|
return other.max_val(*this) == other;
|
|
}
|
|
simdjson_inline simd8<bool>
|
|
operator>=(const simd8<uint8_t> other) const {
|
|
return other.min_val(*this) == other;
|
|
}
|
|
simdjson_inline simd8<bool>
|
|
operator>(const simd8<uint8_t> other) const {
|
|
return this->gt_bits(other).any_bits_set();
|
|
}
|
|
simdjson_inline simd8<bool>
|
|
operator<(const simd8<uint8_t> other) const {
|
|
return this->gt_bits(other).any_bits_set();
|
|
}
|
|
|
|
// Bit-specific operations
|
|
simdjson_inline simd8<bool> bits_not_set() const {
|
|
return (__m128i)vec_cmpeq(this->value, (__m128i)vec_splats(uint8_t(0)));
|
|
}
|
|
simdjson_inline simd8<bool> bits_not_set(simd8<uint8_t> bits) const {
|
|
return (*this & bits).bits_not_set();
|
|
}
|
|
simdjson_inline simd8<bool> any_bits_set() const {
|
|
return ~this->bits_not_set();
|
|
}
|
|
simdjson_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const {
|
|
return ~this->bits_not_set(bits);
|
|
}
|
|
simdjson_inline bool bits_not_set_anywhere() const {
|
|
return vec_all_eq(this->value, (__m128i)vec_splats(0));
|
|
}
|
|
simdjson_inline bool any_bits_set_anywhere() const {
|
|
return !bits_not_set_anywhere();
|
|
}
|
|
simdjson_inline bool bits_not_set_anywhere(simd8<uint8_t> bits) const {
|
|
return vec_all_eq(vec_and(this->value, (__m128i)bits),
|
|
(__m128i)vec_splats(0));
|
|
}
|
|
simdjson_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const {
|
|
return !bits_not_set_anywhere(bits);
|
|
}
|
|
template <int N> simdjson_inline simd8<uint8_t> shr() const {
|
|
return simd8<uint8_t>(
|
|
(__m128i)vec_sr(this->value, (__m128i)vec_splat_u8(N)));
|
|
}
|
|
template <int N> simdjson_inline simd8<uint8_t> shl() const {
|
|
return simd8<uint8_t>(
|
|
(__m128i)vec_sl(this->value, (__m128i)vec_splat_u8(N)));
|
|
}
|
|
};
|
|
|
|
template <typename T> struct simd8x64 {
|
|
static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
|
|
static_assert(NUM_CHUNKS == 4,
|
|
"PPC64 kernel should use four registers per 64-byte block.");
|
|
const simd8<T> chunks[NUM_CHUNKS];
|
|
|
|
simd8x64(const simd8x64<T> &o) = delete; // no copy allowed
|
|
simd8x64<T> &
|
|
operator=(const simd8<T>& other) = delete; // no assignment allowed
|
|
simd8x64() = delete; // no default constructor allowed
|
|
|
|
simdjson_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1,
|
|
const simd8<T> chunk2, const simd8<T> chunk3)
|
|
: chunks{chunk0, chunk1, chunk2, chunk3} {}
|
|
simdjson_inline simd8x64(const T ptr[64])
|
|
: chunks{simd8<T>::load(ptr), simd8<T>::load(ptr + 16),
|
|
simd8<T>::load(ptr + 32), simd8<T>::load(ptr + 48)} {}
|
|
|
|
simdjson_inline void store(T ptr[64]) const {
|
|
this->chunks[0].store(ptr + sizeof(simd8<T>) * 0);
|
|
this->chunks[1].store(ptr + sizeof(simd8<T>) * 1);
|
|
this->chunks[2].store(ptr + sizeof(simd8<T>) * 2);
|
|
this->chunks[3].store(ptr + sizeof(simd8<T>) * 3);
|
|
}
|
|
|
|
simdjson_inline simd8<T> reduce_or() const {
|
|
return (this->chunks[0] | this->chunks[1]) |
|
|
(this->chunks[2] | this->chunks[3]);
|
|
}
|
|
|
|
simdjson_inline uint64_t compress(uint64_t mask, T *output) const {
|
|
this->chunks[0].compress(uint16_t(mask), output);
|
|
this->chunks[1].compress(uint16_t(mask >> 16),
|
|
output + 16 - count_ones(mask & 0xFFFF));
|
|
this->chunks[2].compress(uint16_t(mask >> 32),
|
|
output + 32 - count_ones(mask & 0xFFFFFFFF));
|
|
this->chunks[3].compress(uint16_t(mask >> 48),
|
|
output + 48 - count_ones(mask & 0xFFFFFFFFFFFF));
|
|
return 64 - count_ones(mask);
|
|
}
|
|
|
|
simdjson_inline uint64_t to_bitmask() const {
|
|
uint64_t r0 = uint32_t(this->chunks[0].to_bitmask());
|
|
uint64_t r1 = this->chunks[1].to_bitmask();
|
|
uint64_t r2 = this->chunks[2].to_bitmask();
|
|
uint64_t r3 = this->chunks[3].to_bitmask();
|
|
return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48);
|
|
}
|
|
|
|
simdjson_inline uint64_t eq(const T m) const {
|
|
const simd8<T> mask = simd8<T>::splat(m);
|
|
return simd8x64<bool>(this->chunks[0] == mask, this->chunks[1] == mask,
|
|
this->chunks[2] == mask, this->chunks[3] == mask)
|
|
.to_bitmask();
|
|
}
|
|
|
|
simdjson_inline uint64_t eq(const simd8x64<uint8_t> &other) const {
|
|
return simd8x64<bool>(this->chunks[0] == other.chunks[0],
|
|
this->chunks[1] == other.chunks[1],
|
|
this->chunks[2] == other.chunks[2],
|
|
this->chunks[3] == other.chunks[3])
|
|
.to_bitmask();
|
|
}
|
|
|
|
simdjson_inline uint64_t lteq(const T m) const {
|
|
const simd8<T> mask = simd8<T>::splat(m);
|
|
return simd8x64<bool>(this->chunks[0] <= mask, this->chunks[1] <= mask,
|
|
this->chunks[2] <= mask, this->chunks[3] <= mask)
|
|
.to_bitmask();
|
|
}
|
|
}; // struct simd8x64<T>
|
|
|
|
} // namespace simd
|
|
} // unnamed namespace
|
|
} // namespace ppc64
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_PPC64_SIMD_INPUT_H
|
|
/* end file simdjson/ppc64/simd.h */
|
|
/* including simdjson/ppc64/stringparsing_defs.h: #include "simdjson/ppc64/stringparsing_defs.h" */
|
|
/* begin file simdjson/ppc64/stringparsing_defs.h */
|
|
#ifndef SIMDJSON_PPC64_STRINGPARSING_DEFS_H
|
|
#define SIMDJSON_PPC64_STRINGPARSING_DEFS_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/ppc64/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/ppc64/bitmanipulation.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/ppc64/simd.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace ppc64 {
|
|
namespace {
|
|
|
|
using namespace simd;
|
|
|
|
// Holds backslashes and quotes locations.
|
|
struct backslash_and_quote {
|
|
public:
|
|
static constexpr uint32_t BYTES_PROCESSED = 32;
|
|
simdjson_inline static backslash_and_quote
|
|
copy_and_find(const uint8_t *src, uint8_t *dst);
|
|
|
|
simdjson_inline bool has_quote_first() {
|
|
return ((bs_bits - 1) & quote_bits) != 0;
|
|
}
|
|
simdjson_inline bool has_backslash() { return bs_bits != 0; }
|
|
simdjson_inline int quote_index() {
|
|
return trailing_zeroes(quote_bits);
|
|
}
|
|
simdjson_inline int backslash_index() {
|
|
return trailing_zeroes(bs_bits);
|
|
}
|
|
|
|
uint32_t bs_bits;
|
|
uint32_t quote_bits;
|
|
}; // struct backslash_and_quote
|
|
|
|
simdjson_inline backslash_and_quote
backslash_and_quote::copy_and_find(const uint8_t *src, uint8_t *dst) {
  // Up to 31 bytes past the end of the input may be read; that is safe because
  // callers must provide SIMDJSON_PADDING bytes of padding.
  static_assert(SIMDJSON_PADDING >= (BYTES_PROCESSED - 1),
                "backslash and quote finder must process fewer than "
                "SIMDJSON_PADDING bytes");
  // Load the window as two registers and copy both halves to dst unchanged.
  const simd8<uint8_t> lo(src);
  const simd8<uint8_t> hi(src + sizeof(lo));
  lo.store(dst);
  hi.store(dst + sizeof(lo));

  // One 64-bit bitmask is much cheaper on PPC than several 16-bit ones, so the
  // four comparison results are packed into a single 64-byte mask: backslash
  // matches land in the low 32 bits and quote matches in the high 32 bits.
  const uint64_t combined =
      simd8x64<bool>(lo == '\\', hi == '\\', lo == '"', hi == '"').to_bitmask();
  const uint32_t backslashes = uint32_t(combined);
  const uint32_t quotes = uint32_t(combined >> 32);
  return {
      backslashes, // bs_bits
      quotes       // quote_bits
  };
}
|
|
|
|
} // unnamed namespace
|
|
} // namespace ppc64
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_PPC64_STRINGPARSING_DEFS_H
|
|
/* end file simdjson/ppc64/stringparsing_defs.h */
|
|
|
|
#define SIMDJSON_SKIP_BACKSLASH_SHORT_CIRCUIT 1
|
|
/* end file simdjson/ppc64/begin.h */
|
|
/* including simdjson/generic/amalgamated.h for ppc64: #include "simdjson/generic/amalgamated.h" */
|
|
/* begin file simdjson/generic/amalgamated.h for ppc64 */
|
|
#if defined(SIMDJSON_CONDITIONAL_INCLUDE) && !defined(SIMDJSON_GENERIC_DEPENDENCIES_H)
|
|
#error simdjson/generic/dependencies.h must be included before simdjson/generic/amalgamated.h!
|
|
#endif
|
|
|
|
/* including simdjson/generic/base.h for ppc64: #include "simdjson/generic/base.h" */
|
|
/* begin file simdjson/generic/base.h for ppc64 */
|
|
#ifndef SIMDJSON_GENERIC_BASE_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_GENERIC_BASE_H */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/base.h" */
|
|
/* amalgamation skipped (editor-only): // If we haven't got an implementation yet, we're in the editor, editing a generic file! Just */
|
|
/* amalgamation skipped (editor-only): // use the most advanced one we can so the most possible stuff can be tested. */
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_IMPLEMENTATION */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/implementation_detection.h" */
|
|
/* amalgamation skipped (editor-only): #if SIMDJSON_IMPLEMENTATION_ICELAKE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/icelake/begin.h" */
|
|
/* amalgamation skipped (editor-only): #elif SIMDJSON_IMPLEMENTATION_HASWELL */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/haswell/begin.h" */
|
|
/* amalgamation skipped (editor-only): #elif SIMDJSON_IMPLEMENTATION_WESTMERE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/westmere/begin.h" */
|
|
/* amalgamation skipped (editor-only): #elif SIMDJSON_IMPLEMENTATION_ARM64 */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/arm64/begin.h" */
|
|
/* amalgamation skipped (editor-only): #elif SIMDJSON_IMPLEMENTATION_PPC64 */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/ppc64/begin.h" */
|
|
/* amalgamation skipped (editor-only): #elif SIMDJSON_IMPLEMENTATION_LSX */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/lsx/begin.h" */
|
|
/* amalgamation skipped (editor-only): #elif SIMDJSON_IMPLEMENTATION_LASX */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/lasx/begin.h" */
|
|
/* amalgamation skipped (editor-only): #elif SIMDJSON_IMPLEMENTATION_FALLBACK */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/fallback/begin.h" */
|
|
/* amalgamation skipped (editor-only): #else */
|
|
/* amalgamation skipped (editor-only): #error "All possible implementations (including fallback) have been disabled! simdjson will not run." */
|
|
/* amalgamation skipped (editor-only): #endif */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_IMPLEMENTATION */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace ppc64 {
|
|
|
|
struct open_container;
|
|
class dom_parser_implementation;
|
|
|
|
/**
 * Classification of a JSON number. The enumerators are numbered from 1.
 */
enum class number_type {
  floating_point_number = 1, /// a binary64 number
  signed_integer = 2,        /// a signed integer that fits in a 64-bit word using two's complement
  unsigned_integer = 3,      /// a positive integer larger or equal to 1<<63
  big_integer = 4            /// a big integer that does not fit in a 64-bit word
};
|
|
|
|
} // namespace ppc64
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_GENERIC_BASE_H
|
|
/* end file simdjson/generic/base.h for ppc64 */
|
|
/* including simdjson/generic/jsoncharutils.h for ppc64: #include "simdjson/generic/jsoncharutils.h" */
|
|
/* begin file simdjson/generic/jsoncharutils.h for ppc64 */
|
|
#ifndef SIMDJSON_GENERIC_JSONCHARUTILS_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_GENERIC_JSONCHARUTILS_H */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/generic/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/jsoncharutils_tables.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/numberparsing_tables.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace ppc64 {
|
|
namespace {
|
|
namespace jsoncharutils {
|
|
|
|
// return non-zero if not a structural or whitespace char
|
|
// zero otherwise
|
|
simdjson_inline uint32_t is_not_structural_or_whitespace(uint8_t c) {
|
|
return internal::structural_or_whitespace_negated[c];
|
|
}
|
|
|
|
// Table lookup on internal::structural_or_whitespace.
// NOTE(review): presumably non-zero exactly when c is a structural or
// whitespace character (the inverse of the negated table) -- confirm.
simdjson_inline uint32_t is_structural_or_whitespace(uint8_t c) {
  const uint32_t flag = internal::structural_or_whitespace[c];
  return flag;
}
|
|
|
|
// returns a value with the high 16 bits set if not valid
|
|
// otherwise returns the conversion of the 4 hex digits at src into the bottom
|
|
// 16 bits of the 32-bit return register
|
|
//
|
|
// see
|
|
// https://lemire.me/blog/2019/04/17/parsing-short-hexadecimal-strings-efficiently/
|
|
static inline uint32_t hex_to_u32_nocheck(
|
|
const uint8_t *src) { // strictly speaking, static inline is a C-ism
|
|
uint32_t v1 = internal::digit_to_val32[630 + src[0]];
|
|
uint32_t v2 = internal::digit_to_val32[420 + src[1]];
|
|
uint32_t v3 = internal::digit_to_val32[210 + src[2]];
|
|
uint32_t v4 = internal::digit_to_val32[0 + src[3]];
|
|
return v1 | v2 | v3 | v4;
|
|
}
|
|
|
|
// given a code point cp, writes to c
|
|
// the utf-8 code, outputting the length in
|
|
// bytes, if the length is zero, the code point
|
|
// is invalid
|
|
//
|
|
// This can possibly be made faster using pdep
|
|
// and clz and table lookups, but JSON documents
|
|
// have few escaped code points, and the following
|
|
// function looks cheap.
|
|
//
|
|
// Note: we assume that surrogates are treated separately
|
|
//
|
|
simdjson_inline size_t codepoint_to_utf8(uint32_t cp, uint8_t *c) {
|
|
if (cp <= 0x7F) {
|
|
c[0] = uint8_t(cp);
|
|
return 1; // ascii
|
|
}
|
|
if (cp <= 0x7FF) {
|
|
c[0] = uint8_t((cp >> 6) + 192);
|
|
c[1] = uint8_t((cp & 63) + 128);
|
|
return 2; // universal plane
|
|
// Surrogates are treated elsewhere...
|
|
//} //else if (0xd800 <= cp && cp <= 0xdfff) {
|
|
// return 0; // surrogates // could put assert here
|
|
} else if (cp <= 0xFFFF) {
|
|
c[0] = uint8_t((cp >> 12) + 224);
|
|
c[1] = uint8_t(((cp >> 6) & 63) + 128);
|
|
c[2] = uint8_t((cp & 63) + 128);
|
|
return 3;
|
|
} else if (cp <= 0x10FFFF) { // if you know you have a valid code point, this
|
|
// is not needed
|
|
c[0] = uint8_t((cp >> 18) + 240);
|
|
c[1] = uint8_t(((cp >> 12) & 63) + 128);
|
|
c[2] = uint8_t(((cp >> 6) & 63) + 128);
|
|
c[3] = uint8_t((cp & 63) + 128);
|
|
return 4;
|
|
}
|
|
// will return 0 when the code point was too large.
|
|
return 0; // bad r
|
|
}
|
|
|
|
#if SIMDJSON_IS_32BITS // _umul128 for x86, arm
|
|
// this is a slow emulation routine for 32-bit
|
|
//
|
|
static simdjson_inline uint64_t __emulu(uint32_t x, uint32_t y) {
|
|
return x * (uint64_t)y;
|
|
}
|
|
// 64x64 -> 128-bit unsigned multiply built from four 32x32 products.
// Writing ab = a:b and cd = c:d as high:low 32-bit halves, the function
// returns the low 64 bits of ab * cd and stores the high 64 bits in *hi.
static simdjson_inline uint64_t _umul128(uint64_t ab, uint64_t cd, uint64_t *hi) {
  const uint32_t a = (uint32_t)(ab >> 32);
  const uint32_t b = (uint32_t)ab;
  const uint32_t c = (uint32_t)(cd >> 32);
  const uint32_t d = (uint32_t)cd;

  const uint64_t ad = __emulu(a, d);
  const uint64_t bd = __emulu(b, d);
  // Sum of the two cross terms; it may wrap, which the carry flag records.
  const uint64_t adbc = ad + __emulu(b, c);
  const uint64_t adbc_carry = (adbc < ad) ? 1 : 0;
  const uint64_t lo = bd + (adbc << 32);
  // (lo < bd) detects the carry out of the low 64 bits.
  *hi = __emulu(a, c) + (adbc >> 32) + (adbc_carry << 32) +
        ((lo < bd) ? 1 : 0);
  return lo;
}
|
|
#endif
|
|
|
|
} // namespace jsoncharutils
|
|
} // unnamed namespace
|
|
} // namespace ppc64
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_GENERIC_JSONCHARUTILS_H
|
|
/* end file simdjson/generic/jsoncharutils.h for ppc64 */
|
|
/* including simdjson/generic/atomparsing.h for ppc64: #include "simdjson/generic/atomparsing.h" */
|
|
/* begin file simdjson/generic/atomparsing.h for ppc64 */
|
|
#ifndef SIMDJSON_GENERIC_ATOMPARSING_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_GENERIC_ATOMPARSING_H */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/generic/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/generic/jsoncharutils.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
#include <cstring>
|
|
|
|
namespace simdjson {
|
|
namespace ppc64 {
|
|
namespace {
|
|
/// @private
|
|
namespace atomparsing {
|
|
|
|
// The string_to_uint32 is exclusively used to map literal strings to 32-bit values.
|
|
// We use memcpy instead of a pointer cast to avoid undefined behaviors since we cannot
|
|
// be certain that the character pointer will be properly aligned.
|
|
// You might think that using memcpy makes this function expensive, but you'd be wrong.
|
|
// All decent optimizing compilers (GCC, clang, Visual Studio) will compile string_to_uint32("false");
|
|
// to the compile-time constant 1936482662.
|
|
simdjson_inline uint32_t string_to_uint32(const char* str) { uint32_t val; std::memcpy(&val, str, sizeof(uint32_t)); return val; }
|
|
|
|
|
|
// Again in str4ncmp we use a memcpy to avoid undefined behavior. The memcpy may appear expensive.
|
|
// Yet all decent optimizing compilers will compile memcpy to a single instruction, just about.
|
|
simdjson_warn_unused
|
|
simdjson_inline uint32_t str4ncmp(const uint8_t *src, const char* atom) {
|
|
uint32_t srcval; // we want to avoid unaligned 32-bit loads (undefined in C/C++)
|
|
static_assert(sizeof(uint32_t) <= SIMDJSON_PADDING, "SIMDJSON_PADDING must be larger than 4 bytes");
|
|
std::memcpy(&srcval, src, sizeof(uint32_t));
|
|
return srcval ^ string_to_uint32(atom);
|
|
}
|
|
|
|
// True when src starts with "true" and the byte after it (src[4]) is a
// structural or whitespace character. Reads five bytes from src.
simdjson_warn_unused
simdjson_inline bool is_valid_true_atom(const uint8_t *src) {
  const uint32_t mismatch = str4ncmp(src, "true");
  const uint32_t bad_terminator = jsoncharutils::is_not_structural_or_whitespace(src[4]);
  return (mismatch | bad_terminator) == 0;
}
|
|
|
|
// Length-aware variant: with exactly 4 bytes only the literal is compared,
// with more the terminator is checked too, and with fewer the atom cannot fit.
simdjson_warn_unused
simdjson_inline bool is_valid_true_atom(const uint8_t *src, size_t len) {
  if (len < 4) { return false; }
  if (len == 4) { return str4ncmp(src, "true") == 0; }
  return is_valid_true_atom(src);
}
|
|
|
|
// True when src[1..4] is "alse" and src[5] is a structural or whitespace
// character. src[0] itself is not inspected here.
simdjson_warn_unused
simdjson_inline bool is_valid_false_atom(const uint8_t *src) {
  const uint32_t mismatch = str4ncmp(src+1, "alse");
  const uint32_t bad_terminator = jsoncharutils::is_not_structural_or_whitespace(src[5]);
  return (mismatch | bad_terminator) == 0;
}
|
|
|
|
// Length-aware variant: with exactly 5 bytes only "alse" at src+1 is compared,
// with more the terminator is checked too, and with fewer the atom cannot fit.
simdjson_warn_unused
simdjson_inline bool is_valid_false_atom(const uint8_t *src, size_t len) {
  if (len < 5) { return false; }
  if (len == 5) { return str4ncmp(src+1, "alse") == 0; }
  return is_valid_false_atom(src);
}
|
|
|
|
// True when src starts with "null" and the byte after it (src[4]) is a
// structural or whitespace character. Reads five bytes from src.
simdjson_warn_unused
simdjson_inline bool is_valid_null_atom(const uint8_t *src) {
  const uint32_t mismatch = str4ncmp(src, "null");
  const uint32_t bad_terminator = jsoncharutils::is_not_structural_or_whitespace(src[4]);
  return (mismatch | bad_terminator) == 0;
}
|
|
|
|
// Length-aware variant: with exactly 4 bytes only the literal is compared,
// with more the terminator is checked too, and with fewer the atom cannot fit.
simdjson_warn_unused
simdjson_inline bool is_valid_null_atom(const uint8_t *src, size_t len) {
  if (len < 4) { return false; }
  if (len == 4) { return str4ncmp(src, "null") == 0; }
  return is_valid_null_atom(src);
}
|
|
|
|
} // namespace atomparsing
|
|
} // unnamed namespace
|
|
} // namespace ppc64
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_GENERIC_ATOMPARSING_H
|
|
/* end file simdjson/generic/atomparsing.h for ppc64 */
|
|
/* including simdjson/generic/dom_parser_implementation.h for ppc64: #include "simdjson/generic/dom_parser_implementation.h" */
|
|
/* begin file simdjson/generic/dom_parser_implementation.h for ppc64 */
|
|
#ifndef SIMDJSON_GENERIC_DOM_PARSER_IMPLEMENTATION_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_GENERIC_DOM_PARSER_IMPLEMENTATION_H */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/generic/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/dom_parser_implementation.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace ppc64 {
|
|
|
|
// Bookkeeping record for one currently open scope ([ or {).
// It is expected to occupy exactly 64 bits; the static_assert below checks it.
struct open_container {
  uint32_t tape_index; // tape position at which the scope begins
  uint32_t count;      // number of elements seen in the scope
}; // struct open_container

static_assert(sizeof(open_container) == sizeof(uint64_t), "Open container must be 64 bits");
|
|
|
|
class dom_parser_implementation final : public internal::dom_parser_implementation {
|
|
public:
|
|
/** Tape location of each open { or [ */
|
|
std::unique_ptr<open_container[]> open_containers{};
|
|
/** Whether each open container is a [ or { */
|
|
std::unique_ptr<bool[]> is_array{};
|
|
/** Buffer passed to stage 1 */
|
|
const uint8_t *buf{};
|
|
/** Length passed to stage 1 */
|
|
size_t len{0};
|
|
/** Document passed to stage 2 */
|
|
dom::document *doc{};
|
|
|
|
inline dom_parser_implementation() noexcept;
|
|
inline dom_parser_implementation(dom_parser_implementation &&other) noexcept;
|
|
inline dom_parser_implementation &operator=(dom_parser_implementation &&other) noexcept;
|
|
dom_parser_implementation(const dom_parser_implementation &) = delete;
|
|
dom_parser_implementation &operator=(const dom_parser_implementation &) = delete;
|
|
|
|
simdjson_warn_unused error_code parse(const uint8_t *buf, size_t len, dom::document &doc) noexcept final;
|
|
simdjson_warn_unused error_code stage1(const uint8_t *buf, size_t len, stage1_mode partial) noexcept final;
|
|
simdjson_warn_unused error_code stage2(dom::document &doc) noexcept final;
|
|
simdjson_warn_unused error_code stage2_next(dom::document &doc) noexcept final;
|
|
simdjson_warn_unused uint8_t *parse_string(const uint8_t *src, uint8_t *dst, bool allow_replacement) const noexcept final;
|
|
simdjson_warn_unused uint8_t *parse_wobbly_string(const uint8_t *src, uint8_t *dst) const noexcept final;
|
|
inline simdjson_warn_unused error_code set_capacity(size_t capacity) noexcept final;
|
|
inline simdjson_warn_unused error_code set_max_depth(size_t max_depth) noexcept final;
|
|
private:
|
|
simdjson_inline simdjson_warn_unused error_code set_capacity_stage1(size_t capacity);
|
|
|
|
};
|
|
|
|
} // namespace ppc64
|
|
} // namespace simdjson
|
|
|
|
namespace simdjson {
|
|
namespace ppc64 {
|
|
|
|
// Out-of-class defaulted special members: default construction, move
// construction and move assignment all use the compiler-generated behavior.
inline dom_parser_implementation::dom_parser_implementation() noexcept = default;
inline dom_parser_implementation::dom_parser_implementation(dom_parser_implementation &&other) noexcept = default;
inline dom_parser_implementation &dom_parser_implementation::operator=(dom_parser_implementation &&other) noexcept = default;
|
|
|
|
// Leaving these here so they can be inlined if so desired
|
|
inline simdjson_warn_unused error_code dom_parser_implementation::set_capacity(size_t capacity) noexcept {
|
|
if(capacity > SIMDJSON_MAXSIZE_BYTES) { return CAPACITY; }
|
|
// Stage 1 index output
|
|
size_t max_structures = SIMDJSON_ROUNDUP_N(capacity, 64) + 2 + 7;
|
|
structural_indexes.reset( new (std::nothrow) uint32_t[max_structures] );
|
|
if (!structural_indexes) { _capacity = 0; return MEMALLOC; }
|
|
structural_indexes[0] = 0;
|
|
n_structural_indexes = 0;
|
|
|
|
_capacity = capacity;
|
|
return SUCCESS;
|
|
}
|
|
|
|
// Allocate the stage 2 stacks (open_containers and is_array) with room for
// `max_depth` entries each.
//
// Returns MEMALLOC (and resets _max_depth to 0) if either allocation fails,
// SUCCESS otherwise. Both allocations are attempted before the check.
inline simdjson_warn_unused error_code dom_parser_implementation::set_max_depth(size_t max_depth) noexcept {
  // Stage 2 stacks
  open_containers.reset(new (std::nothrow) open_container[max_depth]);
  is_array.reset(new (std::nothrow) bool[max_depth]);
  if (!is_array || !open_containers) { _max_depth = 0; return MEMALLOC; }

  _max_depth = max_depth;
  return SUCCESS;
}
|
|
|
|
} // namespace ppc64
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_GENERIC_DOM_PARSER_IMPLEMENTATION_H
|
|
/* end file simdjson/generic/dom_parser_implementation.h for ppc64 */
|
|
/* including simdjson/generic/implementation_simdjson_result_base.h for ppc64: #include "simdjson/generic/implementation_simdjson_result_base.h" */
|
|
/* begin file simdjson/generic/implementation_simdjson_result_base.h for ppc64 */
|
|
#ifndef SIMDJSON_GENERIC_IMPLEMENTATION_SIMDJSON_RESULT_BASE_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_GENERIC_IMPLEMENTATION_SIMDJSON_RESULT_BASE_H */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/generic/base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace ppc64 {
|
|
|
|
// This is a near copy of include/error.h's implementation_simdjson_result_base, except it doesn't use std::pair
|
|
// so we can avoid inlining errors
|
|
// TODO reconcile these!
|
|
/**
|
|
* The result of a simdjson operation that could fail.
|
|
*
|
|
* Gives the option of reading error codes, or throwing an exception by casting to the desired result.
|
|
*
|
|
* This is a base class for implementations that want to add functions to the result type for
|
|
* chaining.
|
|
*
|
|
* Override like:
|
|
*
|
|
* struct simdjson_result<T> : public internal::implementation_simdjson_result_base<T> {
|
|
* simdjson_result() noexcept : internal::implementation_simdjson_result_base<T>() {}
|
|
* simdjson_result(error_code error) noexcept : internal::implementation_simdjson_result_base<T>(error) {}
|
|
* simdjson_result(T &&value) noexcept : internal::implementation_simdjson_result_base<T>(std::forward(value)) {}
|
|
* simdjson_result(T &&value, error_code error) noexcept : internal::implementation_simdjson_result_base<T>(value, error) {}
|
|
* // Your extra methods here
|
|
* }
|
|
*
|
|
* Then any method returning simdjson_result<T> will be chainable with your methods.
|
|
*/
|
|
template<typename T>
|
|
struct implementation_simdjson_result_base {
|
|
|
|
/**
|
|
* Create a new empty result with error = UNINITIALIZED.
|
|
*/
|
|
simdjson_inline implementation_simdjson_result_base() noexcept = default;
|
|
|
|
/**
|
|
* Create a new error result.
|
|
*/
|
|
simdjson_inline implementation_simdjson_result_base(error_code error) noexcept;
|
|
|
|
/**
|
|
* Create a new successful result.
|
|
*/
|
|
simdjson_inline implementation_simdjson_result_base(T &&value) noexcept;
|
|
|
|
/**
|
|
* Create a new result with both things (use if you don't want to branch when creating the result).
|
|
*/
|
|
simdjson_inline implementation_simdjson_result_base(T &&value, error_code error) noexcept;
|
|
|
|
/**
|
|
* Move the value and the error to the provided variables.
|
|
*
|
|
* @param value The variable to assign the value to. May not be set if there is an error.
|
|
* @param error The variable to assign the error to. Set to SUCCESS if there is no error.
|
|
*/
|
|
simdjson_inline void tie(T &value, error_code &error) && noexcept;
|
|
|
|
/**
|
|
* Move the value to the provided variable.
|
|
*
|
|
* @param value The variable to assign the value to. May not be set if there is an error.
|
|
*/
|
|
simdjson_inline error_code get(T &value) && noexcept;
|
|
|
|
/**
|
|
* The error.
|
|
*/
|
|
simdjson_inline error_code error() const noexcept;
|
|
|
|
#if SIMDJSON_EXCEPTIONS
|
|
|
|
/**
|
|
* Get the result value.
|
|
*
|
|
* @throw simdjson_error if there was an error.
|
|
*/
|
|
simdjson_inline T& value() & noexcept(false);
|
|
|
|
/**
|
|
* Take the result value (move it).
|
|
*
|
|
* @throw simdjson_error if there was an error.
|
|
*/
|
|
simdjson_inline T&& value() && noexcept(false);
|
|
|
|
/**
|
|
* Take the result value (move it).
|
|
*
|
|
* @throw simdjson_error if there was an error.
|
|
*/
|
|
simdjson_inline T&& take_value() && noexcept(false);
|
|
|
|
/**
|
|
* Cast to the value (will throw on error).
|
|
*
|
|
* @throw simdjson_error if there was an error.
|
|
*/
|
|
simdjson_inline operator T&&() && noexcept(false);
|
|
|
|
|
|
#endif // SIMDJSON_EXCEPTIONS
|
|
|
|
/**
|
|
* Get the result value. This function is safe if and only
|
|
* the error() method returns a value that evaluates to false.
|
|
*/
|
|
simdjson_inline const T& value_unsafe() const& noexcept;
|
|
/**
|
|
* Get the result value. This function is safe if and only
|
|
* the error() method returns a value that evaluates to false.
|
|
*/
|
|
simdjson_inline T& value_unsafe() & noexcept;
|
|
/**
|
|
* Take the result value (move it). This function is safe if and only
|
|
* the error() method returns a value that evaluates to false.
|
|
*/
|
|
simdjson_inline T&& value_unsafe() && noexcept;
|
|
protected:
|
|
/** users should never directly access first and second. **/
|
|
T first{}; /** Users should never directly access 'first'. **/
|
|
error_code second{UNINITIALIZED}; /** Users should never directly access 'second'. **/
|
|
}; // struct implementation_simdjson_result_base
|
|
|
|
} // namespace ppc64
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_GENERIC_IMPLEMENTATION_SIMDJSON_RESULT_BASE_H
|
|
/* end file simdjson/generic/implementation_simdjson_result_base.h for ppc64 */
|
|
/* including simdjson/generic/numberparsing.h for ppc64: #include "simdjson/generic/numberparsing.h" */
|
|
/* begin file simdjson/generic/numberparsing.h for ppc64 */
|
|
#ifndef SIMDJSON_GENERIC_NUMBERPARSING_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_GENERIC_NUMBERPARSING_H */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/generic/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/generic/jsoncharutils.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/numberparsing_tables.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
#include <limits>
|
|
#include <ostream>
|
|
#include <cstring>
|
|
|
|
namespace simdjson {
|
|
namespace ppc64 {
|
|
namespace numberparsing {
|
|
|
|
// Number-parsing hooks.
//
// When JSON_TEST_NUMBERS is defined, each macro first reports the parsed value
// (or the failure) to a found_* callback and then performs the normal action.
// Otherwise the macros expand to the plain action only and SRC is unused.
//   INVALID_NUMBER(SRC)               -> NUMBER_ERROR
//   BIGINT_NUMBER(SRC)                -> BIGINT_ERROR
//   WRITE_INTEGER/UNSIGNED/DOUBLE(..) -> WRITER.append_s64/append_u64/append_double(VALUE)
#ifdef JSON_TEST_NUMBERS
#define INVALID_NUMBER(SRC) (found_invalid_number((SRC)), NUMBER_ERROR)
#define WRITE_INTEGER(VALUE, SRC, WRITER) (found_integer((VALUE), (SRC)), (WRITER).append_s64((VALUE)))
#define WRITE_UNSIGNED(VALUE, SRC, WRITER) (found_unsigned_integer((VALUE), (SRC)), (WRITER).append_u64((VALUE)))
#define WRITE_DOUBLE(VALUE, SRC, WRITER) (found_float((VALUE), (SRC)), (WRITER).append_double((VALUE)))
#define BIGINT_NUMBER(SRC) (found_invalid_number((SRC)), BIGINT_ERROR)
#else
#define INVALID_NUMBER(SRC) (NUMBER_ERROR)
#define WRITE_INTEGER(VALUE, SRC, WRITER) (WRITER).append_s64((VALUE))
#define WRITE_UNSIGNED(VALUE, SRC, WRITER) (WRITER).append_u64((VALUE))
#define WRITE_DOUBLE(VALUE, SRC, WRITER) (WRITER).append_double((VALUE))
#define BIGINT_NUMBER(SRC) (BIGINT_ERROR)
#endif
|
|
|
|
namespace {
|
|
|
|
// Convert a mantissa, an exponent and a sign bit into an ieee64 double.
|
|
// The real_exponent needs to be in [0, 2046] (technically real_exponent = 2047 would be acceptable).
|
|
// The mantissa should be in [0,1<<53). The bit at index (1ULL << 52) while be zeroed.
|
|
simdjson_inline double to_double(uint64_t mantissa, uint64_t real_exponent, bool negative) {
|
|
double d;
|
|
mantissa &= ~(1ULL << 52);
|
|
mantissa |= real_exponent << 52;
|
|
mantissa |= ((static_cast<uint64_t>(negative)) << 63);
|
|
std::memcpy(&d, &mantissa, sizeof(d));
|
|
return d;
|
|
}
|
|
|
|
// Attempts to compute i * 10^(power) exactly; and if "negative" is
|
|
// true, negate the result.
|
|
// This function will only work in some cases, when it does not work, success is
|
|
// set to false. This should work *most of the time* (like 99% of the time).
|
|
// We assume that power is in the [smallest_power,
|
|
// largest_power] interval: the caller is responsible for this check.
|
|
simdjson_inline bool compute_float_64(int64_t power, uint64_t i, bool negative, double &d) {
|
|
// we start with a fast path
|
|
// It was described in
|
|
// Clinger WD. How to read floating point numbers accurately.
|
|
// ACM SIGPLAN Notices. 1990
|
|
#ifndef FLT_EVAL_METHOD
|
|
#error "FLT_EVAL_METHOD should be defined, please include cfloat."
|
|
#endif
|
|
#if (FLT_EVAL_METHOD != 1) && (FLT_EVAL_METHOD != 0)
|
|
// We cannot be certain that x/y is rounded to nearest.
|
|
if (0 <= power && power <= 22 && i <= 9007199254740991)
|
|
#else
|
|
if (-22 <= power && power <= 22 && i <= 9007199254740991)
|
|
#endif
|
|
{
|
|
// convert the integer into a double. This is lossless since
|
|
// 0 <= i <= 2^53 - 1.
|
|
d = double(i);
|
|
//
|
|
// The general idea is as follows.
|
|
// If 0 <= s < 2^53 and if 10^0 <= p <= 10^22 then
|
|
// 1) Both s and p can be represented exactly as 64-bit floating-point
|
|
// values
|
|
// (binary64).
|
|
// 2) Because s and p can be represented exactly as floating-point values,
|
|
// then s * p
|
|
// and s / p will produce correctly rounded values.
|
|
//
|
|
if (power < 0) {
|
|
d = d / simdjson::internal::power_of_ten[-power];
|
|
} else {
|
|
d = d * simdjson::internal::power_of_ten[power];
|
|
}
|
|
if (negative) {
|
|
d = -d;
|
|
}
|
|
return true;
|
|
}
|
|
// When 22 < power && power < 22 + 16, we could
|
|
// hope for another, secondary fast path. It was
|
|
// described by David M. Gay in "Correctly rounded
|
|
// binary-decimal and decimal-binary conversions." (1990)
|
|
// If you need to compute i * 10^(22 + x) for x < 16,
|
|
// first compute i * 10^x, if you know that result is exact
|
|
// (e.g., when i * 10^x < 2^53),
|
|
// then you can still proceed and do (i * 10^x) * 10^22.
|
|
// Is this worth your time?
|
|
// You need 22 < power *and* power < 22 + 16 *and* (i * 10^(x-22) < 2^53)
|
|
// for this second fast path to work.
|
|
// If you you have 22 < power *and* power < 22 + 16, and then you
|
|
// optimistically compute "i * 10^(x-22)", there is still a chance that you
|
|
// have wasted your time if i * 10^(x-22) >= 2^53. It makes the use cases of
|
|
// this optimization maybe less common than we would like. Source:
|
|
// http://www.exploringbinary.com/fast-path-decimal-to-floating-point-conversion/
|
|
// also used in RapidJSON: https://rapidjson.org/strtod_8h_source.html
|
|
|
|
// The fast path has now failed, so we are failing back on the slower path.
|
|
|
|
// In the slow path, we need to adjust i so that it is > 1<<63 which is always
|
|
// possible, except if i == 0, so we handle i == 0 separately.
|
|
if(i == 0) {
|
|
d = negative ? -0.0 : 0.0;
|
|
return true;
|
|
}
|
|
|
|
|
|
// The exponent is 1024 + 63 + power
|
|
// + floor(log(5**power)/log(2)).
|
|
// The 1024 comes from the ieee64 standard.
|
|
// The 63 comes from the fact that we use a 64-bit word.
|
|
//
|
|
// Computing floor(log(5**power)/log(2)) could be
|
|
// slow. Instead we use a fast function.
|
|
//
|
|
// For power in (-400,350), we have that
|
|
// (((152170 + 65536) * power ) >> 16);
|
|
// is equal to
|
|
// floor(log(5**power)/log(2)) + power when power >= 0
|
|
// and it is equal to
|
|
// ceil(log(5**-power)/log(2)) + power when power < 0
|
|
//
|
|
// The 65536 is (1<<16) and corresponds to
|
|
// (65536 * power) >> 16 ---> power
|
|
//
|
|
// ((152170 * power ) >> 16) is equal to
|
|
// floor(log(5**power)/log(2))
|
|
//
|
|
// Note that this is not magic: 152170/(1<<16) is
|
|
// approximatively equal to log(5)/log(2).
|
|
// The 1<<16 value is a power of two; we could use a
|
|
// larger power of 2 if we wanted to.
|
|
//
|
|
int64_t exponent = (((152170 + 65536) * power) >> 16) + 1024 + 63;
|
|
|
|
|
|
// We want the most significant bit of i to be 1. Shift if needed.
|
|
int lz = leading_zeroes(i);
|
|
i <<= lz;
|
|
|
|
|
|
// We are going to need to do some 64-bit arithmetic to get a precise product.
|
|
// We use a table lookup approach.
|
|
// It is safe because
|
|
// power >= smallest_power
|
|
// and power <= largest_power
|
|
// We recover the mantissa of the power, it has a leading 1. It is always
|
|
// rounded down.
|
|
//
|
|
// We want the most significant 64 bits of the product. We know
|
|
// this will be non-zero because the most significant bit of i is
|
|
// 1.
|
|
const uint32_t index = 2 * uint32_t(power - simdjson::internal::smallest_power);
|
|
// Optimization: It may be that materializing the index as a variable might confuse some compilers and prevent effective complex-addressing loads. (Done for code clarity.)
|
|
//
|
|
// The full_multiplication function computes the 128-bit product of two 64-bit words
|
|
// with a returned value of type value128 with a "low component" corresponding to the
|
|
// 64-bit least significant bits of the product and with a "high component" corresponding
|
|
// to the 64-bit most significant bits of the product.
|
|
simdjson::internal::value128 firstproduct = full_multiplication(i, simdjson::internal::power_of_five_128[index]);
|
|
// Both i and power_of_five_128[index] have their most significant bit set to 1 which
|
|
// implies that the either the most or the second most significant bit of the product
|
|
// is 1. We pack values in this manner for efficiency reasons: it maximizes the use
|
|
// we make of the product. It also makes it easy to reason about the product: there
|
|
// is 0 or 1 leading zero in the product.
|
|
|
|
// Unless the least significant 9 bits of the high (64-bit) part of the full
|
|
// product are all 1s, then we know that the most significant 55 bits are
|
|
// exact and no further work is needed. Having 55 bits is necessary because
|
|
// we need 53 bits for the mantissa but we have to have one rounding bit and
|
|
// we can waste a bit if the most significant bit of the product is zero.
|
|
if((firstproduct.high & 0x1FF) == 0x1FF) {
|
|
// We want to compute i * 5^q, but only care about the top 55 bits at most.
|
|
// Consider the scenario where q>=0. Then 5^q may not fit in 64-bits. Doing
|
|
// the full computation is wasteful. So we do what is called a "truncated
|
|
// multiplication".
|
|
// We take the most significant 64-bits, and we put them in
|
|
// power_of_five_128[index]. Usually, that's good enough to approximate i * 5^q
|
|
// to the desired approximation using one multiplication. Sometimes it does not suffice.
|
|
// Then we store the next most significant 64 bits in power_of_five_128[index + 1], and
|
|
// then we get a better approximation to i * 5^q.
|
|
//
|
|
// That's for when q>=0. The logic for q<0 is somewhat similar but it is somewhat
|
|
// more complicated.
|
|
//
|
|
// There is an extra layer of complexity in that we need more than 55 bits of
|
|
// accuracy in the round-to-even scenario.
|
|
//
|
|
// The full_multiplication function computes the 128-bit product of two 64-bit words
|
|
// with a returned value of type value128 with a "low component" corresponding to the
|
|
// 64-bit least significant bits of the product and with a "high component" corresponding
|
|
// to the 64-bit most significant bits of the product.
|
|
simdjson::internal::value128 secondproduct = full_multiplication(i, simdjson::internal::power_of_five_128[index + 1]);
|
|
firstproduct.low += secondproduct.high;
|
|
if(secondproduct.high > firstproduct.low) { firstproduct.high++; }
|
|
// As it has been proven by Noble Mushtak and Daniel Lemire in "Fast Number Parsing Without
|
|
// Fallback" (https://arxiv.org/abs/2212.06644), at this point we are sure that the product
|
|
// is sufficiently accurate, and more computation is not needed.
|
|
}
|
|
uint64_t lower = firstproduct.low;
|
|
uint64_t upper = firstproduct.high;
|
|
// The final mantissa should be 53 bits with a leading 1.
|
|
// We shift it so that it occupies 54 bits with a leading 1.
|
|
///////
|
|
uint64_t upperbit = upper >> 63;
|
|
uint64_t mantissa = upper >> (upperbit + 9);
|
|
lz += int(1 ^ upperbit);
|
|
|
|
// Here we have mantissa < (1<<54).
|
|
int64_t real_exponent = exponent - lz;
|
|
if (simdjson_unlikely(real_exponent <= 0)) { // we have a subnormal?
|
|
// Here have that real_exponent <= 0 so -real_exponent >= 0
|
|
if(-real_exponent + 1 >= 64) { // if we have more than 64 bits below the minimum exponent, you have a zero for sure.
|
|
d = negative ? -0.0 : 0.0;
|
|
return true;
|
|
}
|
|
// next line is safe because -real_exponent + 1 < 0
|
|
mantissa >>= -real_exponent + 1;
|
|
// Thankfully, we can't have both "round-to-even" and subnormals because
|
|
// "round-to-even" only occurs for powers close to 0.
|
|
mantissa += (mantissa & 1); // round up
|
|
mantissa >>= 1;
|
|
// There is a weird scenario where we don't have a subnormal but just.
|
|
// Suppose we start with 2.2250738585072013e-308, we end up
|
|
// with 0x3fffffffffffff x 2^-1023-53 which is technically subnormal
|
|
// whereas 0x40000000000000 x 2^-1023-53 is normal. Now, we need to round
|
|
// up 0x3fffffffffffff x 2^-1023-53 and once we do, we are no longer
|
|
// subnormal, but we can only know this after rounding.
|
|
// So we only declare a subnormal if we are smaller than the threshold.
|
|
real_exponent = (mantissa < (uint64_t(1) << 52)) ? 0 : 1;
|
|
d = to_double(mantissa, real_exponent, negative);
|
|
return true;
|
|
}
|
|
// We have to round to even. The "to even" part
|
|
// is only a problem when we are right in between two floats
|
|
// which we guard against.
|
|
// If we have lots of trailing zeros, we may fall right between two
|
|
// floating-point values.
|
|
//
|
|
// The round-to-even cases take the form of a number 2m+1 which is in (2^53,2^54]
|
|
// times a power of two. That is, it is right between a number with binary significand
|
|
// m and another number with binary significand m+1; and it must be the case
|
|
// that it cannot be represented by a float itself.
|
|
//
|
|
// We must have that w * 10 ^q == (2m+1) * 2^p for some power of two 2^p.
|
|
// Recall that 10^q = 5^q * 2^q.
|
|
// When q >= 0, we must have that (2m+1) is divible by 5^q, so 5^q <= 2^54. We have that
|
|
// 5^23 <= 2^54 and it is the last power of five to qualify, so q <= 23.
|
|
// When q<0, we have w >= (2m+1) x 5^{-q}. We must have that w<2^{64} so
|
|
// (2m+1) x 5^{-q} < 2^{64}. We have that 2m+1>2^{53}. Hence, we must have
|
|
// 2^{53} x 5^{-q} < 2^{64}.
|
|
// Hence we have 5^{-q} < 2^{11}$ or q>= -4.
|
|
//
|
|
// We require lower <= 1 and not lower == 0 because we could not prove that
|
|
// that lower == 0 is implied; but we could prove that lower <= 1 is a necessary and sufficient test.
|
|
if (simdjson_unlikely((lower <= 1) && (power >= -4) && (power <= 23) && ((mantissa & 3) == 1))) {
|
|
if((mantissa << (upperbit + 64 - 53 - 2)) == upper) {
|
|
mantissa &= ~1; // flip it so that we do not round up
|
|
}
|
|
}
|
|
|
|
mantissa += mantissa & 1;
|
|
mantissa >>= 1;
|
|
|
|
// Here we have mantissa < (1<<53), unless there was an overflow
|
|
if (mantissa >= (1ULL << 53)) {
|
|
//////////
|
|
// This will happen when parsing values such as 7.2057594037927933e+16
|
|
////////
|
|
mantissa = (1ULL << 52);
|
|
real_exponent++;
|
|
}
|
|
mantissa &= ~(1ULL << 52);
|
|
// we have to check that real_exponent is in range, otherwise we bail out
|
|
if (simdjson_unlikely(real_exponent > 2046)) {
|
|
// We have an infinite value!!! We could actually throw an error here if we could.
|
|
return false;
|
|
}
|
|
d = to_double(mantissa, real_exponent, negative);
|
|
return true;
|
|
}
|
|
|
|
// We call a fallback floating-point parser that might be slow. Note
|
|
// it will accept JSON numbers, but the JSON spec. is more restrictive so
|
|
// before you call parse_float_fallback, you need to have validated the input
|
|
// string with the JSON grammar.
|
|
// It will return an error (false) if the parsed number is infinite.
|
|
// The string parsing itself always succeeds. We know that there is at least
|
|
// one digit.
|
|
static bool parse_float_fallback(const uint8_t *ptr, double *outDouble) {
|
|
*outDouble = simdjson::internal::from_chars(reinterpret_cast<const char *>(ptr));
|
|
// We do not accept infinite values.
|
|
|
|
// Detecting finite values in a portable manner is ridiculously hard, ideally
|
|
// we would want to do:
|
|
// return !std::isfinite(*outDouble);
|
|
// but that mysteriously fails under legacy/old libc++ libraries, see
|
|
// https://github.com/simdjson/simdjson/issues/1286
|
|
//
|
|
// Therefore, fall back to this solution (the extra parens are there
|
|
// to handle that max may be a macro on windows).
|
|
return !(*outDouble > (std::numeric_limits<double>::max)() || *outDouble < std::numeric_limits<double>::lowest());
|
|
}
|
|
|
|
// Bounded variant of the fallback floating-point parser: the number text is
// the range [ptr, end_ptr), both pointers being handed to from_chars.
//
// @param ptr       start of the number text
// @param end_ptr   end of the number text
// @param outDouble receives the parsed value (written even when false is returned)
// @return true when the parsed value lies within the finite double range
static bool parse_float_fallback(const uint8_t *ptr, const uint8_t *end_ptr, double *outDouble) {
  *outDouble = simdjson::internal::from_chars(reinterpret_cast<const char *>(ptr), reinterpret_cast<const char *>(end_ptr));
  // We do not accept infinite values.

  // Detecting finite values in a portable manner is ridiculously hard, ideally
  // we would want to do:
  // return std::isfinite(*outDouble);
  // but that mysteriously fails under legacy/old libc++ libraries, see
  // https://github.com/simdjson/simdjson/issues/1286
  //
  // Therefore, fall back to this solution (the extra parens are there
  // to handle that max may be a macro on windows).
  return !(*outDouble > (std::numeric_limits<double>::max)() || *outDouble < std::numeric_limits<double>::lowest());
}
|
|
|
|
// check quickly whether the next 8 chars are made of digits
|
|
// at a glance, it looks better than Mula's
|
|
// http://0x80.pl/articles/swar-digits-validate.html
|
|
simdjson_inline bool is_made_of_eight_digits_fast(const uint8_t *chars) {
|
|
uint64_t val;
|
|
// this can read up to 7 bytes beyond the buffer size, but we require
|
|
// SIMDJSON_PADDING of padding
|
|
static_assert(7 <= SIMDJSON_PADDING, "SIMDJSON_PADDING must be bigger than 7");
|
|
std::memcpy(&val, chars, 8);
|
|
// a branchy method might be faster:
|
|
// return (( val & 0xF0F0F0F0F0F0F0F0 ) == 0x3030303030303030)
|
|
// && (( (val + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0 ) ==
|
|
// 0x3030303030303030);
|
|
return (((val & 0xF0F0F0F0F0F0F0F0) |
|
|
(((val + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0) >> 4)) ==
|
|
0x3333333333333333);
|
|
}
|
|
|
|
template<typename I>
|
|
SIMDJSON_NO_SANITIZE_UNDEFINED // We deliberately allow overflow here and check later
|
|
simdjson_inline bool parse_digit(const uint8_t c, I &i) {
|
|
const uint8_t digit = static_cast<uint8_t>(c - '0');
|
|
if (digit > 9) {
|
|
return false;
|
|
}
|
|
// PERF NOTE: multiplication by 10 is cheaper than arbitrary integer multiplication
|
|
i = 10 * i + digit; // might overflow, we will handle the overflow later
|
|
return true;
|
|
}
|
|
|
|
// Returns true when c is an ASCII decimal digit ('0'..'9').
// Characters below '0' wrap around to values above 9 in the unsigned subtraction.
simdjson_inline bool is_digit(const uint8_t c) {
  return static_cast<uint8_t>(c - '0') <= 9;
}
|
|
|
|
// Parse the digits that follow the decimal separator.
//
// @param src      start of the number (only used for error reporting hooks)
// @param p        in: first character after the '.'; out: first non-digit character
// @param i        running integer accumulator; fractional digits are appended to it
// @param exponent receives minus the number of fractional digits consumed
// @return SUCCESS, or the INVALID_NUMBER error when no digit follows the separator
simdjson_inline error_code parse_decimal_after_separator(simdjson_unused const uint8_t *const src, const uint8_t *&p, uint64_t &i, int64_t &exponent) {
  // we continue with the fiction that we have an integer. If the
  // floating point number is representable as x * 10^z for some integer
  // x that fits in 53 bits, then we will be able to convert back
  // the integer into a float in a lossless manner.
  const uint8_t *const first_after_period = p;

#ifdef SIMDJSON_SWAR_NUMBER_PARSING
#if SIMDJSON_SWAR_NUMBER_PARSING
  // this helps if we have lots of decimals!
  // this turns out to be frequent enough.
  if (is_made_of_eight_digits_fast(p)) {
    i = i * 100000000 + parse_eight_digits_unrolled(p);
    p += 8;
  }
#endif // SIMDJSON_SWAR_NUMBER_PARSING
#endif // #ifdef SIMDJSON_SWAR_NUMBER_PARSING
  // Unrolling the first digit makes a small difference on some implementations (e.g. westmere)
  if (parse_digit(*p, i)) { ++p; }
  while (parse_digit(*p, i)) { p++; }
  exponent = first_after_period - p;
  // Decimal without digits (123.) is illegal
  if (exponent == 0) {
    return INVALID_NUMBER(src);
  }
  return SUCCESS;
}
|
|
|
|
// Parse the exponent part of a number (what follows the 'e'/'E').
//
// @param src      start of the number (only used for error reporting hooks)
// @param p        in: first character after the exponent marker; out: first
//                 character after the exponent digits
// @param exponent the signed exponent value is added to it
// @return SUCCESS, or the INVALID_NUMBER error when there are no exponent digits
simdjson_inline error_code parse_exponent(simdjson_unused const uint8_t *const src, const uint8_t *&p, int64_t &exponent) {
  // Exp Sign: -123.456e[-]78
  bool neg_exp = ('-' == *p);
  if (neg_exp || '+' == *p) { p++; } // Skip + as well

  // Exponent: -123.456e-[78]
  auto start_exp = p;
  int64_t exp_number = 0;
  while (parse_digit(*p, exp_number)) { ++p; }
  // It is possible for parse_digit to overflow.
  // In particular, it could overflow to INT64_MIN, and we cannot do - INT64_MIN.
  // Thus we *must* check for possible overflow before we negate exp_number.

  // Performance notes: it may seem like combining the two "simdjson_unlikely checks" below into
  // a single simdjson_unlikely path would be faster. The reasoning is sound, but the compiler may
  // not oblige and may, in fact, generate two distinct paths in any case. It might be
  // possible to do uint64_t(p - start_exp - 1) >= 18 but it could end up trading off
  // instructions for a simdjson_likely branch, an inconclusive gain.

  // If there were no digits, it's an error.
  if (simdjson_unlikely(p == start_exp)) {
    return INVALID_NUMBER(src);
  }
  // We have a valid positive exponent in exp_number at this point, except that
  // it may have overflowed.

  // If there were more than 18 digits, we may have overflowed the integer. We have to do
  // something!!!!
  if (simdjson_unlikely(p > start_exp+18)) {
    // Skip leading zeroes: 1e000000000000000000001 is technically valid and does not overflow
    while (*start_exp == '0') { start_exp++; }
    // 19 digits could overflow int64_t and is kind of absurd anyway. We don't
    // support exponents smaller than -999,999,999,999,999,999 and bigger
    // than 999,999,999,999,999,999.
    // We can truncate.
    // Note that 999999999999999999 is assuredly too large. The maximal ieee64 value before
    // infinity is ~1.8e308. The smallest subnormal is ~5e-324. So, actually, we could
    // truncate at 324.
    // Note that there is no reason to fail per se at this point in time.
    // E.g., 0e999999999999999999999 is a fine number.
    if (p > start_exp+18) { exp_number = 999999999999999999; }
  }
  // At this point, we know that exp_number is a sane, positive, signed integer.
  // It is <= 999,999,999,999,999,999. As long as 'exponent' is in
  // [-8223372036854775808, 8223372036854775808], we won't overflow. Because 'exponent'
  // is bounded in magnitude by the size of the JSON input, we are fine in this universe.
  // To sum it up: the next line should never overflow.
  exponent += (neg_exp ? -exp_number : exp_number);
  return SUCCESS;
}
|
|
|
|
// Decide whether the number text starting at src (at most max_length bytes)
// is an integer, i.e. an optional '-' followed by digits and then either the
// end of the range or a structural/whitespace character.
//
// A leading '0' must be immediately followed by the end of the range or by a
// structural/whitespace character. A lone '-' at the end of the range yields
// false.
// NOTE(review): the digit loop may consume zero digits, in which case the
// result depends only on the terminating character; presumably callers only
// pass text already known to start a number - confirm against callers.
simdjson_inline bool check_if_integer(const uint8_t *const src, size_t max_length) {
  const uint8_t *const srcend = src + max_length;
  bool negative = (*src == '-'); // we can always read at least one character after the '-'
  const uint8_t *p = src + uint8_t(negative);
  if(p == srcend) { return false; }
  if(*p == '0') {
    ++p;
    if(p == srcend) { return true; }
    if(jsoncharutils::is_not_structural_or_whitespace(*p)) { return false; }
    return true;
  }
  while(p != srcend && is_digit(*p)) { ++p; }
  if(p == srcend) { return true; }
  if(jsoncharutils::is_not_structural_or_whitespace(*p)) { return false; }
  return true;
}
|
|
|
|
// Estimate how many significant digits a number has by discounting its
// leading zeroes.
//
// @param start_digits first character of the digits (after any sign)
// @param digit_count  number of characters spanned by the digits
// @return digit_count minus the number of leading '0' and '.' characters
simdjson_inline size_t significant_digits(const uint8_t * start_digits, size_t digit_count) {
  // It is possible that the integer had an overflow.
  // We have to handle the case where we have 0.0000somenumber.
  const uint8_t *start = start_digits;
  while ((*start == '0') || (*start == '.')) { ++start; }
  // we over-decrement by one when there is a '.'
  return digit_count - size_t(start - start_digits);
}
|
|
|
|
} // unnamed namespace
|
|
|
|
/** @private */
|
|
static error_code slow_float_parsing(simdjson_unused const uint8_t * src, double* answer) {
|
|
if (parse_float_fallback(src, answer)) {
|
|
return SUCCESS;
|
|
}
|
|
return INVALID_NUMBER(src);
|
|
}
|
|
|
|
/** @private */
|
|
template<typename W>
|
|
simdjson_inline error_code write_float(const uint8_t *const src, bool negative, uint64_t i, const uint8_t * start_digits, size_t digit_count, int64_t exponent, W &writer) {
|
|
// If we frequently had to deal with long strings of digits,
|
|
// we could extend our code by using a 128-bit integer instead
|
|
// of a 64-bit integer. However, this is uncommon in practice.
|
|
//
|
|
// 9999999999999999999 < 2**64 so we can accommodate 19 digits.
|
|
// If we have a decimal separator, then digit_count - 1 is the number of digits, but we
|
|
// may not have a decimal separator!
|
|
if (simdjson_unlikely(digit_count > 19 && significant_digits(start_digits, digit_count) > 19)) {
|
|
// Ok, chances are good that we had an overflow!
|
|
// this is almost never going to get called!!!
|
|
// we start anew, going slowly!!!
|
|
// This will happen in the following examples:
|
|
// 10000000000000000000000000000000000000000000e+308
|
|
// 3.1415926535897932384626433832795028841971693993751
|
|
//
|
|
// NOTE: We do not pass a reference to the to slow_float_parsing. If we passed our writer
|
|
// reference to it, it would force it to be stored in memory, preventing the compiler from
|
|
// picking it apart and putting into registers. i.e. if we pass it as reference,
|
|
// it gets slow.
|
|
double d;
|
|
error_code error = slow_float_parsing(src, &d);
|
|
writer.append_double(d);
|
|
return error;
|
|
}
|
|
// NOTE: it's weird that the simdjson_unlikely() only wraps half the if, but it seems to get slower any other
|
|
// way we've tried: https://github.com/simdjson/simdjson/pull/990#discussion_r448497331
|
|
// To future reader: we'd love if someone found a better way, or at least could explain this result!
|
|
if (simdjson_unlikely(exponent < simdjson::internal::smallest_power) || (exponent > simdjson::internal::largest_power)) {
|
|
//
|
|
// Important: smallest_power is such that it leads to a zero value.
|
|
// Observe that 18446744073709551615e-343 == 0, i.e. (2**64 - 1) e -343 is zero
|
|
// so something x 10^-343 goes to zero, but not so with something x 10^-342.
|
|
static_assert(simdjson::internal::smallest_power <= -342, "smallest_power is not small enough");
|
|
//
|
|
if((exponent < simdjson::internal::smallest_power) || (i == 0)) {
|
|
// E.g. Parse "-0.0e-999" into the same value as "-0.0". See https://en.wikipedia.org/wiki/Signed_zero
|
|
WRITE_DOUBLE(negative ? -0.0 : 0.0, src, writer);
|
|
return SUCCESS;
|
|
} else { // (exponent > largest_power) and (i != 0)
|
|
// We have, for sure, an infinite value and simdjson refuses to parse infinite values.
|
|
return INVALID_NUMBER(src);
|
|
}
|
|
}
|
|
double d;
|
|
if (!compute_float_64(exponent, i, negative, d)) {
|
|
// we are almost never going to get here.
|
|
if (!parse_float_fallback(src, &d)) { return INVALID_NUMBER(src); }
|
|
}
|
|
WRITE_DOUBLE(d, src, writer);
|
|
return SUCCESS;
|
|
}
|
|
|
|
// Parse the number at src.
// Define JSON_TEST_NUMBERS for unit testing.
//
// It is assumed that the number is followed by a structural character
// (one of , : [ ] { }) or a whitespace character. If that is not the case
// (e.g., when the JSON document is made of a single number), then it is
// necessary to copy the content and append a space before calling this function.
//
// Our objective is accurate parsing (ULP of 0) at high speed.
|
|
template<typename W>
|
|
simdjson_inline error_code parse_number(const uint8_t *const src, W &writer);
|
|
|
|
// for performance analysis, it is sometimes useful to skip parsing
|
|
#ifdef SIMDJSON_SKIPNUMBERPARSING
|
|
|
|
template<typename W>
|
|
simdjson_inline error_code parse_number(const uint8_t *const, W &writer) {
|
|
writer.append_s64(0); // always write zero
|
|
return SUCCESS; // always succeeds
|
|
}
|
|
|
|
// Placeholder implementations used when SIMDJSON_SKIPNUMBERPARSING is defined.
// None of them looks at its input; each one returns a fixed value.

// Always yields 0.
simdjson_unused simdjson_inline simdjson_result<uint64_t> parse_unsigned(const uint8_t * const src) noexcept {
  return 0;
}

// Always yields 0.
simdjson_unused simdjson_inline simdjson_result<int64_t> parse_integer(const uint8_t * const src) noexcept {
  return 0;
}

// Always yields 0.
simdjson_unused simdjson_inline simdjson_result<double> parse_double(const uint8_t * const src) noexcept {
  return 0;
}

// Always yields 0.
simdjson_unused simdjson_inline simdjson_result<uint64_t> parse_unsigned_in_string(const uint8_t * const src) noexcept {
  return 0;
}

// Always yields 0.
simdjson_unused simdjson_inline simdjson_result<int64_t> parse_integer_in_string(const uint8_t * const src) noexcept {
  return 0;
}

// Always yields 0.
simdjson_unused simdjson_inline simdjson_result<double> parse_double_in_string(const uint8_t * const src) noexcept {
  return 0;
}

// Always reports "not negative".
simdjson_unused simdjson_inline bool is_negative(const uint8_t * src) noexcept {
  return false;
}

// Always reports "not an integer".
simdjson_unused simdjson_inline simdjson_result<bool> is_integer(const uint8_t * src) noexcept {
  return false;
}

// Always reports a signed integer.
simdjson_unused simdjson_inline simdjson_result<number_type> get_number_type(const uint8_t * src) noexcept {
  return number_type::signed_integer;
}
|
|
#else
|
|
|
|
// parse the number at src
|
|
// define JSON_TEST_NUMBERS for unit testing
|
|
//
|
|
// It is assumed that the number is followed by a structural ({,},],[) character
|
|
// or a white space character. If that is not the case (e.g., when the JSON
|
|
// document is made of a single number), then it is necessary to copy the
|
|
// content and append a space before calling this function.
|
|
//
|
|
// Our objective is accurate parsing (ULP of 0) at high speed.
|
|
template<typename W>
|
|
simdjson_inline error_code parse_number(const uint8_t *const src, W &writer) {
|
|
|
|
//
|
|
// Check for minus sign
|
|
//
|
|
bool negative = (*src == '-');
|
|
const uint8_t *p = src + uint8_t(negative);
|
|
|
|
//
|
|
// Parse the integer part.
|
|
//
|
|
// PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
|
|
const uint8_t *const start_digits = p;
|
|
uint64_t i = 0;
|
|
while (parse_digit(*p, i)) { p++; }
|
|
|
|
// If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
|
|
// Optimization note: size_t is expected to be unsigned.
|
|
size_t digit_count = size_t(p - start_digits);
|
|
if (digit_count == 0 || ('0' == *start_digits && digit_count > 1)) { return INVALID_NUMBER(src); }
|
|
|
|
//
|
|
// Handle floats if there is a . or e (or both)
|
|
//
|
|
int64_t exponent = 0;
|
|
bool is_float = false;
|
|
if ('.' == *p) {
|
|
is_float = true;
|
|
++p;
|
|
SIMDJSON_TRY( parse_decimal_after_separator(src, p, i, exponent) );
|
|
digit_count = int(p - start_digits); // used later to guard against overflows
|
|
}
|
|
if (('e' == *p) || ('E' == *p)) {
|
|
is_float = true;
|
|
++p;
|
|
SIMDJSON_TRY( parse_exponent(src, p, exponent) );
|
|
}
|
|
if (is_float) {
|
|
const bool dirty_end = jsoncharutils::is_not_structural_or_whitespace(*p);
|
|
SIMDJSON_TRY( write_float(src, negative, i, start_digits, digit_count, exponent, writer) );
|
|
if (dirty_end) { return INVALID_NUMBER(src); }
|
|
return SUCCESS;
|
|
}
|
|
|
|
// The longest negative 64-bit number is 19 digits.
|
|
// The longest positive 64-bit number is 20 digits.
|
|
// We do it this way so we don't trigger this branch unless we must.
|
|
size_t longest_digit_count = negative ? 19 : 20;
|
|
if (digit_count > longest_digit_count) { return BIGINT_NUMBER(src); }
|
|
if (digit_count == longest_digit_count) {
|
|
if (negative) {
|
|
// Anything negative above INT64_MAX+1 is invalid
|
|
if (i > uint64_t(INT64_MAX)+1) { return BIGINT_NUMBER(src); }
|
|
WRITE_INTEGER(~i+1, src, writer);
|
|
if (jsoncharutils::is_not_structural_or_whitespace(*p)) { return INVALID_NUMBER(src); }
|
|
return SUCCESS;
|
|
// Positive overflow check:
|
|
// - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the
|
|
// biggest uint64_t.
|
|
// - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX.
|
|
// If we got here, it's a 20 digit number starting with the digit "1".
|
|
// - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller
|
|
// than 1,553,255,926,290,448,384.
|
|
// - That is smaller than the smallest possible 20-digit number the user could write:
|
|
// 10,000,000,000,000,000,000.
|
|
// - Therefore, if the number is positive and lower than that, it's overflow.
|
|
// - The value we are looking at is less than or equal to INT64_MAX.
|
|
//
|
|
} else if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INVALID_NUMBER(src); }
|
|
}
|
|
|
|
// Write unsigned if it does not fit in a signed integer.
|
|
if (i > uint64_t(INT64_MAX)) {
|
|
WRITE_UNSIGNED(i, src, writer);
|
|
} else {
|
|
WRITE_INTEGER(negative ? (~i+1) : i, src, writer);
|
|
}
|
|
if (jsoncharutils::is_not_structural_or_whitespace(*p)) { return INVALID_NUMBER(src); }
|
|
return SUCCESS;
|
|
}
|
|
|
|
// Inlineable functions
|
|
namespace {
|
|
|
|
// This table can be used to characterize the final character of an integer
|
|
// string. For JSON structural character and allowable white space characters,
|
|
// we return SUCCESS. For 'e', '.' and 'E', we return INCORRECT_TYPE. Otherwise
|
|
// we return NUMBER_ERROR.
|
|
// Optimization note: we could easily reduce the size of the table by half (to 128)
|
|
// at the cost of an extra branch.
|
|
// Optimization note: we want the values to use at most 8 bits (not, e.g., 32 bits):
|
|
static_assert(error_code(uint8_t(NUMBER_ERROR))== NUMBER_ERROR, "bad NUMBER_ERROR cast");
|
|
static_assert(error_code(uint8_t(SUCCESS))== SUCCESS, "bad NUMBER_ERROR cast");
|
|
static_assert(error_code(uint8_t(INCORRECT_TYPE))== INCORRECT_TYPE, "bad NUMBER_ERROR cast");
|
|
|
|
const uint8_t integer_string_finisher[256] = {
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, SUCCESS,
|
|
SUCCESS, NUMBER_ERROR, NUMBER_ERROR, SUCCESS, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, SUCCESS, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, SUCCESS,
|
|
NUMBER_ERROR, INCORRECT_TYPE, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, SUCCESS, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, INCORRECT_TYPE,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, SUCCESS, NUMBER_ERROR, SUCCESS, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, INCORRECT_TYPE, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, SUCCESS, NUMBER_ERROR,
|
|
SUCCESS, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR};
|
|
|
|
// Parse any number from 0 to 18,446,744,073,709,551,615
|
|
simdjson_unused simdjson_inline simdjson_result<uint64_t> parse_unsigned(const uint8_t * const src) noexcept {
|
|
const uint8_t *p = src;
|
|
//
|
|
// Parse the integer part.
|
|
//
|
|
// PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
|
|
const uint8_t *const start_digits = p;
|
|
uint64_t i = 0;
|
|
while (parse_digit(*p, i)) { p++; }
|
|
|
|
// If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
|
|
// Optimization note: size_t is expected to be unsigned.
|
|
size_t digit_count = size_t(p - start_digits);
|
|
// The longest positive 64-bit number is 20 digits.
|
|
// We do it this way so we don't trigger this branch unless we must.
|
|
// Optimization note: the compiler can probably merge
|
|
// ((digit_count == 0) || (digit_count > 20))
|
|
// into a single branch since digit_count is unsigned.
|
|
if ((digit_count == 0) || (digit_count > 20)) { return INCORRECT_TYPE; }
|
|
// Here digit_count > 0.
|
|
if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; }
|
|
// We can do the following...
|
|
// if (!jsoncharutils::is_structural_or_whitespace(*p)) {
|
|
// return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR;
|
|
// }
|
|
// as a single table lookup:
|
|
if (integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); }
|
|
|
|
if (digit_count == 20) {
|
|
// Positive overflow check:
|
|
// - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the
|
|
// biggest uint64_t.
|
|
// - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX.
|
|
// If we got here, it's a 20 digit number starting with the digit "1".
|
|
// - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller
|
|
// than 1,553,255,926,290,448,384.
|
|
// - That is smaller than the smallest possible 20-digit number the user could write:
|
|
// 10,000,000,000,000,000,000.
|
|
// - Therefore, if the number is positive and lower than that, it's overflow.
|
|
// - The value we are looking at is less than or equal to INT64_MAX.
|
|
//
|
|
if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INCORRECT_TYPE; }
|
|
}
|
|
|
|
return i;
|
|
}
|
|
|
|
|
|
// Parse any number from 0 to 18,446,744,073,709,551,615
|
|
// Never read at src_end or beyond
|
|
simdjson_unused simdjson_inline simdjson_result<uint64_t> parse_unsigned(const uint8_t * const src, const uint8_t * const src_end) noexcept {
|
|
const uint8_t *p = src;
|
|
//
|
|
// Parse the integer part.
|
|
//
|
|
// PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
|
|
const uint8_t *const start_digits = p;
|
|
uint64_t i = 0;
|
|
while ((p != src_end) && parse_digit(*p, i)) { p++; }
|
|
|
|
// If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
|
|
// Optimization note: size_t is expected to be unsigned.
|
|
size_t digit_count = size_t(p - start_digits);
|
|
// The longest positive 64-bit number is 20 digits.
|
|
// We do it this way so we don't trigger this branch unless we must.
|
|
// Optimization note: the compiler can probably merge
|
|
// ((digit_count == 0) || (digit_count > 20))
|
|
// into a single branch since digit_count is unsigned.
|
|
if ((digit_count == 0) || (digit_count > 20)) { return INCORRECT_TYPE; }
|
|
// Here digit_count > 0.
|
|
if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; }
|
|
// We can do the following...
|
|
// if (!jsoncharutils::is_structural_or_whitespace(*p)) {
|
|
// return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR;
|
|
// }
|
|
// as a single table lookup:
|
|
if ((p != src_end) && integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); }
|
|
|
|
if (digit_count == 20) {
|
|
// Positive overflow check:
|
|
// - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the
|
|
// biggest uint64_t.
|
|
// - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX.
|
|
// If we got here, it's a 20 digit number starting with the digit "1".
|
|
// - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller
|
|
// than 1,553,255,926,290,448,384.
|
|
// - That is smaller than the smallest possible 20-digit number the user could write:
|
|
// 10,000,000,000,000,000,000.
|
|
// - Therefore, if the number is positive and lower than that, it's overflow.
|
|
// - The value we are looking at is less than or equal to INT64_MAX.
|
|
//
|
|
if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INCORRECT_TYPE; }
|
|
}
|
|
|
|
return i;
|
|
}
|
|
|
|
// Parse any number from 0 to 18,446,744,073,709,551,615
|
|
simdjson_unused simdjson_inline simdjson_result<uint64_t> parse_unsigned_in_string(const uint8_t * const src) noexcept {
|
|
const uint8_t *p = src + 1;
|
|
//
|
|
// Parse the integer part.
|
|
//
|
|
// PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
|
|
const uint8_t *const start_digits = p;
|
|
uint64_t i = 0;
|
|
while (parse_digit(*p, i)) { p++; }
|
|
|
|
// If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
|
|
// Optimization note: size_t is expected to be unsigned.
|
|
size_t digit_count = size_t(p - start_digits);
|
|
// The longest positive 64-bit number is 20 digits.
|
|
// We do it this way so we don't trigger this branch unless we must.
|
|
// Optimization note: the compiler can probably merge
|
|
// ((digit_count == 0) || (digit_count > 20))
|
|
// into a single branch since digit_count is unsigned.
|
|
if ((digit_count == 0) || (digit_count > 20)) { return INCORRECT_TYPE; }
|
|
// Here digit_count > 0.
|
|
if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; }
|
|
// We can do the following...
|
|
// if (!jsoncharutils::is_structural_or_whitespace(*p)) {
|
|
// return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR;
|
|
// }
|
|
// as a single table lookup:
|
|
if (*p != '"') { return NUMBER_ERROR; }
|
|
|
|
if (digit_count == 20) {
|
|
// Positive overflow check:
|
|
// - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the
|
|
// biggest uint64_t.
|
|
// - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX.
|
|
// If we got here, it's a 20 digit number starting with the digit "1".
|
|
// - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller
|
|
// than 1,553,255,926,290,448,384.
|
|
// - That is smaller than the smallest possible 20-digit number the user could write:
|
|
// 10,000,000,000,000,000,000.
|
|
// - Therefore, if the number is positive and lower than that, it's overflow.
|
|
// - The value we are looking at is less than or equal to INT64_MAX.
|
|
//
|
|
// Note: we use src[1] and not src[0] because src[0] is the quote character in this
|
|
// instance.
|
|
if (src[1] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INCORRECT_TYPE; }
|
|
}
|
|
|
|
return i;
|
|
}
|
|
|
|
// Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
|
|
simdjson_unused simdjson_inline simdjson_result<int64_t> parse_integer(const uint8_t *src) noexcept {
|
|
//
|
|
// Check for minus sign
|
|
//
|
|
bool negative = (*src == '-');
|
|
const uint8_t *p = src + uint8_t(negative);
|
|
|
|
//
|
|
// Parse the integer part.
|
|
//
|
|
// PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
|
|
const uint8_t *const start_digits = p;
|
|
uint64_t i = 0;
|
|
while (parse_digit(*p, i)) { p++; }
|
|
|
|
// If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
|
|
// Optimization note: size_t is expected to be unsigned.
|
|
size_t digit_count = size_t(p - start_digits);
|
|
// We go from
|
|
// -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
|
|
// so we can never represent numbers that have more than 19 digits.
|
|
size_t longest_digit_count = 19;
|
|
// Optimization note: the compiler can probably merge
|
|
// ((digit_count == 0) || (digit_count > longest_digit_count))
|
|
// into a single branch since digit_count is unsigned.
|
|
if ((digit_count == 0) || (digit_count > longest_digit_count)) { return INCORRECT_TYPE; }
|
|
// Here digit_count > 0.
|
|
if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; }
|
|
// We can do the following...
|
|
// if (!jsoncharutils::is_structural_or_whitespace(*p)) {
|
|
// return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR;
|
|
// }
|
|
// as a single table lookup:
|
|
if(integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); }
|
|
// Negative numbers have can go down to - INT64_MAX - 1 whereas positive numbers are limited to INT64_MAX.
|
|
// Performance note: This check is only needed when digit_count == longest_digit_count but it is
|
|
// so cheap that we might as well always make it.
|
|
if(i > uint64_t(INT64_MAX) + uint64_t(negative)) { return INCORRECT_TYPE; }
|
|
return negative ? (~i+1) : i;
|
|
}
|
|
|
|
// Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
|
|
// Never read at src_end or beyond
|
|
simdjson_unused simdjson_inline simdjson_result<int64_t> parse_integer(const uint8_t * const src, const uint8_t * const src_end) noexcept {
|
|
//
|
|
// Check for minus sign
|
|
//
|
|
if(src == src_end) { return NUMBER_ERROR; }
|
|
bool negative = (*src == '-');
|
|
const uint8_t *p = src + uint8_t(negative);
|
|
|
|
//
|
|
// Parse the integer part.
|
|
//
|
|
// PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
|
|
const uint8_t *const start_digits = p;
|
|
uint64_t i = 0;
|
|
while ((p != src_end) && parse_digit(*p, i)) { p++; }
|
|
|
|
// If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
|
|
// Optimization note: size_t is expected to be unsigned.
|
|
size_t digit_count = size_t(p - start_digits);
|
|
// We go from
|
|
// -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
|
|
// so we can never represent numbers that have more than 19 digits.
|
|
size_t longest_digit_count = 19;
|
|
// Optimization note: the compiler can probably merge
|
|
// ((digit_count == 0) || (digit_count > longest_digit_count))
|
|
// into a single branch since digit_count is unsigned.
|
|
if ((digit_count == 0) || (digit_count > longest_digit_count)) { return INCORRECT_TYPE; }
|
|
// Here digit_count > 0.
|
|
if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; }
|
|
// We can do the following...
|
|
// if (!jsoncharutils::is_structural_or_whitespace(*p)) {
|
|
// return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR;
|
|
// }
|
|
// as a single table lookup:
|
|
if((p != src_end) && integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); }
|
|
// Negative numbers have can go down to - INT64_MAX - 1 whereas positive numbers are limited to INT64_MAX.
|
|
// Performance note: This check is only needed when digit_count == longest_digit_count but it is
|
|
// so cheap that we might as well always make it.
|
|
if(i > uint64_t(INT64_MAX) + uint64_t(negative)) { return INCORRECT_TYPE; }
|
|
return negative ? (~i+1) : i;
|
|
}
|
|
|
|
// Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
|
|
simdjson_unused simdjson_inline simdjson_result<int64_t> parse_integer_in_string(const uint8_t *src) noexcept {
|
|
//
|
|
// Check for minus sign
|
|
//
|
|
bool negative = (*(src + 1) == '-');
|
|
src += uint8_t(negative) + 1;
|
|
|
|
//
|
|
// Parse the integer part.
|
|
//
|
|
// PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
|
|
const uint8_t *const start_digits = src;
|
|
uint64_t i = 0;
|
|
while (parse_digit(*src, i)) { src++; }
|
|
|
|
// If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
|
|
// Optimization note: size_t is expected to be unsigned.
|
|
size_t digit_count = size_t(src - start_digits);
|
|
// We go from
|
|
// -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
|
|
// so we can never represent numbers that have more than 19 digits.
|
|
size_t longest_digit_count = 19;
|
|
// Optimization note: the compiler can probably merge
|
|
// ((digit_count == 0) || (digit_count > longest_digit_count))
|
|
// into a single branch since digit_count is unsigned.
|
|
if ((digit_count == 0) || (digit_count > longest_digit_count)) { return INCORRECT_TYPE; }
|
|
// Here digit_count > 0.
|
|
if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; }
|
|
// We can do the following...
|
|
// if (!jsoncharutils::is_structural_or_whitespace(*src)) {
|
|
// return (*src == '.' || *src == 'e' || *src == 'E') ? INCORRECT_TYPE : NUMBER_ERROR;
|
|
// }
|
|
// as a single table lookup:
|
|
if(*src != '"') { return NUMBER_ERROR; }
|
|
// Negative numbers have can go down to - INT64_MAX - 1 whereas positive numbers are limited to INT64_MAX.
|
|
// Performance note: This check is only needed when digit_count == longest_digit_count but it is
|
|
// so cheap that we might as well always make it.
|
|
if(i > uint64_t(INT64_MAX) + uint64_t(negative)) { return INCORRECT_TYPE; }
|
|
return negative ? (~i+1) : i;
|
|
}
|
|
|
|
// Parse the number at src as a double.
//
// Returns:
//   - INCORRECT_TYPE if there is no integer digit;
//   - NUMBER_ERROR if a leading zero is followed by further digits, if a '.'
//     is not followed by a digit, if the exponent has no digits or more than
//     19 digits, if the number is not followed by a structural or whitespace
//     character, or if the slow-path parser rejects the input;
//   - the parsed value otherwise.
simdjson_unused simdjson_inline simdjson_result<double> parse_double(const uint8_t * src) noexcept {
  // An optional minus sign comes first; `first` is the start of the digits
  // while `src` keeps pointing at the start of the whole number.
  const bool negative = (*src == '-');
  const uint8_t *const first = src + uint8_t(negative);

  // Integer part.
  uint64_t mantissa = 0;
  const uint8_t *p = first;
  p += parse_digit(*p, mantissa);
  const bool leading_zero = (mantissa == 0);
  while (parse_digit(*p, mantissa)) { p++; }
  // No integer digits at all.
  if (p == first) { return INCORRECT_TYPE; }
  // Something like 0123: a zero must be solo.
  if (leading_zero && (p != first + 1)) { return NUMBER_ERROR; }

  // Decimal part.
  int64_t exponent = 0;
  bool overflow;
  if (simdjson_likely(*p == '.')) {
    p++;
    const uint8_t *const fraction_start = p;
    // At least one decimal digit is required.
    if (!parse_digit(*p, mantissa)) { return NUMBER_ERROR; }
    p++;
    while (parse_digit(*p, mantissa)) { p++; }
    exponent = -(p - fraction_start);

    // More than 19 digits (not counting the '.') may have overflowed the mantissa.
    overflow = p - first - 1 > 19;
    if (simdjson_unlikely(overflow && leading_zero)) {
      // Skip the leading "0.00000" and see whether what remains still overflows.
      const uint8_t *first_nonzero = first + 2;
      while (*first_nonzero == '0') { first_nonzero++; }
      overflow = p - first_nonzero > 19;
    }
  } else {
    overflow = p - first > 19;
  }

  // Exponent.
  if (*p == 'e' || *p == 'E') {
    p++;
    const bool exp_neg = *p == '-';
    p += exp_neg || *p == '+';

    uint64_t exp = 0;
    const uint8_t *const exp_start = p;
    while (parse_digit(*p, exp)) { p++; }
    // Reject a missing exponent as well as one with 20 or more digits.
    if (p - exp_start == 0 || p - exp_start > 19) { return NUMBER_ERROR; }

    exponent += exp_neg ? 0-exp : exp;
  }

  if (jsoncharutils::is_not_structural_or_whitespace(*p)) { return NUMBER_ERROR; }

  overflow = overflow || exponent < simdjson::internal::smallest_power || exponent > simdjson::internal::largest_power;

  // Assemble the double on the fast path when possible, otherwise reparse the
  // whole number (sign included) on the slow path.
  double d;
  if (simdjson_likely(!overflow)) {
    if (compute_float_64(exponent, mantissa, negative, d)) { return d; }
  }
  if (!parse_float_fallback(src, &d)) {
    return NUMBER_ERROR;
  }
  return d;
}
|
|
|
|
// Tells whether the number at src starts with a minus sign.
simdjson_unused simdjson_inline bool is_negative(const uint8_t * src) noexcept {
  return src[0] == '-';
}
|
|
|
|
// Tells whether the number at src is written as an integer: an optional '-',
// one or more digits, then a structural or whitespace character.
// Returns NUMBER_ERROR when there is no digit, true for an integer, and false
// when the digits are followed by anything else.
simdjson_unused simdjson_inline simdjson_result<bool> is_integer(const uint8_t * src) noexcept {
  // Step over the optional sign.
  const uint8_t *const first_digit = src + uint8_t(*src == '-');
  // Step over the digits; the unsigned subtraction maps '0'..'9' to 0..9 and
  // every other byte to a larger value.
  const uint8_t *cursor = first_digit;
  while (static_cast<uint8_t>(*cursor - '0') <= 9) { cursor++; }
  if (cursor == first_digit) { return NUMBER_ERROR; }
  if (!jsoncharutils::is_structural_or_whitespace(*cursor)) { return false; }
  return true;
}
|
|
|
|
// Classifies the number at src without computing its value.
//
// Returns NUMBER_ERROR when there is no digit after the optional '-'.
// When the digits are followed by a structural or whitespace character the
// number is an integer, classified as:
//   - big_integer      if it does not fit in 64 bits (below -2**63, or at least 2**64);
//   - signed_integer   if it is negative, or positive and below 2**63;
//   - unsigned_integer if it lies in [2**63, 2**64).
// Otherwise (hopefully because of a '.', 'e' or 'E') it is reported as a
// floating_point_number.
simdjson_unused simdjson_inline simdjson_result<number_type> get_number_type(const uint8_t * src) noexcept {
  const bool negative = (*src == '-');
  src += uint8_t(negative);
  // Step over the digits; the unsigned subtraction maps '0'..'9' to 0..9 and
  // every other byte to a larger value.
  const uint8_t *p = src;
  while (static_cast<uint8_t>(*p - '0') <= 9) { p++; }
  const size_t digit_count = size_t(p - src);
  if (digit_count == 0) { return NUMBER_ERROR; }

  if (!jsoncharutils::is_structural_or_whitespace(*p)) {
    // Hopefully, we have 'e' or 'E' or '.'.
    return number_type::floating_point_number;
  }

  // We have an integer. Its magnitude is compared as text against 2**63.
  static const uint8_t * two_to_sixtythree = reinterpret_cast<const uint8_t *>("9223372036854775808");
  if (simdjson_unlikely(digit_count > 20)) {
    return number_type::big_integer;
  }
  if (negative) {
    // A negative number that is valid must be a signed integer.
    if (simdjson_unlikely(digit_count > 19)) return number_type::big_integer;
    if (simdjson_unlikely(digit_count == 19 && memcmp(src, two_to_sixtythree, 19) > 0)) {
      return number_type::big_integer;
    }
    return number_type::signed_integer;
  }

  // Positive number with at most 20 digits: check against 2**64 first.
  static const uint8_t * two_to_sixtyfour = reinterpret_cast<const uint8_t *>("18446744073709551616");
  if (digit_count == 20) {
    // Every 20-digit value is at least 2**63, so it is either too big or unsigned.
    if (memcmp(src, two_to_sixtyfour, 20) >= 0) {
      return number_type::big_integer;
    }
    return number_type::unsigned_integer;
  }
  // Values larger or equal to 9223372036854775808 are unsigned integers,
  // the remaining ones are signed integers.
  if (digit_count == 19 && memcmp(src, two_to_sixtythree, 19) >= 0) {
    return number_type::unsigned_integer;
  }
  return number_type::signed_integer;
}
|
|
|
|
// Never read at src_end or beyond
|
|
simdjson_unused simdjson_inline simdjson_result<double> parse_double(const uint8_t * src, const uint8_t * const src_end) noexcept {
|
|
if(src == src_end) { return NUMBER_ERROR; }
|
|
//
|
|
// Check for minus sign
|
|
//
|
|
bool negative = (*src == '-');
|
|
src += uint8_t(negative);
|
|
|
|
//
|
|
// Parse the integer part.
|
|
//
|
|
uint64_t i = 0;
|
|
const uint8_t *p = src;
|
|
if(p == src_end) { return NUMBER_ERROR; }
|
|
p += parse_digit(*p, i);
|
|
bool leading_zero = (i == 0);
|
|
while ((p != src_end) && parse_digit(*p, i)) { p++; }
|
|
// no integer digits, or 0123 (zero must be solo)
|
|
if ( p == src ) { return INCORRECT_TYPE; }
|
|
if ( (leading_zero && p != src+1)) { return NUMBER_ERROR; }
|
|
|
|
//
|
|
// Parse the decimal part.
|
|
//
|
|
int64_t exponent = 0;
|
|
bool overflow;
|
|
if (simdjson_likely((p != src_end) && (*p == '.'))) {
|
|
p++;
|
|
const uint8_t *start_decimal_digits = p;
|
|
if ((p == src_end) || !parse_digit(*p, i)) { return NUMBER_ERROR; } // no decimal digits
|
|
p++;
|
|
while ((p != src_end) && parse_digit(*p, i)) { p++; }
|
|
exponent = -(p - start_decimal_digits);
|
|
|
|
// Overflow check. More than 19 digits (minus the decimal) may be overflow.
|
|
overflow = p-src-1 > 19;
|
|
if (simdjson_unlikely(overflow && leading_zero)) {
|
|
// Skip leading 0.00000 and see if it still overflows
|
|
const uint8_t *start_digits = src + 2;
|
|
while (*start_digits == '0') { start_digits++; }
|
|
overflow = start_digits-src > 19;
|
|
}
|
|
} else {
|
|
overflow = p-src > 19;
|
|
}
|
|
|
|
//
|
|
// Parse the exponent
|
|
//
|
|
if ((p != src_end) && (*p == 'e' || *p == 'E')) {
|
|
p++;
|
|
if(p == src_end) { return NUMBER_ERROR; }
|
|
bool exp_neg = *p == '-';
|
|
p += exp_neg || *p == '+';
|
|
|
|
uint64_t exp = 0;
|
|
const uint8_t *start_exp_digits = p;
|
|
while ((p != src_end) && parse_digit(*p, exp)) { p++; }
|
|
// no exp digits, or 20+ exp digits
|
|
if (p-start_exp_digits == 0 || p-start_exp_digits > 19) { return NUMBER_ERROR; }
|
|
|
|
exponent += exp_neg ? 0-exp : exp;
|
|
}
|
|
|
|
if ((p != src_end) && jsoncharutils::is_not_structural_or_whitespace(*p)) { return NUMBER_ERROR; }
|
|
|
|
overflow = overflow || exponent < simdjson::internal::smallest_power || exponent > simdjson::internal::largest_power;
|
|
|
|
//
|
|
// Assemble (or slow-parse) the float
|
|
//
|
|
double d;
|
|
if (simdjson_likely(!overflow)) {
|
|
if (compute_float_64(exponent, i, negative, d)) { return d; }
|
|
}
|
|
if (!parse_float_fallback(src - uint8_t(negative), src_end, &d)) {
|
|
return NUMBER_ERROR;
|
|
}
|
|
return d;
|
|
}
|
|
|
|
/**
 * Parse a number that is wrapped in a JSON string as a double.
 *
 * The byte at src is skipped unconditionally (presumably the opening quote --
 * confirm against callers); the number must be followed directly by '"'.
 *
 * @return the parsed value; INCORRECT_TYPE when no integer digit is found;
 *         NUMBER_ERROR for malformed numbers, a missing closing quote, or a
 *         failing fallback parser.
 */
simdjson_unused simdjson_inline simdjson_result<double> parse_double_in_string(const uint8_t * src) noexcept {
  // Step over the first byte and an optional minus sign.
  const bool negative = (src[1] == '-');
  src += uint8_t(negative) + 1;

  // Integer part: at least one digit, and a leading zero must stand alone.
  uint64_t mantissa = 0;
  const uint8_t *cursor = src;
  cursor += parse_digit(*cursor, mantissa);
  const bool leading_zero = (mantissa == 0);
  while (parse_digit(*cursor, mantissa)) { ++cursor; }
  if (cursor == src) { return INCORRECT_TYPE; }
  if (leading_zero && (cursor != src + 1)) { return NUMBER_ERROR; }

  // Optional fraction. `overflow` flags inputs with more than 19 digits.
  int64_t exponent = 0;
  bool overflow;
  if (simdjson_likely(*cursor == '.')) {
    ++cursor;
    const uint8_t *const fraction_begin = cursor;
    if (!parse_digit(*cursor, mantissa)) { return NUMBER_ERROR; } // no decimal digits
    ++cursor;
    while (parse_digit(*cursor, mantissa)) { ++cursor; }
    exponent = -(cursor - fraction_begin);

    overflow = (cursor - src - 1) > 19;
    if (simdjson_unlikely(overflow && leading_zero)) {
      // Ignore the "0." prefix and the zeros that follow it, then recount.
      const uint8_t *first_nonzero = src + 2;
      while (*first_nonzero == '0') { ++first_nonzero; }
      overflow = (cursor - first_nonzero) > 19;
    }
  } else {
    overflow = (cursor - src) > 19;
  }

  // Optional exponent: between 1 and 19 digits after an optional sign.
  if ((*cursor == 'e') || (*cursor == 'E')) {
    ++cursor;
    const bool exp_neg = (*cursor == '-');
    cursor += (exp_neg || (*cursor == '+'));

    uint64_t exp_value = 0;
    const uint8_t *const exp_begin = cursor;
    while (parse_digit(*cursor, exp_value)) { ++cursor; }
    const auto exp_len = cursor - exp_begin;
    if ((exp_len == 0) || (exp_len > 19)) { return NUMBER_ERROR; }

    exponent += exp_neg ? 0-exp_value : exp_value;
  }

  // The number has to run right up to the closing quote.
  if (*cursor != '"') { return NUMBER_ERROR; }

  overflow = overflow || exponent < simdjson::internal::smallest_power || exponent > simdjson::internal::largest_power;

  // Fast path first; otherwise re-parse from the first byte of the number.
  double result;
  if (simdjson_likely(!overflow)) {
    if (compute_float_64(exponent, mantissa, negative, result)) { return result; }
  }
  if (!parse_float_fallback(src - uint8_t(negative), &result)) {
    return NUMBER_ERROR;
  }
  return result;
}
|
|
|
|
} // unnamed namespace
|
|
#endif // SIMDJSON_SKIPNUMBERPARSING
|
|
|
|
} // namespace numberparsing
|
|
|
|
inline std::ostream& operator<<(std::ostream& out, number_type type) noexcept {
|
|
switch (type) {
|
|
case number_type::signed_integer: out << "integer in [-9223372036854775808,9223372036854775808)"; break;
|
|
case number_type::unsigned_integer: out << "unsigned integer in [9223372036854775808,18446744073709551616)"; break;
|
|
case number_type::floating_point_number: out << "floating-point number (binary64)"; break;
|
|
case number_type::big_integer: out << "big integer"; break;
|
|
default: SIMDJSON_UNREACHABLE();
|
|
}
|
|
return out;
|
|
}
|
|
|
|
} // namespace ppc64
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_GENERIC_NUMBERPARSING_H
|
|
/* end file simdjson/generic/numberparsing.h for ppc64 */
|
|
|
|
/* including simdjson/generic/implementation_simdjson_result_base-inl.h for ppc64: #include "simdjson/generic/implementation_simdjson_result_base-inl.h" */
|
|
/* begin file simdjson/generic/implementation_simdjson_result_base-inl.h for ppc64 */
|
|
#ifndef SIMDJSON_GENERIC_IMPLEMENTATION_SIMDJSON_RESULT_BASE_INL_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_GENERIC_IMPLEMENTATION_SIMDJSON_RESULT_BASE_INL_H */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/generic/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/generic/implementation_simdjson_result_base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace ppc64 {
|
|
|
|
//
|
|
// internal::implementation_simdjson_result_base<T> inline implementation
|
|
//
|
|
|
|
template<typename T>
|
|
simdjson_inline void implementation_simdjson_result_base<T>::tie(T &value, error_code &error) && noexcept {
|
|
error = this->second;
|
|
if (!error) {
|
|
value = std::forward<implementation_simdjson_result_base<T>>(*this).first;
|
|
}
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_warn_unused simdjson_inline error_code implementation_simdjson_result_base<T>::get(T &value) && noexcept {
|
|
error_code error;
|
|
std::forward<implementation_simdjson_result_base<T>>(*this).tie(value, error);
|
|
return error;
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_inline error_code implementation_simdjson_result_base<T>::error() const noexcept {
|
|
return this->second;
|
|
}
|
|
|
|
#if SIMDJSON_EXCEPTIONS
|
|
|
|
template<typename T>
|
|
simdjson_inline T& implementation_simdjson_result_base<T>::value() & noexcept(false) {
|
|
if (error()) { throw simdjson_error(error()); }
|
|
return this->first;
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_inline T&& implementation_simdjson_result_base<T>::value() && noexcept(false) {
|
|
return std::forward<implementation_simdjson_result_base<T>>(*this).take_value();
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_inline T&& implementation_simdjson_result_base<T>::take_value() && noexcept(false) {
|
|
if (error()) { throw simdjson_error(error()); }
|
|
return std::forward<T>(this->first);
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_inline implementation_simdjson_result_base<T>::operator T&&() && noexcept(false) {
|
|
return std::forward<implementation_simdjson_result_base<T>>(*this).take_value();
|
|
}
|
|
|
|
#endif // SIMDJSON_EXCEPTIONS
|
|
|
|
template<typename T>
|
|
simdjson_inline const T& implementation_simdjson_result_base<T>::value_unsafe() const& noexcept {
|
|
return this->first;
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_inline T& implementation_simdjson_result_base<T>::value_unsafe() & noexcept {
|
|
return this->first;
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_inline T&& implementation_simdjson_result_base<T>::value_unsafe() && noexcept {
|
|
return std::forward<T>(this->first);
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_inline implementation_simdjson_result_base<T>::implementation_simdjson_result_base(T &&value, error_code error) noexcept
|
|
: first{std::forward<T>(value)}, second{error} {}
|
|
template<typename T>
|
|
simdjson_inline implementation_simdjson_result_base<T>::implementation_simdjson_result_base(error_code error) noexcept
|
|
: implementation_simdjson_result_base(T{}, error) {}
|
|
template<typename T>
|
|
simdjson_inline implementation_simdjson_result_base<T>::implementation_simdjson_result_base(T &&value) noexcept
|
|
: implementation_simdjson_result_base(std::forward<T>(value), SUCCESS) {}
|
|
|
|
} // namespace ppc64
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_GENERIC_IMPLEMENTATION_SIMDJSON_RESULT_BASE_INL_H
|
|
/* end file simdjson/generic/implementation_simdjson_result_base-inl.h for ppc64 */
|
|
/* end file simdjson/generic/amalgamated.h for ppc64 */
|
|
/* including simdjson/ppc64/end.h: #include "simdjson/ppc64/end.h" */
|
|
/* begin file simdjson/ppc64/end.h */
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/ppc64/base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
#undef SIMDJSON_SKIP_BACKSLASH_SHORT_CIRCUIT
|
|
/* undefining SIMDJSON_IMPLEMENTATION from "ppc64" */
|
|
#undef SIMDJSON_IMPLEMENTATION
|
|
/* end file simdjson/ppc64/end.h */
|
|
|
|
#endif // SIMDJSON_PPC64_H
|
|
/* end file simdjson/ppc64.h */
|
|
/* including simdjson/ppc64/implementation.h: #include <simdjson/ppc64/implementation.h> */
|
|
/* begin file simdjson/ppc64/implementation.h */
|
|
#ifndef SIMDJSON_PPC64_IMPLEMENTATION_H
|
|
#define SIMDJSON_PPC64_IMPLEMENTATION_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/ppc64/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/implementation.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/instruction_set.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
|
|
/**
|
|
* Implementation for ALTIVEC (PPC64).
|
|
*/
|
|
namespace ppc64 {
|
|
|
|
/**
|
|
* @private
|
|
*/
|
|
class implementation final : public simdjson::implementation {
|
|
public:
|
|
simdjson_inline implementation()
|
|
: simdjson::implementation("ppc64", "PPC64 ALTIVEC",
|
|
internal::instruction_set::ALTIVEC) {}
|
|
|
|
simdjson_warn_unused error_code create_dom_parser_implementation(
|
|
size_t capacity, size_t max_length,
|
|
std::unique_ptr<internal::dom_parser_implementation> &dst)
|
|
const noexcept final;
|
|
simdjson_warn_unused error_code minify(const uint8_t *buf, size_t len,
|
|
uint8_t *dst,
|
|
size_t &dst_len) const noexcept final;
|
|
simdjson_warn_unused bool validate_utf8(const char *buf,
|
|
size_t len) const noexcept final;
|
|
};
|
|
|
|
} // namespace ppc64
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_PPC64_IMPLEMENTATION_H
|
|
/* end file simdjson/ppc64/implementation.h */
|
|
|
|
/* including simdjson/ppc64/begin.h: #include <simdjson/ppc64/begin.h> */
|
|
/* begin file simdjson/ppc64/begin.h */
|
|
/* defining SIMDJSON_IMPLEMENTATION to "ppc64" */
|
|
#define SIMDJSON_IMPLEMENTATION ppc64
|
|
/* including simdjson/ppc64/base.h: #include "simdjson/ppc64/base.h" */
|
|
/* begin file simdjson/ppc64/base.h */
|
|
#ifndef SIMDJSON_PPC64_BASE_H
|
|
#define SIMDJSON_PPC64_BASE_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
/**
|
|
* Implementation for ALTIVEC (PPC64).
|
|
*/
|
|
namespace ppc64 {
|
|
|
|
// Forward declaration of the ppc64 implementation class.
class implementation;

namespace {
namespace simd {

// Forward declarations of the SIMD wrapper templates.
template <typename T>
struct simd8;

template <typename T>
struct simd8x64;

} // namespace simd
} // unnamed namespace
|
|
|
|
} // namespace ppc64
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_PPC64_BASE_H
|
|
/* end file simdjson/ppc64/base.h */
|
|
/* including simdjson/ppc64/intrinsics.h: #include "simdjson/ppc64/intrinsics.h" */
|
|
/* begin file simdjson/ppc64/intrinsics.h */
|
|
#ifndef SIMDJSON_PPC64_INTRINSICS_H
|
|
#define SIMDJSON_PPC64_INTRINSICS_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/ppc64/base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
// This should be the correct header whether
|
|
// you use visual studio or other compilers.
|
|
#include <altivec.h>
|
|
|
|
// These are defined by altivec.h in GCC toolchain, it is safe to undef them.
|
|
#ifdef bool
|
|
#undef bool
|
|
#endif
|
|
|
|
#ifdef vector
|
|
#undef vector
|
|
#endif
|
|
|
|
static_assert(sizeof(__vector unsigned char) <= simdjson::SIMDJSON_PADDING, "insufficient padding for ppc64");
|
|
|
|
#endif // SIMDJSON_PPC64_INTRINSICS_H
|
|
/* end file simdjson/ppc64/intrinsics.h */
|
|
/* including simdjson/ppc64/bitmanipulation.h: #include "simdjson/ppc64/bitmanipulation.h" */
|
|
/* begin file simdjson/ppc64/bitmanipulation.h */
|
|
#ifndef SIMDJSON_PPC64_BITMANIPULATION_H
|
|
#define SIMDJSON_PPC64_BITMANIPULATION_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/ppc64/base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace ppc64 {
|
|
namespace {
|
|
|
|
// We sometimes call trailing_zero on inputs that are zero,
|
|
// but the algorithms do not end up using the returned value.
|
|
// Sadly, sanitizers are not smart enough to figure it out.
|
|
SIMDJSON_NO_SANITIZE_UNDEFINED
|
|
// This function can be used safely even if not all bytes have been
|
|
// initialized.
|
|
// See issue https://github.com/simdjson/simdjson/issues/1965
|
|
SIMDJSON_NO_SANITIZE_MEMORY
|
|
simdjson_inline int trailing_zeroes(uint64_t input_num) {
|
|
#if SIMDJSON_REGULAR_VISUAL_STUDIO
|
|
unsigned long ret;
|
|
// Search the mask data from least significant bit (LSB)
|
|
// to the most significant bit (MSB) for a set bit (1).
|
|
_BitScanForward64(&ret, input_num);
|
|
return (int)ret;
|
|
#else // SIMDJSON_REGULAR_VISUAL_STUDIO
|
|
return __builtin_ctzll(input_num);
|
|
#endif // SIMDJSON_REGULAR_VISUAL_STUDIO
|
|
}
|
|
|
|
/* result might be undefined when input_num is zero */
|
|
simdjson_inline uint64_t clear_lowest_bit(uint64_t input_num) {
|
|
return input_num & (input_num - 1);
|
|
}
|
|
|
|
/* result might be undefined when input_num is zero */
|
|
simdjson_inline int leading_zeroes(uint64_t input_num) {
|
|
#if SIMDJSON_REGULAR_VISUAL_STUDIO
|
|
unsigned long leading_zero = 0;
|
|
// Search the mask data from most significant bit (MSB)
|
|
// to least significant bit (LSB) for a set bit (1).
|
|
if (_BitScanReverse64(&leading_zero, input_num))
|
|
return (int)(63 - leading_zero);
|
|
else
|
|
return 64;
|
|
#else
|
|
return __builtin_clzll(input_num);
|
|
#endif // SIMDJSON_REGULAR_VISUAL_STUDIO
|
|
}
|
|
|
|
#if SIMDJSON_REGULAR_VISUAL_STUDIO
|
|
simdjson_inline int count_ones(uint64_t input_num) {
|
|
// note: we do not support legacy 32-bit Windows in this kernel
|
|
return __popcnt64(input_num); // Visual Studio wants two underscores
|
|
}
|
|
#else
|
|
simdjson_inline int count_ones(uint64_t input_num) {
|
|
return __builtin_popcountll(input_num);
|
|
}
|
|
#endif
|
|
|
|
// Stores value1 + value2 (wrapping) in *result and returns true when the
// addition overflowed 64 bits.
simdjson_inline bool add_overflow(uint64_t value1, uint64_t value2,
                                  uint64_t *result) {
#if SIMDJSON_REGULAR_VISUAL_STUDIO
  const uint64_t sum = value1 + value2;
  *result = sum;
  // A wrapped sum is smaller than either operand.
  return sum < value1;
#else
  unsigned long long *const out = reinterpret_cast<unsigned long long *>(result);
  return __builtin_uaddll_overflow(value1, value2, out);
#endif
}
|
|
|
|
} // unnamed namespace
|
|
} // namespace ppc64
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_PPC64_BITMANIPULATION_H
|
|
/* end file simdjson/ppc64/bitmanipulation.h */
|
|
/* including simdjson/ppc64/bitmask.h: #include "simdjson/ppc64/bitmask.h" */
|
|
/* begin file simdjson/ppc64/bitmask.h */
|
|
#ifndef SIMDJSON_PPC64_BITMASK_H
|
|
#define SIMDJSON_PPC64_BITMASK_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/ppc64/base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace ppc64 {
|
|
namespace {
|
|
|
|
//
|
|
// Perform a "cumulative bitwise xor," flipping bits each time a 1 is
|
|
// encountered.
|
|
//
|
|
// For example, prefix_xor(00100100) == 00011100
|
|
//
|
|
simdjson_inline uint64_t prefix_xor(uint64_t bitmask) {
|
|
// You can use the version below, however gcc sometimes miscompiles
|
|
// vec_pmsum_be, it happens somewhere around between 8 and 9th version.
|
|
// The performance boost was not noticeable, falling back to a usual
|
|
// implementation.
|
|
// __vector unsigned long long all_ones = {~0ull, ~0ull};
|
|
// __vector unsigned long long mask = {bitmask, 0};
|
|
// // Clang and GCC return different values for pmsum for ull so cast it to one.
|
|
// // Generally it is not specified by ALTIVEC ISA what is returned by
|
|
// // vec_pmsum_be.
|
|
// #if defined(__LITTLE_ENDIAN__)
|
|
// return (uint64_t)(((__vector unsigned long long)vec_pmsum_be(all_ones, mask))[0]);
|
|
// #else
|
|
// return (uint64_t)(((__vector unsigned long long)vec_pmsum_be(all_ones, mask))[1]);
|
|
// #endif
|
|
bitmask ^= bitmask << 1;
|
|
bitmask ^= bitmask << 2;
|
|
bitmask ^= bitmask << 4;
|
|
bitmask ^= bitmask << 8;
|
|
bitmask ^= bitmask << 16;
|
|
bitmask ^= bitmask << 32;
|
|
return bitmask;
|
|
}
|
|
|
|
} // unnamed namespace
|
|
} // namespace ppc64
|
|
} // namespace simdjson
|
|
|
|
#endif
|
|
/* end file simdjson/ppc64/bitmask.h */
|
|
/* including simdjson/ppc64/numberparsing_defs.h: #include "simdjson/ppc64/numberparsing_defs.h" */
|
|
/* begin file simdjson/ppc64/numberparsing_defs.h */
|
|
#ifndef SIMDJSON_PPC64_NUMBERPARSING_DEFS_H
|
|
#define SIMDJSON_PPC64_NUMBERPARSING_DEFS_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/ppc64/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/ppc64/intrinsics.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/numberparsing_tables.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
#include <cstring>
|
|
|
|
#if defined(__linux__)
|
|
#include <byteswap.h>
|
|
#elif defined(__FreeBSD__)
|
|
#include <sys/endian.h>
|
|
#endif
|
|
|
|
namespace simdjson {
|
|
namespace ppc64 {
|
|
namespace numberparsing {
|
|
|
|
// we don't have appropriate instructions, so let us use a scalar function
|
|
// credit: https://johnnylee-sde.github.io/Fast-numeric-string-to-int/
|
|
/** @private */
|
|
static simdjson_inline uint32_t parse_eight_digits_unrolled(const uint8_t *chars) {
|
|
uint64_t val;
|
|
std::memcpy(&val, chars, sizeof(uint64_t));
|
|
#ifdef __BIG_ENDIAN__
|
|
#if defined(__linux__)
|
|
val = bswap_64(val);
|
|
#elif defined(__FreeBSD__)
|
|
val = bswap64(val);
|
|
#endif
|
|
#endif
|
|
val = (val & 0x0F0F0F0F0F0F0F0F) * 2561 >> 8;
|
|
val = (val & 0x00FF00FF00FF00FF) * 6553601 >> 16;
|
|
return uint32_t((val & 0x0000FFFF0000FFFF) * 42949672960001 >> 32);
|
|
}
|
|
|
|
/** @private */
|
|
simdjson_inline internal::value128 full_multiplication(uint64_t value1, uint64_t value2) {
|
|
internal::value128 answer;
|
|
#if SIMDJSON_REGULAR_VISUAL_STUDIO || SIMDJSON_IS_32BITS
|
|
#if SIMDJSON_IS_ARM64
|
|
// ARM64 has native support for 64-bit multiplications, no need to emultate
|
|
answer.high = __umulh(value1, value2);
|
|
answer.low = value1 * value2;
|
|
#else
|
|
answer.low = _umul128(value1, value2, &answer.high); // _umul128 not available on ARM64
|
|
#endif // SIMDJSON_IS_ARM64
|
|
#else // SIMDJSON_REGULAR_VISUAL_STUDIO || SIMDJSON_IS_32BITS
|
|
__uint128_t r = (static_cast<__uint128_t>(value1)) * value2;
|
|
answer.low = uint64_t(r);
|
|
answer.high = uint64_t(r >> 64);
|
|
#endif
|
|
return answer;
|
|
}
|
|
|
|
} // namespace numberparsing
|
|
} // namespace ppc64
|
|
} // namespace simdjson
|
|
|
|
#define SIMDJSON_SWAR_NUMBER_PARSING 1
|
|
|
|
#endif // SIMDJSON_PPC64_NUMBERPARSING_DEFS_H
|
|
/* end file simdjson/ppc64/numberparsing_defs.h */
|
|
/* including simdjson/ppc64/simd.h: #include "simdjson/ppc64/simd.h" */
|
|
/* begin file simdjson/ppc64/simd.h */
|
|
#ifndef SIMDJSON_PPC64_SIMD_H
|
|
#define SIMDJSON_PPC64_SIMD_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/ppc64/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/ppc64/bitmanipulation.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/simdprune_tables.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
#include <type_traits>
|
|
|
|
namespace simdjson {
|
|
namespace ppc64 {
|
|
namespace {
|
|
namespace simd {
|
|
|
|
using __m128i = __vector unsigned char;
|
|
|
|
template <typename Child> struct base {
|
|
__m128i value;
|
|
|
|
// Zero constructor
|
|
simdjson_inline base() : value{__m128i()} {}
|
|
|
|
// Conversion from SIMD register
|
|
simdjson_inline base(const __m128i _value) : value(_value) {}
|
|
|
|
// Conversion to SIMD register
|
|
simdjson_inline operator const __m128i &() const {
|
|
return this->value;
|
|
}
|
|
simdjson_inline operator __m128i &() { return this->value; }
|
|
|
|
// Bit operations
|
|
simdjson_inline Child operator|(const Child other) const {
|
|
return vec_or(this->value, (__m128i)other);
|
|
}
|
|
simdjson_inline Child operator&(const Child other) const {
|
|
return vec_and(this->value, (__m128i)other);
|
|
}
|
|
simdjson_inline Child operator^(const Child other) const {
|
|
return vec_xor(this->value, (__m128i)other);
|
|
}
|
|
simdjson_inline Child bit_andnot(const Child other) const {
|
|
return vec_andc(this->value, (__m128i)other);
|
|
}
|
|
simdjson_inline Child &operator|=(const Child other) {
|
|
auto this_cast = static_cast<Child*>(this);
|
|
*this_cast = *this_cast | other;
|
|
return *this_cast;
|
|
}
|
|
simdjson_inline Child &operator&=(const Child other) {
|
|
auto this_cast = static_cast<Child*>(this);
|
|
*this_cast = *this_cast & other;
|
|
return *this_cast;
|
|
}
|
|
simdjson_inline Child &operator^=(const Child other) {
|
|
auto this_cast = static_cast<Child*>(this);
|
|
*this_cast = *this_cast ^ other;
|
|
return *this_cast;
|
|
}
|
|
};
|
|
|
|
template <typename T, typename Mask = simd8<bool>>
|
|
struct base8 : base<simd8<T>> {
|
|
typedef uint16_t bitmask_t;
|
|
typedef uint32_t bitmask2_t;
|
|
|
|
simdjson_inline base8() : base<simd8<T>>() {}
|
|
simdjson_inline base8(const __m128i _value) : base<simd8<T>>(_value) {}
|
|
|
|
friend simdjson_inline Mask operator==(const simd8<T> lhs, const simd8<T> rhs) {
|
|
return (__m128i)vec_cmpeq(lhs.value, (__m128i)rhs);
|
|
}
|
|
|
|
static const int SIZE = sizeof(base<simd8<T>>::value);
|
|
|
|
template <int N = 1>
|
|
simdjson_inline simd8<T> prev(simd8<T> prev_chunk) const {
|
|
__m128i chunk = this->value;
|
|
#ifdef __LITTLE_ENDIAN__
|
|
chunk = (__m128i)vec_reve(this->value);
|
|
prev_chunk = (__m128i)vec_reve((__m128i)prev_chunk);
|
|
#endif
|
|
chunk = (__m128i)vec_sld((__m128i)prev_chunk, (__m128i)chunk, 16 - N);
|
|
#ifdef __LITTLE_ENDIAN__
|
|
chunk = (__m128i)vec_reve((__m128i)chunk);
|
|
#endif
|
|
return chunk;
|
|
}
|
|
};
|
|
|
|
// SIMD byte mask type (returned by things like eq and gt)
|
|
template <> struct simd8<bool> : base8<bool> {
|
|
static simdjson_inline simd8<bool> splat(bool _value) {
|
|
return (__m128i)vec_splats((unsigned char)(-(!!_value)));
|
|
}
|
|
|
|
simdjson_inline simd8() : base8<bool>() {}
|
|
simdjson_inline simd8(const __m128i _value)
|
|
: base8<bool>(_value) {}
|
|
// Splat constructor
|
|
simdjson_inline simd8(bool _value)
|
|
: base8<bool>(splat(_value)) {}
|
|
|
|
simdjson_inline int to_bitmask() const {
|
|
__vector unsigned long long result;
|
|
const __m128i perm_mask = {0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40,
|
|
0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00};
|
|
|
|
result = ((__vector unsigned long long)vec_vbpermq((__m128i)this->value,
|
|
(__m128i)perm_mask));
|
|
#ifdef __LITTLE_ENDIAN__
|
|
return static_cast<int>(result[1]);
|
|
#else
|
|
return static_cast<int>(result[0]);
|
|
#endif
|
|
}
|
|
simdjson_inline bool any() const {
|
|
return !vec_all_eq(this->value, (__m128i)vec_splats(0));
|
|
}
|
|
simdjson_inline simd8<bool> operator~() const {
|
|
return this->value ^ (__m128i)splat(true);
|
|
}
|
|
};
|
|
|
|
// Operations shared by the signed and unsigned 8-bit-lane vectors.
template <typename T> struct base8_numeric : base8<T> {
  // Broadcast one value to every lane.
  static simdjson_inline simd8<T> splat(T value) {
    return (__m128i)vec_splats(value);
  }
  static simdjson_inline simd8<T> zero() { return splat(0); }
  // Load 16 lanes from memory with vec_vsx_ld.
  static simdjson_inline simd8<T> load(const T values[16]) {
    const uint8_t *const bytes = reinterpret_cast<const uint8_t *>(values);
    return (__m128i)(vec_vsx_ld(0, bytes));
  }
  // Repeat 16 values as many times as necessary (usually for lookup tables)
  static simdjson_inline simd8<T> repeat_16(T v0, T v1, T v2, T v3, T v4,
                                            T v5, T v6, T v7, T v8, T v9,
                                            T v10, T v11, T v12, T v13,
                                            T v14, T v15) {
    return simd8<T>(v0, v1, v2, v3, v4, v5, v6, v7,
                    v8, v9, v10, v11, v12, v13, v14, v15);
  }

  simdjson_inline base8_numeric() : base8<T>() {}
  simdjson_inline base8_numeric(const __m128i _value)
      : base8<T>(_value) {}

  // Store to array
  simdjson_inline void store(T dst[16]) const {
    __m128i *const target = reinterpret_cast<__m128i *>(dst);
    vec_vsx_st(this->value, 0, target);
  }

  // Override to distinguish from bool version
  simdjson_inline simd8<T> operator~() const { return *this ^ 0xFFu; }

  // Addition/subtraction are the same for signed and unsigned
  simdjson_inline simd8<T> operator+(const simd8<T> other) const {
    const __m128i rhs = (__m128i)other;
    return (__m128i)((__m128i)this->value + rhs);
  }
  simdjson_inline simd8<T> operator-(const simd8<T> other) const {
    const __m128i rhs = (__m128i)other;
    return (__m128i)((__m128i)this->value - rhs);
  }
  simdjson_inline simd8<T> &operator+=(const simd8<T> other) {
    *this = *this + other;
    return *static_cast<simd8<T> *>(this);
  }
  simdjson_inline simd8<T> &operator-=(const simd8<T> other) {
    *this = *this - other;
    return *static_cast<simd8<T> *>(this);
  }

  // Perform a lookup assuming the value is between 0 and 16 (undefined behavior
  // for out of range values). The lanes of this vector are used as vec_perm
  // indices into the table.
  template <typename L>
  simdjson_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
    const __m128i table = (__m128i)lookup_table;
    return (__m128i)vec_perm(table, table, this->value);
  }

  // Copies to 'output" all bytes corresponding to a 0 in the mask (interpreted
  // as a bitset). Passing a 0 value for mask would be equivalent to writing out
  // every byte to output. Only the first 16 - count_ones(mask) bytes of the
  // result are significant but 16 bytes get written. Design consideration: it
  // seems like a function with the signature simd8<L> compress(uint32_t mask)
  // would be sensible, but the AVX ISA makes this kind of approach difficult.
  template <typename L>
  simdjson_inline void compress(uint16_t mask, L *output) const {
    using internal::BitsSetTable256mul2;
    using internal::pshufb_combine_table;
    using internal::thintable_epi8;
    // this particular implementation was inspired by work done by @animetosho
    // we do it in two steps, first 8 bytes and then second 8 bytes
    const uint8_t low_mask = uint8_t(mask);       // least significant 8 bits
    const uint8_t high_mask = uint8_t(mask >> 8); // most significant 8 bits
    // Load the two 64-bit table entries thintable_epi8[low_mask] and
    // thintable_epi8[high_mask] into one 128-bit register.
#ifdef __LITTLE_ENDIAN__
    __m128i shuffle = (__m128i)(__vector unsigned long long){
        thintable_epi8[low_mask], thintable_epi8[high_mask]};
#else
    __m128i shuffle = (__m128i)(__vector unsigned long long){
        thintable_epi8[high_mask], thintable_epi8[low_mask]};
    shuffle = (__m128i)vec_reve((__m128i)shuffle);
#endif
    // we increment by 0x08 the second half of the mask
    shuffle = ((__m128i)shuffle) +
              ((__m128i)(__vector int){0, 0, 0x08080808, 0x08080808});

    // this is the version "nearly pruned"
    __m128i nearly_pruned = vec_perm(this->value, this->value, shuffle);
    // The two halves still have to be joined. The table entry for the first
    // half (presumably twice its popcount, going by the table name) selects
    // the combining mask: it keeps the leading bytes of the first 8 bytes,
    // then appends the bytes from the second 8 bytes plus some filling.
    int first_half_offset = BitsSetTable256mul2[low_mask];
    __m128i combine_mask = vec_vsx_ld(
        0, reinterpret_cast<const uint8_t *>(pshufb_combine_table + first_half_offset * 8));
    __m128i packed = vec_perm(nearly_pruned, (__m128i)vec_splats(0), combine_mask);
    vec_vsx_st(packed, 0, reinterpret_cast<__m128i *>(output));
  }

  // Convenience overload: builds the table from 16 scalars via repeat_16 and
  // forwards to the table-based lookup_16.
  template <typename L>
  simdjson_inline simd8<L>
  lookup_16(L replace0, L replace1, L replace2, L replace3, L replace4,
            L replace5, L replace6, L replace7, L replace8, L replace9,
            L replace10, L replace11, L replace12, L replace13, L replace14,
            L replace15) const {
    return lookup_16(simd8<L>::repeat_16(
        replace0, replace1, replace2, replace3,
        replace4, replace5, replace6, replace7,
        replace8, replace9, replace10, replace11,
        replace12, replace13, replace14, replace15));
  }
};
|
|
|
|
// Signed bytes
|
|
template <> struct simd8<int8_t> : base8_numeric<int8_t> {
|
|
simdjson_inline simd8() : base8_numeric<int8_t>() {}
|
|
simdjson_inline simd8(const __m128i _value)
|
|
: base8_numeric<int8_t>(_value) {}
|
|
// Splat constructor
|
|
simdjson_inline simd8(int8_t _value) : simd8(splat(_value)) {}
|
|
// Array constructor
|
|
simdjson_inline simd8(const int8_t *values) : simd8(load(values)) {}
|
|
// Member-by-member initialization
|
|
simdjson_inline simd8(int8_t v0, int8_t v1, int8_t v2, int8_t v3,
|
|
int8_t v4, int8_t v5, int8_t v6, int8_t v7,
|
|
int8_t v8, int8_t v9, int8_t v10, int8_t v11,
|
|
int8_t v12, int8_t v13, int8_t v14, int8_t v15)
|
|
: simd8((__m128i)(__vector signed char){v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10, v11, v12, v13, v14,
|
|
v15}) {}
|
|
// Repeat 16 values as many times as necessary (usually for lookup tables)
|
|
simdjson_inline static simd8<int8_t>
|
|
repeat_16(int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5,
|
|
int8_t v6, int8_t v7, int8_t v8, int8_t v9, int8_t v10, int8_t v11,
|
|
int8_t v12, int8_t v13, int8_t v14, int8_t v15) {
|
|
return simd8<int8_t>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
|
|
v13, v14, v15);
|
|
}
|
|
|
|
// Order-sensitive comparisons
|
|
simdjson_inline simd8<int8_t>
|
|
max_val(const simd8<int8_t> other) const {
|
|
return (__m128i)vec_max((__vector signed char)this->value,
|
|
(__vector signed char)(__m128i)other);
|
|
}
|
|
simdjson_inline simd8<int8_t>
|
|
min_val(const simd8<int8_t> other) const {
|
|
return (__m128i)vec_min((__vector signed char)this->value,
|
|
(__vector signed char)(__m128i)other);
|
|
}
|
|
simdjson_inline simd8<bool>
|
|
operator>(const simd8<int8_t> other) const {
|
|
return (__m128i)vec_cmpgt((__vector signed char)this->value,
|
|
(__vector signed char)(__m128i)other);
|
|
}
|
|
simdjson_inline simd8<bool>
|
|
operator<(const simd8<int8_t> other) const {
|
|
return (__m128i)vec_cmplt((__vector signed char)this->value,
|
|
(__vector signed char)(__m128i)other);
|
|
}
|
|
};
|
|
|
|
// Unsigned bytes
|
|
template <> struct simd8<uint8_t> : base8_numeric<uint8_t> {
  // Default construction defers entirely to the numeric base.
  simdjson_inline simd8() : base8_numeric<uint8_t>() {}
  // Wrap an existing raw vector register.
  simdjson_inline simd8(const __m128i _value)
      : base8_numeric<uint8_t>(_value) {}
  // Splat constructor
  simdjson_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
  // Array constructor
  simdjson_inline simd8(const uint8_t *values) : simd8(load(values)) {}
  // Member-by-member initialization
  simdjson_inline
  simd8(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5,
        uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9, uint8_t v10,
        uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15)
      : simd8((__m128i){v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
                        v13, v14, v15}) {}
  // Repeat 16 values as many times as necessary (usually for lookup tables)
  simdjson_inline static simd8<uint8_t>
  repeat_16(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4,
            uint8_t v5, uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9,
            uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14,
            uint8_t v15) {
    return simd8<uint8_t>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
                          v13, v14, v15);
  }

  // Saturated math (vec_adds / vec_subs on the raw registers)
  simdjson_inline simd8<uint8_t>
  saturating_add(const simd8<uint8_t> other) const {
    return (__m128i)vec_adds(this->value, (__m128i)other);
  }
  simdjson_inline simd8<uint8_t>
  saturating_sub(const simd8<uint8_t> other) const {
    return (__m128i)vec_subs(this->value, (__m128i)other);
  }

  // Order-specific operations
  simdjson_inline simd8<uint8_t>
  max_val(const simd8<uint8_t> other) const {
    return (__m128i)vec_max(this->value, (__m128i)other);
  }
  simdjson_inline simd8<uint8_t>
  min_val(const simd8<uint8_t> other) const {
    return (__m128i)vec_min(this->value, (__m128i)other);
  }
  // Same as >, but only guarantees true is nonzero (> guarantees true = -1).
  // Computed as this - other with saturation.
  simdjson_inline simd8<uint8_t>
  gt_bits(const simd8<uint8_t> other) const {
    return this->saturating_sub(other);
  }
  // Same as <, but only guarantees true is nonzero (< guarantees true = -1).
  // Computed as other - this with saturation.
  simdjson_inline simd8<uint8_t>
  lt_bits(const simd8<uint8_t> other) const {
    return other.saturating_sub(*this);
  }
  // this <= other  <=>  max(other, this) == other
  simdjson_inline simd8<bool>
  operator<=(const simd8<uint8_t> other) const {
    return other.max_val(*this) == other;
  }
  // this >= other  <=>  min(other, this) == other
  simdjson_inline simd8<bool>
  operator>=(const simd8<uint8_t> other) const {
    return other.min_val(*this) == other;
  }
  simdjson_inline simd8<bool>
  operator>(const simd8<uint8_t> other) const {
    return this->gt_bits(other).any_bits_set();
  }
  // Must be built on lt_bits: using gt_bits here (as the code previously did)
  // makes operator< return exactly the same mask as operator>.
  simdjson_inline simd8<bool>
  operator<(const simd8<uint8_t> other) const {
    return this->lt_bits(other).any_bits_set();
  }

  // Bit-specific operations

  // Per-lane mask: true where the lane equals zero.
  simdjson_inline simd8<bool> bits_not_set() const {
    return (__m128i)vec_cmpeq(this->value, (__m128i)vec_splats(uint8_t(0)));
  }
  // Per-lane mask: true where none of the given bits are set in the lane.
  simdjson_inline simd8<bool> bits_not_set(simd8<uint8_t> bits) const {
    return (*this & bits).bits_not_set();
  }
  // Per-lane mask: complement of bits_not_set().
  simdjson_inline simd8<bool> any_bits_set() const {
    return ~this->bits_not_set();
  }
  // Per-lane mask: complement of bits_not_set(bits).
  simdjson_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const {
    return ~this->bits_not_set(bits);
  }
  // Whole-register test: true when every lane compares equal to zero.
  simdjson_inline bool bits_not_set_anywhere() const {
    return vec_all_eq(this->value, (__m128i)vec_splats(0));
  }
  simdjson_inline bool any_bits_set_anywhere() const {
    return !bits_not_set_anywhere();
  }
  // Whole-register test: true when (this & bits) is zero in every lane.
  simdjson_inline bool bits_not_set_anywhere(simd8<uint8_t> bits) const {
    return vec_all_eq(vec_and(this->value, (__m128i)bits),
                      (__m128i)vec_splats(0));
  }
  simdjson_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const {
    return !bits_not_set_anywhere(bits);
  }
  // Shift every lane right by N bits (vec_sr with a splatted count).
  template <int N> simdjson_inline simd8<uint8_t> shr() const {
    return simd8<uint8_t>(
        (__m128i)vec_sr(this->value, (__m128i)vec_splat_u8(N)));
  }
  // Shift every lane left by N bits (vec_sl with a splatted count).
  template <int N> simdjson_inline simd8<uint8_t> shl() const {
    return simd8<uint8_t>(
        (__m128i)vec_sl(this->value, (__m128i)vec_splat_u8(N)));
  }
};
|
|
|
|
template <typename T> struct simd8x64 {
|
|
static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
|
|
static_assert(NUM_CHUNKS == 4,
|
|
"PPC64 kernel should use four registers per 64-byte block.");
|
|
const simd8<T> chunks[NUM_CHUNKS];
|
|
|
|
simd8x64(const simd8x64<T> &o) = delete; // no copy allowed
|
|
simd8x64<T> &
|
|
operator=(const simd8<T>& other) = delete; // no assignment allowed
|
|
simd8x64() = delete; // no default constructor allowed
|
|
|
|
simdjson_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1,
|
|
const simd8<T> chunk2, const simd8<T> chunk3)
|
|
: chunks{chunk0, chunk1, chunk2, chunk3} {}
|
|
simdjson_inline simd8x64(const T ptr[64])
|
|
: chunks{simd8<T>::load(ptr), simd8<T>::load(ptr + 16),
|
|
simd8<T>::load(ptr + 32), simd8<T>::load(ptr + 48)} {}
|
|
|
|
simdjson_inline void store(T ptr[64]) const {
|
|
this->chunks[0].store(ptr + sizeof(simd8<T>) * 0);
|
|
this->chunks[1].store(ptr + sizeof(simd8<T>) * 1);
|
|
this->chunks[2].store(ptr + sizeof(simd8<T>) * 2);
|
|
this->chunks[3].store(ptr + sizeof(simd8<T>) * 3);
|
|
}
|
|
|
|
simdjson_inline simd8<T> reduce_or() const {
|
|
return (this->chunks[0] | this->chunks[1]) |
|
|
(this->chunks[2] | this->chunks[3]);
|
|
}
|
|
|
|
simdjson_inline uint64_t compress(uint64_t mask, T *output) const {
|
|
this->chunks[0].compress(uint16_t(mask), output);
|
|
this->chunks[1].compress(uint16_t(mask >> 16),
|
|
output + 16 - count_ones(mask & 0xFFFF));
|
|
this->chunks[2].compress(uint16_t(mask >> 32),
|
|
output + 32 - count_ones(mask & 0xFFFFFFFF));
|
|
this->chunks[3].compress(uint16_t(mask >> 48),
|
|
output + 48 - count_ones(mask & 0xFFFFFFFFFFFF));
|
|
return 64 - count_ones(mask);
|
|
}
|
|
|
|
simdjson_inline uint64_t to_bitmask() const {
|
|
uint64_t r0 = uint32_t(this->chunks[0].to_bitmask());
|
|
uint64_t r1 = this->chunks[1].to_bitmask();
|
|
uint64_t r2 = this->chunks[2].to_bitmask();
|
|
uint64_t r3 = this->chunks[3].to_bitmask();
|
|
return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48);
|
|
}
|
|
|
|
simdjson_inline uint64_t eq(const T m) const {
|
|
const simd8<T> mask = simd8<T>::splat(m);
|
|
return simd8x64<bool>(this->chunks[0] == mask, this->chunks[1] == mask,
|
|
this->chunks[2] == mask, this->chunks[3] == mask)
|
|
.to_bitmask();
|
|
}
|
|
|
|
simdjson_inline uint64_t eq(const simd8x64<uint8_t> &other) const {
|
|
return simd8x64<bool>(this->chunks[0] == other.chunks[0],
|
|
this->chunks[1] == other.chunks[1],
|
|
this->chunks[2] == other.chunks[2],
|
|
this->chunks[3] == other.chunks[3])
|
|
.to_bitmask();
|
|
}
|
|
|
|
simdjson_inline uint64_t lteq(const T m) const {
|
|
const simd8<T> mask = simd8<T>::splat(m);
|
|
return simd8x64<bool>(this->chunks[0] <= mask, this->chunks[1] <= mask,
|
|
this->chunks[2] <= mask, this->chunks[3] <= mask)
|
|
.to_bitmask();
|
|
}
|
|
}; // struct simd8x64<T>
|
|
|
|
} // namespace simd
|
|
} // unnamed namespace
|
|
} // namespace ppc64
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_PPC64_SIMD_INPUT_H
|
|
/* end file simdjson/ppc64/simd.h */
|
|
/* including simdjson/ppc64/stringparsing_defs.h: #include "simdjson/ppc64/stringparsing_defs.h" */
|
|
/* begin file simdjson/ppc64/stringparsing_defs.h */
|
|
#ifndef SIMDJSON_PPC64_STRINGPARSING_DEFS_H
|
|
#define SIMDJSON_PPC64_STRINGPARSING_DEFS_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/ppc64/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/ppc64/bitmanipulation.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/ppc64/simd.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace ppc64 {
|
|
namespace {
|
|
|
|
using namespace simd;
|
|
|
|
// Holds backslashes and quotes locations.
|
|
struct backslash_and_quote {
|
|
public:
|
|
static constexpr uint32_t BYTES_PROCESSED = 32;
|
|
simdjson_inline static backslash_and_quote
|
|
copy_and_find(const uint8_t *src, uint8_t *dst);
|
|
|
|
simdjson_inline bool has_quote_first() {
|
|
return ((bs_bits - 1) & quote_bits) != 0;
|
|
}
|
|
simdjson_inline bool has_backslash() { return bs_bits != 0; }
|
|
simdjson_inline int quote_index() {
|
|
return trailing_zeroes(quote_bits);
|
|
}
|
|
simdjson_inline int backslash_index() {
|
|
return trailing_zeroes(bs_bits);
|
|
}
|
|
|
|
uint32_t bs_bits;
|
|
uint32_t quote_bits;
|
|
}; // struct backslash_and_quote
|
|
|
|
// Copies BYTES_PROCESSED bytes from src to dst and reports where the
// backslashes and double quotes are among them.
simdjson_inline backslash_and_quote
backslash_and_quote::copy_and_find(const uint8_t *src, uint8_t *dst) {
  // this can read up to 31 bytes beyond the buffer size, but we require
  // SIMDJSON_PADDING of padding
  static_assert(SIMDJSON_PADDING >= (BYTES_PROCESSED - 1),
                "backslash and quote finder must process fewer than "
                "SIMDJSON_PADDING bytes");
  constexpr auto register_bytes = sizeof(simd8<uint8_t>);
  simd8<uint8_t> first_half(src);
  simd8<uint8_t> second_half(src + register_bytes);
  first_half.store(dst);
  second_half.store(dst + register_bytes);

  // Getting a 64-bit bitmask is much cheaper than multiple 16-bit bitmasks on
  // PPC, so the four comparison results are packed into one simd8x64<bool>:
  // backslash matches occupy the low 32 bits, quote matches the high 32 bits.
  const simd8x64<bool> packed(first_half == '\\', second_half == '\\',
                              first_half == '"', second_half == '"');
  const uint64_t combined = packed.to_bitmask();
  const uint32_t backslash_mask = uint32_t(combined);
  const uint32_t quote_mask = uint32_t(combined >> 32);
  return {backslash_mask, quote_mask};
}
|
|
|
|
} // unnamed namespace
|
|
} // namespace ppc64
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_PPC64_STRINGPARSING_DEFS_H
|
|
/* end file simdjson/ppc64/stringparsing_defs.h */
|
|
|
|
#define SIMDJSON_SKIP_BACKSLASH_SHORT_CIRCUIT 1
|
|
/* end file simdjson/ppc64/begin.h */
|
|
/* including generic/amalgamated.h for ppc64: #include <generic/amalgamated.h> */
|
|
/* begin file generic/amalgamated.h for ppc64 */
|
|
#if defined(SIMDJSON_CONDITIONAL_INCLUDE) && !defined(SIMDJSON_SRC_GENERIC_DEPENDENCIES_H)
|
|
#error generic/dependencies.h must be included before generic/amalgamated.h!
|
|
#endif
|
|
|
|
/* including generic/base.h for ppc64: #include <generic/base.h> */
|
|
/* begin file generic/base.h for ppc64 */
|
|
#ifndef SIMDJSON_SRC_GENERIC_BASE_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_BASE_H */
|
|
/* amalgamation skipped (editor-only): #include <base.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/generic/base.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace ppc64 {
|
|
namespace {
|
|
|
|
struct json_character_block;
|
|
|
|
} // unnamed namespace
|
|
} // namespace ppc64
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_BASE_H
|
|
/* end file generic/base.h for ppc64 */
|
|
/* including generic/dom_parser_implementation.h for ppc64: #include <generic/dom_parser_implementation.h> */
|
|
/* begin file generic/dom_parser_implementation.h for ppc64 */
|
|
#ifndef SIMDJSON_SRC_GENERIC_DOM_PARSER_IMPLEMENTATION_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_DOM_PARSER_IMPLEMENTATION_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/generic/dom_parser_implementation.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
// Interface a dom parser implementation must fulfill
|
|
namespace simdjson {
|
|
namespace ppc64 {
|
|
namespace {
|
|
|
|
simdjson_inline simd8<uint8_t> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3);
|
|
simdjson_inline bool is_ascii(const simd8x64<uint8_t>& input);
|
|
|
|
} // unnamed namespace
|
|
} // namespace ppc64
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_DOM_PARSER_IMPLEMENTATION_H
|
|
/* end file generic/dom_parser_implementation.h for ppc64 */
|
|
/* including generic/json_character_block.h for ppc64: #include <generic/json_character_block.h> */
|
|
/* begin file generic/json_character_block.h for ppc64 */
|
|
#ifndef SIMDJSON_SRC_GENERIC_JSON_CHARACTER_BLOCK_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_JSON_CHARACTER_BLOCK_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/base.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace ppc64 {
|
|
namespace {
|
|
|
|
struct json_character_block {
|
|
static simdjson_inline json_character_block classify(const simd::simd8x64<uint8_t>& in);
|
|
|
|
simdjson_inline uint64_t whitespace() const noexcept { return _whitespace; }
|
|
simdjson_inline uint64_t op() const noexcept { return _op; }
|
|
simdjson_inline uint64_t scalar() const noexcept { return ~(op() | whitespace()); }
|
|
|
|
uint64_t _whitespace;
|
|
uint64_t _op;
|
|
};
|
|
|
|
} // unnamed namespace
|
|
} // namespace ppc64
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_JSON_CHARACTER_BLOCK_H
|
|
/* end file generic/json_character_block.h for ppc64 */
|
|
/* end file generic/amalgamated.h for ppc64 */
|
|
/* including generic/stage1/amalgamated.h for ppc64: #include <generic/stage1/amalgamated.h> */
|
|
/* begin file generic/stage1/amalgamated.h for ppc64 */
|
|
// Stuff other things depend on
|
|
/* including generic/stage1/base.h for ppc64: #include <generic/stage1/base.h> */
|
|
/* begin file generic/stage1/base.h for ppc64 */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_BASE_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_BASE_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/base.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace ppc64 {
|
|
namespace {
|
|
namespace stage1 {
|
|
|
|
class bit_indexer;
|
|
template<size_t STEP_SIZE>
|
|
struct buf_block_reader;
|
|
struct json_block;
|
|
class json_minifier;
|
|
class json_scanner;
|
|
struct json_string_block;
|
|
class json_string_scanner;
|
|
class json_structural_indexer;
|
|
|
|
} // namespace stage1
|
|
|
|
namespace utf8_validation {
|
|
struct utf8_checker;
|
|
} // namespace utf8_validation
|
|
|
|
using utf8_validation::utf8_checker;
|
|
|
|
} // unnamed namespace
|
|
} // namespace ppc64
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE1_BASE_H
|
|
/* end file generic/stage1/base.h for ppc64 */
|
|
/* including generic/stage1/buf_block_reader.h for ppc64: #include <generic/stage1/buf_block_reader.h> */
|
|
/* begin file generic/stage1/buf_block_reader.h for ppc64 */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_BUF_BLOCK_READER_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_BUF_BLOCK_READER_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/base.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
#include <cstring>
|
|
|
|
namespace simdjson {
|
|
namespace ppc64 {
|
|
namespace {
|
|
namespace stage1 {
|
|
|
|
// Walks through a buffer in block-sized increments, loading the last part with spaces
|
|
template<size_t STEP_SIZE>
|
|
struct buf_block_reader {
|
|
public:
|
|
// Starts at index 0 of the len-byte buffer _buf; the buffer is not copied.
simdjson_inline buf_block_reader(const uint8_t *_buf, size_t _len);
|
|
// Current byte offset into the buffer.
simdjson_inline size_t block_index();
|
|
// True while the current offset is strictly below len - STEP_SIZE (0 if len < STEP_SIZE).
simdjson_inline bool has_full_block() const;
|
|
// Pointer to the block at the current offset.
simdjson_inline const uint8_t *full_block() const;
|
|
/**
|
|
* Get the last block, padded with spaces.
|
|
*
|
|
* If bytes remain (len != idx), dst is filled with STEP_SIZE spaces and the remaining
|
|
* len - idx bytes are copied to its front. If none remain (len == idx), 0 is returned and dst
|
|
* is left untouched. If len == STEP_SIZE there are 0 full blocks and 1 remainder block of STEP_SIZE bytes.
|
|
*
|
|
* @return the number of effective characters in the last block.
|
|
*/
|
|
simdjson_inline size_t get_remainder(uint8_t *dst) const;
|
|
// Moves the current offset forward by STEP_SIZE bytes.
simdjson_inline void advance();
|
|
private:
|
|
const uint8_t *buf;
|
|
const size_t len;
|
|
// len - STEP_SIZE, clamped to 0 when len < STEP_SIZE.
const size_t lenminusstep;
|
|
size_t idx;
|
|
};
|
|
|
|
// Routines to print masks and text for debugging bitmask operations
|
|
// Debug helper: renders 64 bytes of text into a static, NUL-terminated buffer,
// replacing every byte that compares below ' ' as a signed 8-bit value with '_'.
// The returned pointer refers to storage that the next call overwrites.
simdjson_unused static char * format_input_text_64(const uint8_t *text) {
  constexpr size_t width = sizeof(simd8x64<uint8_t>);
  static char buf[width + 1];
  for (size_t pos = 0; pos != width; ++pos) {
    const int8_t ch = int8_t(text[pos]);
    if (ch < ' ') {
      buf[pos] = '_';
    } else {
      buf[pos] = ch;
    }
  }
  buf[width] = '\0';
  return buf;
}
|
|
|
|
// Routines to print masks and text for debugging bitmask operations
|
|
// Debug helper: stores the 64-byte block into a static, NUL-terminated buffer
// and replaces every char that compares below ' ' with '_'.
// The returned pointer refers to storage that the next call overwrites.
simdjson_unused static char * format_input_text(const simd8x64<uint8_t>& in) {
  constexpr size_t width = sizeof(simd8x64<uint8_t>);
  static char buf[width + 1];
  in.store(reinterpret_cast<uint8_t*>(buf));
  for (char *cursor = buf; cursor != buf + width; ++cursor) {
    if (*cursor < ' ') { *cursor = '_'; }
  }
  buf[width] = '\0';
  return buf;
}
|
|
|
|
// Debug helper: like format_input_text(in), but only shows the bytes selected
// by mask. Chars that compare <= ' ' become '_'; positions whose mask bit is
// clear are then blanked to ' '.
// The returned pointer refers to a static buffer that the next call overwrites.
simdjson_unused static char * format_input_text(const simd8x64<uint8_t>& in, uint64_t mask) {
  static char buf[sizeof(simd8x64<uint8_t>) + 1];
  in.store(reinterpret_cast<uint8_t*>(buf));
  for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
    if (buf[i] <= ' ') { buf[i] = '_'; }
    // Shift a 64-bit one: i reaches 63, which would be undefined for a
    // 32-bit size_t.
    if (!(mask & (uint64_t(1) << i))) { buf[i] = ' '; }
  }
  buf[sizeof(simd8x64<uint8_t>)] = '\0';
  return buf;
}
|
|
|
|
// Debug helper: renders a 64-bit mask as 64 characters, 'X' for a set bit and
// ' ' for a clear bit, with bit 0 first.
// The returned pointer refers to a static buffer that the next call overwrites.
simdjson_unused static char * format_mask(uint64_t mask) {
  constexpr size_t width = sizeof(simd8x64<uint8_t>);
  static char buf[width + 1];
  for (size_t i=0; i<width; i++) {
    // Shift a 64-bit one: i reaches 63, which would be undefined for a
    // 32-bit size_t.
    buf[i] = (mask & (uint64_t(1) << i)) ? 'X' : ' ';
  }
  buf[width] = '\0';
  return buf;
}
|
|
|
|
template<size_t STEP_SIZE>
|
|
simdjson_inline buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {}
|
|
|
|
template<size_t STEP_SIZE>
|
|
simdjson_inline size_t buf_block_reader<STEP_SIZE>::block_index() { return idx; }
|
|
|
|
template<size_t STEP_SIZE>
|
|
simdjson_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const {
|
|
return idx < lenminusstep;
|
|
}
|
|
|
|
template<size_t STEP_SIZE>
|
|
simdjson_inline const uint8_t *buf_block_reader<STEP_SIZE>::full_block() const {
|
|
return &buf[idx];
|
|
}
|
|
|
|
template<size_t STEP_SIZE>
|
|
simdjson_inline size_t buf_block_reader<STEP_SIZE>::get_remainder(uint8_t *dst) const {
|
|
if(len == idx) { return 0; } // memcpy(dst, null, 0) will trigger an error with some sanitizers
|
|
std::memset(dst, 0x20, STEP_SIZE); // std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once.
|
|
std::memcpy(dst, buf + idx, len - idx);
|
|
return len - idx;
|
|
}
|
|
|
|
template<size_t STEP_SIZE>
|
|
simdjson_inline void buf_block_reader<STEP_SIZE>::advance() {
|
|
idx += STEP_SIZE;
|
|
}
|
|
|
|
} // namespace stage1
|
|
} // unnamed namespace
|
|
} // namespace ppc64
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE1_BUF_BLOCK_READER_H
|
|
/* end file generic/stage1/buf_block_reader.h for ppc64 */
|
|
/* including generic/stage1/json_escape_scanner.h for ppc64: #include <generic/stage1/json_escape_scanner.h> */
|
|
/* begin file generic/stage1/json_escape_scanner.h for ppc64 */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_JSON_ESCAPE_SCANNER_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_JSON_ESCAPE_SCANNER_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/buf_block_reader.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace ppc64 {
|
|
namespace {
|
|
namespace stage1 {
|
|
|
|
/**
|
|
* Scans for escape characters in JSON, taking care with multiple backslashes (\\n vs. \n).
|
|
*/
|
|
struct json_escape_scanner {
|
|
/** The actual escape characters (the backslashes themselves). */
|
|
uint64_t next_is_escaped = 0ULL;
|
|
|
|
struct escaped_and_escape {
|
|
/**
|
|
* Mask of escaped characters.
|
|
*
|
|
* ```
|
|
* \n \\n \\\n \\\\n \
|
|
* 0100100010100101000
|
|
* n \ \ n \ \
|
|
* ```
|
|
*/
|
|
uint64_t escaped;
|
|
/**
|
|
* Mask of escape characters.
|
|
*
|
|
* ```
|
|
* \n \\n \\\n \\\\n \
|
|
* 1001000101001010001
|
|
* \ \ \ \ \ \ \
|
|
* ```
|
|
*/
|
|
uint64_t escape;
|
|
};
|
|
|
|
/**
|
|
* Get a mask of both escape and escaped characters (the characters following a backslash).
|
|
*
|
|
* @param potential_escape A mask of the character that can escape others (but could be
|
|
* escaped itself). e.g. block.eq('\\')
|
|
*/
|
|
simdjson_really_inline escaped_and_escape next(uint64_t backslash) noexcept {
|
|
|
|
#if !SIMDJSON_SKIP_BACKSLASH_SHORT_CIRCUIT
|
|
if (!backslash) { return {next_escaped_without_backslashes(), 0}; }
|
|
#endif
|
|
|
|
// | | Mask (shows characters instead of 1's) | Depth | Instructions |
|
|
// |--------------------------------|----------------------------------------|-------|---------------------|
|
|
// | string | `\\n_\\\n___\\\n___\\\\___\\\\__\\\` | | |
|
|
// | | ` even odd even odd odd` | | |
|
|
// | potential_escape | ` \ \\\ \\\ \\\\ \\\\ \\\` | 1 | 1 (backslash & ~first_is_escaped)
|
|
// | escape_and_terminal_code | ` \n \ \n \ \n \ \ \ \ \ \` | 5 | 5 (next_escape_and_terminal_code())
|
|
// | escaped | `\ \ n \ n \ \ \ \ \ ` X | 6 | 7 (escape_and_terminal_code ^ (potential_escape | first_is_escaped))
|
|
// | escape | ` \ \ \ \ \ \ \ \ \ \` | 6 | 8 (escape_and_terminal_code & backslash)
|
|
// | first_is_escaped | `\ ` | 7 (*) | 9 (escape >> 63) ()
|
|
// (*) this is not needed until the next iteration
|
|
uint64_t escape_and_terminal_code = next_escape_and_terminal_code(backslash & ~this->next_is_escaped);
|
|
uint64_t escaped = escape_and_terminal_code ^ (backslash | this->next_is_escaped);
|
|
uint64_t escape = escape_and_terminal_code & backslash;
|
|
this->next_is_escaped = escape >> 63;
|
|
return {escaped, escape};
|
|
}
|
|
|
|
private:
|
|
static constexpr const uint64_t ODD_BITS = 0xAAAAAAAAAAAAAAAAULL;
|
|
|
|
simdjson_really_inline uint64_t next_escaped_without_backslashes() noexcept {
|
|
uint64_t escaped = this->next_is_escaped;
|
|
this->next_is_escaped = 0;
|
|
return escaped;
|
|
}
|
|
|
|
/**
|
|
* Returns a mask of the next escape characters (masking out escaped backslashes), along with
|
|
* any non-backslash escape codes.
|
|
*
|
|
* \n \\n \\\n \\\\n returns:
|
|
* \n \ \ \n \ \
|
|
* 11 100 1011 10100
|
|
*
|
|
* You are expected to mask out the first bit yourself if the previous block had a trailing
|
|
* escape.
|
|
*
|
|
* & the result with potential_escape to get just the escape characters.
|
|
* ^ the result with (potential_escape | first_is_escaped) to get escaped characters.
|
|
*/
|
|
static simdjson_really_inline uint64_t next_escape_and_terminal_code(uint64_t potential_escape) noexcept {
|
|
// If we were to just shift and mask out any odd bits, we'd actually get a *half* right answer:
|
|
// any even-aligned backslash runs would be correct! Odd-aligned backslash runs would be
|
|
// inverted (\\\ would be 010 instead of 101).
|
|
//
|
|
// ```
|
|
// string: | ____\\\\_\\\\_____ |
|
|
// maybe_escaped | ODD | \ \ \ \ |
|
|
// even-aligned ^^^ ^^^^ odd-aligned
|
|
// ```
|
|
//
|
|
// Taking that into account, our basic strategy is:
|
|
//
|
|
// 1. Use subtraction to produce a mask with 1's for even-aligned runs and 0's for
|
|
// odd-aligned runs.
|
|
// 2. XOR all odd bits, which masks out the odd bits in even-aligned runs, and brings IN the
|
|
// odd bits in odd-aligned runs.
|
|
// 3. & with backslash to clean up any stray bits.
|
|
// runs are set to 0, and then XORing with "odd":
|
|
//
|
|
// | | Mask (shows characters instead of 1's) | Instructions |
|
|
// |--------------------------------|----------------------------------------|---------------------|
|
|
// | string | `\\n_\\\n___\\\n___\\\\___\\\\__\\\` |
|
|
// | | ` even odd even odd odd` |
|
|
// | maybe_escaped | ` n \\n \\n \\\_ \\\_ \\` X | 1 (potential_escape << 1)
|
|
// | maybe_escaped_and_odd | ` \n_ \\n _ \\\n_ _ \\\__ _\\\_ \\\` | 1 (maybe_escaped | odd)
|
|
// | even_series_codes_and_odd | ` n_\\\ _ n_ _\\\\ _ _ ` | 1 (maybe_escaped_and_odd - potential_escape)
|
|
// | escape_and_terminal_code | ` \n \ \n \ \n \ \ \ \ \ \` | 1 (^ odd)
|
|
//
|
|
|
|
// Escaped characters are characters following an escape.
|
|
uint64_t maybe_escaped = potential_escape << 1;
|
|
|
|
// To distinguish odd from even escape sequences, therefore, we turn on any *starting*
|
|
// escapes that are on an odd byte. (We actually bring in all odd bits, for speed.)
|
|
// - Odd runs of backslashes are 0000, and the code at the end ("n" in \n or \\n) is 1.
|
|
// - Odd runs of backslashes are 1111, and the code at the end ("n" in \n or \\n) is 0.
|
|
// - All other odd bytes are 1, and even bytes are 0.
|
|
uint64_t maybe_escaped_and_odd_bits = maybe_escaped | ODD_BITS;
|
|
uint64_t even_series_codes_and_odd_bits = maybe_escaped_and_odd_bits - potential_escape;
|
|
|
|
// Now we flip all odd bytes back with xor. This:
|
|
// - Makes odd runs of backslashes go from 0000 to 1010
|
|
// - Makes even runs of backslashes go from 1111 to 1010
|
|
// - Sets actually-escaped codes to 1 (the n in \n and \\n: \n = 11, \\n = 100)
|
|
// - Resets all other bytes to 0
|
|
return even_series_codes_and_odd_bits ^ ODD_BITS;
|
|
}
|
|
};
|
|
|
|
} // namespace stage1
|
|
} // unnamed namespace
|
|
} // namespace ppc64
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE1_JSON_ESCAPE_SCANNER_H
|
|
/* end file generic/stage1/json_escape_scanner.h for ppc64 */
|
|
/* including generic/stage1/json_string_scanner.h for ppc64: #include <generic/stage1/json_string_scanner.h> */
|
|
/* begin file generic/stage1/json_string_scanner.h for ppc64 */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_JSON_STRING_SCANNER_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_JSON_STRING_SCANNER_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/json_escape_scanner.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace ppc64 {
|
|
namespace {
|
|
namespace stage1 {
|
|
|
|
struct json_string_block {
|
|
// We spell out the constructors in the hope of resolving inlining issues with Visual Studio 2017
|
|
simdjson_really_inline json_string_block(uint64_t escaped, uint64_t quote, uint64_t in_string) :
|
|
_escaped(escaped), _quote(quote), _in_string(in_string) {}
|
|
|
|
// Escaped characters (characters following an escape() character)
|
|
simdjson_really_inline uint64_t escaped() const { return _escaped; }
|
|
// Real (non-backslashed) quotes
|
|
simdjson_really_inline uint64_t quote() const { return _quote; }
|
|
// Only characters inside the string (not including the quotes)
|
|
simdjson_really_inline uint64_t string_content() const { return _in_string & ~_quote; }
|
|
// Return a mask of whether the given characters are inside a string (only works on non-quotes)
|
|
simdjson_really_inline uint64_t non_quote_inside_string(uint64_t mask) const { return mask & _in_string; }
|
|
// Return a mask of whether the given characters are inside a string (only works on non-quotes)
|
|
simdjson_really_inline uint64_t non_quote_outside_string(uint64_t mask) const { return mask & ~_in_string; }
|
|
// Tail of string (everything except the start quote)
|
|
simdjson_really_inline uint64_t string_tail() const { return _in_string ^ _quote; }
|
|
|
|
// escaped characters (backslashed--does not include the hex characters after \u)
|
|
uint64_t _escaped;
|
|
// real quotes (non-escaped ones)
|
|
uint64_t _quote;
|
|
// string characters (includes start quote but not end quote)
|
|
uint64_t _in_string;
|
|
};
|
|
|
|
// Scans blocks for string characters, storing the state necessary to do so
|
|
class json_string_scanner {
|
|
public:
|
|
// Scans one 64-byte block and returns its escaped/quote/in-string masks,
// updating the state carried over to the next block.
simdjson_really_inline json_string_block next(const simd::simd8x64<uint8_t>& in);
|
|
// Returns either UNCLOSED_STRING or SUCCESS
// (UNCLOSED_STRING when the last scanned block ended inside a string).
simdjson_really_inline error_code finish();
|
|
|
|
private:
|
|
// Scans for escape characters
|
|
json_escape_scanner escape_scanner{};
|
|
// Whether the last iteration was still inside a string (all 1's = true, all 0's = false).
|
|
uint64_t prev_in_string = 0ULL;
|
|
};
|
|
|
|
//
|
|
// Return a mask of all string characters plus end quotes.
|
|
//
|
|
// prev_escaped is overflow saying whether the next character is escaped.
|
|
// prev_in_string is overflow saying whether we're still in a string.
|
|
//
|
|
// Backslash sequences outside of quotes will be detected in stage 2.
|
|
//
|
|
simdjson_really_inline json_string_block json_string_scanner::next(const simd::simd8x64<uint8_t>& in) {
|
|
const uint64_t backslash = in.eq('\\');
|
|
const uint64_t escaped = escape_scanner.next(backslash).escaped;
|
|
const uint64_t quote = in.eq('"') & ~escaped;
|
|
|
|
//
|
|
// prefix_xor flips on bits inside the string (and flips off the end quote).
|
|
//
|
|
// Then we xor with prev_in_string: if we were in a string already, its effect is flipped
|
|
// (characters inside strings are outside, and characters outside strings are inside).
|
|
//
|
|
const uint64_t in_string = prefix_xor(quote) ^ prev_in_string;
|
|
|
|
//
|
|
// Check if we're still in a string at the end of the box so the next block will know
|
|
//
|
|
prev_in_string = uint64_t(static_cast<int64_t>(in_string) >> 63);
|
|
|
|
// Use ^ to turn the beginning quote off, and the end quote on.
|
|
|
|
// We are returning a function-local object so either we get a move constructor
|
|
// or we get copy elision.
|
|
return json_string_block(escaped, quote, in_string);
|
|
}
|
|
|
|
// Reports UNCLOSED_STRING when the last scanned block ended inside a string,
// SUCCESS otherwise.
simdjson_really_inline error_code json_string_scanner::finish() {
  return prev_in_string ? UNCLOSED_STRING : SUCCESS;
}
|
|
|
|
} // namespace stage1
|
|
} // unnamed namespace
|
|
} // namespace ppc64
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE1_JSON_STRING_SCANNER_H
|
|
/* end file generic/stage1/json_string_scanner.h for ppc64 */
|
|
/* including generic/stage1/utf8_lookup4_algorithm.h for ppc64: #include <generic/stage1/utf8_lookup4_algorithm.h> */
|
|
/* begin file generic/stage1/utf8_lookup4_algorithm.h for ppc64 */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_UTF8_LOOKUP4_ALGORITHM_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_UTF8_LOOKUP4_ALGORITHM_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/dom_parser_implementation.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace ppc64 {
|
|
namespace {
|
|
namespace utf8_validation {
|
|
|
|
using namespace simd;
|
|
|
|
/**
 * Classifies each adjacent byte pair (prev1 = "byte 1", input = "byte 2")
 * against three 16-entry lookup tables and ANDs the results together.
 *
 * Each table entry is a set of error flags that are still possible given one
 * nibble of the pair; a flag survives the AND only when the high nibble of
 * byte 1, the low nibble of byte 1 and the high nibble of byte 2 all allow it.
 *
 * @param input the current bytes (byte 2 of each pair)
 * @param prev1 the same bytes shifted back by one position (byte 1 of each pair)
 * @return per-byte flag sets; the caller combines them with the continuation check
 */
simdjson_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
  // Error flags, listed by bit position. The bit patterns show <byte 1> <byte 2>.
  constexpr const uint8_t TOO_SHORT      = 1<<0; // 11______ 0_______  (lead byte followed by ASCII)
                                                 // 11______ 11______  (lead byte followed by lead byte)
  constexpr const uint8_t TOO_LONG       = 1<<1; // 0_______ 10______  (ASCII followed by continuation)
  constexpr const uint8_t OVERLONG_3     = 1<<2; // 11100000 100_____
  constexpr const uint8_t TOO_LARGE      = 1<<3; // 11110100 1001____
                                                 // 11110100 101_____
                                                 // 11110101 1001____
                                                 // 11110101 101_____
                                                 // 1111011_ 1001____
                                                 // 1111011_ 101_____
                                                 // 11111___ 1001____
                                                 // 11111___ 101_____
  constexpr const uint8_t SURROGATE      = 1<<4; // 11101101 101_____
  constexpr const uint8_t OVERLONG_2     = 1<<5; // 1100000_ 10______
  constexpr const uint8_t TOO_LARGE_1000 = 1<<6; // 11110101 1000____
                                                 // 1111011_ 1000____
                                                 // 11111___ 1000____
  constexpr const uint8_t OVERLONG_4     = 1<<6; // 11110000 1000____  (shares bit 6 with TOO_LARGE_1000)
  constexpr const uint8_t TWO_CONTS      = 1<<7; // 10______ 10______

  // Flags whose byte-1 pattern does not constrain the low nibble ("____" in byte 1).
  constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS;
  // Shorthand for the low-nibble entries 0101 and above.
  constexpr const uint8_t CARRY_LARGE = CARRY | TOO_LARGE | TOO_LARGE_1000;
  // Flags common to every continuation byte (10______) in byte 2.
  constexpr const uint8_t ANY_CONT = TOO_LONG | OVERLONG_2 | TWO_CONTS;

  // Table 1: indexed by the high nibble of byte 1.
  const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
    // 0_______ ________ <ASCII in byte 1>
    TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
    TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
    // 10______ ________ <continuation in byte 1>
    TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
    // 1100____ ________ <two byte lead in byte 1>
    TOO_SHORT | OVERLONG_2,
    // 1101____ ________ <two byte lead in byte 1>
    TOO_SHORT,
    // 1110____ ________ <three byte lead in byte 1>
    TOO_SHORT | OVERLONG_3 | SURROGATE,
    // 1111____ ________ <four+ byte lead in byte 1>
    TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
  );

  // Table 2: indexed by the low nibble of byte 1.
  const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
    // ____0000 ________
    CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
    // ____0001 ________
    CARRY | OVERLONG_2,
    // ____001_ ________
    CARRY,
    CARRY,
    // ____0100 ________
    CARRY | TOO_LARGE,
    // ____0101 ________
    CARRY_LARGE,
    // ____011_ ________
    CARRY_LARGE,
    CARRY_LARGE,
    // ____1000 through ____1100 ________
    CARRY_LARGE,
    CARRY_LARGE,
    CARRY_LARGE,
    CARRY_LARGE,
    CARRY_LARGE,
    // ____1101 ________
    CARRY_LARGE | SURROGATE,
    // ____111_ ________
    CARRY_LARGE,
    CARRY_LARGE
  );

  // Table 3: indexed by the high nibble of byte 2.
  const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
    // ________ 0_______ <ASCII in byte 2>
    TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
    TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
    // ________ 1000____
    ANY_CONT | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
    // ________ 1001____
    ANY_CONT | OVERLONG_3 | TOO_LARGE,
    // ________ 101_____
    ANY_CONT | SURROGATE | TOO_LARGE,
    ANY_CONT | SURROGATE | TOO_LARGE,
    // ________ 11______
    TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
  );

  return (byte_1_high & byte_1_low & byte_2_high);
}
|
|
/**
 * Combines the special-case flags with the "a continuation is required here" check.
 *
 * must_be_2_3_continuation() is evaluated on the bytes two and three positions
 * back; its result is reduced to bit 7 (0x80) and XORed with the special-case
 * flags `sc`, whose bit 7 is the TWO_CONTS flag.
 *
 * @param input      the current bytes
 * @param prev_input the bytes of the preceding chunk (supplies the look-back bytes)
 * @param sc         the result of check_special_cases() for `input`
 * @return per-byte error bits; nonzero anywhere means invalid UTF-8
 */
simdjson_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
    const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
  const simd8<uint8_t> two_back = input.prev<2>(prev_input);
  const simd8<uint8_t> three_back = input.prev<3>(prev_input);
  // Keep the explicit simd8<uint8_t> type: the helper's result is converted here.
  simd8<uint8_t> needs_continuation = must_be_2_3_continuation(two_back, three_back);
  simd8<uint8_t> needs_continuation_bit7 = needs_continuation & uint8_t(0x80);
  return needs_continuation_bit7 ^ sc;
}
|
|
|
|
//
|
|
// Return nonzero if there are incomplete multibyte characters at the end of the block:
|
|
// e.g. if there is a 4-byte character, but it's 3 bytes from the end.
|
|
//
|
|
simdjson_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input) {
|
|
// If the previous input's last 3 bytes match this, they're too short (they ended at EOF):
|
|
// ... 1111____ 111_____ 11______
|
|
#if SIMDJSON_IMPLEMENTATION_ICELAKE
|
|
static const uint8_t max_array[64] = {
|
|
255, 255, 255, 255, 255, 255, 255, 255,
|
|
255, 255, 255, 255, 255, 255, 255, 255,
|
|
255, 255, 255, 255, 255, 255, 255, 255,
|
|
255, 255, 255, 255, 255, 255, 255, 255,
|
|
255, 255, 255, 255, 255, 255, 255, 255,
|
|
255, 255, 255, 255, 255, 255, 255, 255,
|
|
255, 255, 255, 255, 255, 255, 255, 255,
|
|
255, 255, 255, 255, 255, 0xf0u-1, 0xe0u-1, 0xc0u-1
|
|
};
|
|
#else
|
|
static const uint8_t max_array[32] = {
|
|
255, 255, 255, 255, 255, 255, 255, 255,
|
|
255, 255, 255, 255, 255, 255, 255, 255,
|
|
255, 255, 255, 255, 255, 255, 255, 255,
|
|
255, 255, 255, 255, 255, 0xf0u-1, 0xe0u-1, 0xc0u-1
|
|
};
|
|
#endif
|
|
const simd8<uint8_t> max_value(&max_array[sizeof(max_array)-sizeof(simd8<uint8_t>)]);
|
|
return input.gt_bits(max_value);
|
|
}
|
|
|
|
struct utf8_checker {
|
|
// If this is nonzero, there has been a UTF-8 error.
|
|
simd8<uint8_t> error;
|
|
// The last input we received
|
|
simd8<uint8_t> prev_input_block;
|
|
// Whether the last input we received was incomplete (used for ASCII fast path)
|
|
simd8<uint8_t> prev_incomplete;
|
|
|
|
//
|
|
// Check whether the current bytes are valid UTF-8.
|
|
//
|
|
simdjson_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
|
|
// Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
|
|
// (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
|
|
simd8<uint8_t> prev1 = input.prev<1>(prev_input);
|
|
simd8<uint8_t> sc = check_special_cases(input, prev1);
|
|
this->error |= check_multibyte_lengths(input, prev_input, sc);
|
|
}
|
|
|
|
// The only problem that can happen at EOF is that a multibyte character is too short
|
|
// or a byte value too large in the last bytes: check_special_cases only checks for bytes
|
|
// too large in the first of two bytes.
|
|
simdjson_inline void check_eof() {
|
|
// If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
|
|
// possibly finish them.
|
|
this->error |= this->prev_incomplete;
|
|
}
|
|
|
|
simdjson_inline void check_next_input(const simd8x64<uint8_t>& input) {
|
|
if(simdjson_likely(is_ascii(input))) {
|
|
this->error |= this->prev_incomplete;
|
|
} else {
|
|
// you might think that a for-loop would work, but under Visual Studio, it is not good enough.
|
|
static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 1)
|
|
||(simd8x64<uint8_t>::NUM_CHUNKS == 2)
|
|
|| (simd8x64<uint8_t>::NUM_CHUNKS == 4),
|
|
"We support one, two or four chunks per 64-byte block.");
|
|
SIMDJSON_IF_CONSTEXPR (simd8x64<uint8_t>::NUM_CHUNKS == 1) {
|
|
this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
|
|
} else SIMDJSON_IF_CONSTEXPR (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
|
|
this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
|
|
this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
|
|
} else SIMDJSON_IF_CONSTEXPR (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
|
|
this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
|
|
this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
|
|
this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
|
|
this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
|
|
}
|
|
this->prev_incomplete = is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1]);
|
|
this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1];
|
|
}
|
|
}
|
|
// do not forget to call check_eof!
|
|
simdjson_inline error_code errors() {
|
|
return this->error.any_bits_set_anywhere() ? error_code::UTF8_ERROR : error_code::SUCCESS;
|
|
}
|
|
|
|
}; // struct utf8_checker
|
|
} // namespace utf8_validation
|
|
|
|
} // unnamed namespace
|
|
} // namespace ppc64
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE1_UTF8_LOOKUP4_ALGORITHM_H
|
|
/* end file generic/stage1/utf8_lookup4_algorithm.h for ppc64 */
|
|
/* including generic/stage1/json_scanner.h for ppc64: #include <generic/stage1/json_scanner.h> */
|
|
/* begin file generic/stage1/json_scanner.h for ppc64 */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_JSON_SCANNER_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_JSON_SCANNER_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/json_character_block.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/json_string_scanner.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace ppc64 {
|
|
namespace {
|
|
namespace stage1 {
|
|
|
|
/**
|
|
* A block of scanned json, with information on operators and scalars.
|
|
*
|
|
* We seek to identify pseudo-structural characters. Anything that is inside
|
|
* a string must be omitted (hence & ~_string.string_tail()).
|
|
* Otherwise, pseudo-structural characters come in two forms.
|
|
* 1. We have the structural characters ([,],{,},:, comma). The
|
|
* term 'structural character' is from the JSON RFC.
|
|
* 2. We have the 'scalar pseudo-structural characters'.
|
|
* Scalars are quotes, and any character except structural characters and white space.
|
|
*
|
|
* To identify the scalar pseudo-structural characters, we must look at what comes
|
|
* before them: it must be a space, a quote or a structural characters.
|
|
* Starting with simdjson v0.3, we identify them by
|
|
* negation: we identify everything that is followed by a non-quote scalar,
|
|
* and we negate that. Whatever remains must be a 'scalar pseudo-structural character'.
|
|
*/
|
|
struct json_block {
|
|
public:
|
|
// We spell out the constructors in the hope of resolving inlining issues with Visual Studio 2017
|
|
simdjson_inline json_block(json_string_block&& string, json_character_block characters, uint64_t follows_potential_nonquote_scalar) :
|
|
_string(std::move(string)), _characters(characters), _follows_potential_nonquote_scalar(follows_potential_nonquote_scalar) {}
|
|
simdjson_inline json_block(json_string_block string, json_character_block characters, uint64_t follows_potential_nonquote_scalar) :
|
|
_string(string), _characters(characters), _follows_potential_nonquote_scalar(follows_potential_nonquote_scalar) {}
|
|
|
|
/**
|
|
* The start of structurals.
|
|
* In simdjson prior to v0.3, these were called the pseudo-structural characters.
|
|
**/
|
|
simdjson_inline uint64_t structural_start() const noexcept { return potential_structural_start() & ~_string.string_tail(); }
|
|
/** All JSON whitespace (i.e. not in a string) */
|
|
simdjson_inline uint64_t whitespace() const noexcept { return non_quote_outside_string(_characters.whitespace()); }
|
|
|
|
// Helpers
|
|
|
|
/** Whether the given characters are inside a string (only works on non-quotes) */
|
|
simdjson_inline uint64_t non_quote_inside_string(uint64_t mask) const noexcept { return _string.non_quote_inside_string(mask); }
|
|
/** Whether the given characters are outside a string (only works on non-quotes) */
|
|
simdjson_inline uint64_t non_quote_outside_string(uint64_t mask) const noexcept { return _string.non_quote_outside_string(mask); }
|
|
|
|
// string and escape characters
|
|
json_string_block _string;
|
|
// whitespace, structural characters ('operators'), scalars
|
|
json_character_block _characters;
|
|
// whether the previous character was a scalar
|
|
uint64_t _follows_potential_nonquote_scalar;
|
|
private:
|
|
// Potential structurals (i.e. disregarding strings)
|
|
|
|
/**
|
|
* structural elements ([,],{,},:, comma) plus scalar starts like 123, true and "abc".
|
|
* They may reside inside a string.
|
|
**/
|
|
simdjson_inline uint64_t potential_structural_start() const noexcept { return _characters.op() | potential_scalar_start(); }
|
|
/**
|
|
* The start of non-operator runs, like 123, true and "abc".
|
|
* It main reside inside a string.
|
|
**/
|
|
simdjson_inline uint64_t potential_scalar_start() const noexcept {
|
|
// The term "scalar" refers to anything except structural characters and white space
|
|
// (so letters, numbers, quotes).
|
|
// Whenever it is preceded by something that is not a structural element ({,},[,],:, ") nor a white-space
|
|
// then we know that it is irrelevant structurally.
|
|
return _characters.scalar() & ~follows_potential_scalar();
|
|
}
|
|
/**
|
|
* Whether the given character is immediately after a non-operator like 123, true.
|
|
* The characters following a quote are not included.
|
|
*/
|
|
simdjson_inline uint64_t follows_potential_scalar() const noexcept {
|
|
// _follows_potential_nonquote_scalar: is defined as marking any character that follows a character
|
|
// that is not a structural element ({,},[,],:, comma) nor a quote (") and that is not a
|
|
// white space.
|
|
// It is understood that within quoted region, anything at all could be marked (irrelevant).
|
|
return _follows_potential_nonquote_scalar;
|
|
}
|
|
};
|
|
|
|
/**
|
|
* Scans JSON for important bits: structural characters or 'operators', strings, and scalars.
|
|
*
|
|
* The scanner starts by calculating two distinct things:
|
|
* - string characters (taking \" into account)
|
|
* - structural characters or 'operators' ([]{},:, comma)
|
|
* and scalars (runs of non-operators like 123, true and "abc")
|
|
*
|
|
* To minimize data dependency (a key component of the scanner's speed), it finds these in parallel:
|
|
* in particular, the operator/scalar bit will find plenty of things that are actually part of
|
|
* strings. When we're done, json_block will fuse the two together by masking out tokens that are
|
|
* part of a string.
|
|
*/
|
|
class json_scanner {
|
|
public:
|
|
json_scanner() = default;
|
|
simdjson_inline json_block next(const simd::simd8x64<uint8_t>& in);
|
|
// Returns either UNCLOSED_STRING or SUCCESS
|
|
simdjson_inline error_code finish();
|
|
|
|
private:
|
|
// Whether the last character of the previous iteration is part of a scalar token
|
|
// (anything except whitespace or a structural character/'operator').
|
|
uint64_t prev_scalar = 0ULL;
|
|
json_string_scanner string_scanner{};
|
|
};
|
|
|
|
|
|
//
// Marks every position that immediately follows a set bit of `match`.
//
// The mask is shifted up by one position and the carry from the previous
// 64-bit block (`overflow`, expected to be 0 or 1) is placed in bit 0.
// On return, `overflow` holds the top bit of `match` for the next block.
//
// For example, to find quotes with a backslash directly in front of them:
//
//   const uint64_t backslashed_quote = in.eq('"') & follows(in.eq('\\'), prev_backslash);
//
simdjson_inline uint64_t follows(const uint64_t match, uint64_t &overflow) {
  const uint64_t carry_in = overflow;
  overflow = match >> 63;
  return (match << 1) | carry_in;
}
|
|
|
|
/**
 * Scans one 64-byte block.
 *
 * "Scalar" means anything except structural characters and white space (so
 * letters, numbers, quotes). The returned mask marks every position that
 * follows a NON-quote scalar character.
 *
 * Quotes are left out on purpose: a closing quote should be followed by a
 * structural character or nothing, but ' "a string"true ' must still flag the
 * 't' of 'true' as a potential pseudo-structural character, exactly as
 * ' "a string" true ' would; otherwise string parsing might need an extra check.
 */
simdjson_inline json_block json_scanner::next(const simd::simd8x64<uint8_t>& in) {
  // String state first: this advances the string scanner.
  json_string_block string_info = string_scanner.next(in);
  // Then white space and structural characters.
  json_character_block char_classes = json_character_block::classify(in);

  // Performance: there are many ways to skin this cat.
  const uint64_t scalar_not_quote = char_classes.scalar() & ~string_info.quote();
  uint64_t after_scalar = follows(scalar_not_quote, prev_scalar);

  // string_info is passed as an lvalue, so json_block's by-value constructor is used.
  return json_block(string_info, char_classes, after_scalar);
}
|
|
|
|
/**
 * Ends the scan; the result is whatever the string scanner reports.
 */
simdjson_inline error_code json_scanner::finish() {
  const error_code string_status = string_scanner.finish();
  return string_status;
}
|
|
|
|
} // namespace stage1
|
|
} // unnamed namespace
|
|
} // namespace ppc64
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE1_JSON_SCANNER_H
|
|
/* end file generic/stage1/json_scanner.h for ppc64 */
|
|
|
|
// All other declarations
|
|
/* including generic/stage1/find_next_document_index.h for ppc64: #include <generic/stage1/find_next_document_index.h> */
|
|
/* begin file generic/stage1/find_next_document_index.h for ppc64 */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_FIND_NEXT_DOCUMENT_INDEX_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_FIND_NEXT_DOCUMENT_INDEX_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/generic/dom_parser_implementation.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace ppc64 {
|
|
namespace {
|
|
namespace stage1 {
|
|
|
|
/**
|
|
* This algorithm is used to quickly identify the last structural position that
|
|
* makes up a complete document.
|
|
*
|
|
* It does this by going backwards and finding the last *document boundary* (a
|
|
* place where one value follows another without a comma between them). If the
|
|
* last document (the characters after the boundary) has an equal number of
|
|
* start and end brackets, it is considered complete.
|
|
*
|
|
* Simply put, we iterate over the structural characters, starting from
|
|
* the end. We consider that we found the end of a JSON document when the
|
|
* first element of the pair is NOT one of these characters: '{' '[' ':' ','
|
|
* and when the second element is NOT one of these characters: '}' ']' ':' ','.
|
|
*
|
|
* This simple comparison works most of the time, but it does not cover cases
|
|
* where the batch's structural indexes contain a perfect amount of documents.
|
|
* In such a case, we do not have access to the structural index which follows
|
|
* the last document, therefore, we do not have access to the second element in
|
|
* the pair, and that means we cannot identify the last document. To fix this
|
|
* issue, we keep a count of the open and closed curly/square braces we found
|
|
* while searching for the pair. When we find a pair AND the count of open and
|
|
* closed curly/square braces is the same, we know that we just passed a
|
|
* complete document, therefore the last json buffer location is the end of the
|
|
* batch.
|
|
*/
|
|
simdjson_inline uint32_t find_next_document_index(dom_parser_implementation &parser) {
|
|
// Variant: do not count separately, just figure out depth
|
|
if(parser.n_structural_indexes == 0) { return 0; }
|
|
auto arr_cnt = 0;
|
|
auto obj_cnt = 0;
|
|
for (auto i = parser.n_structural_indexes - 1; i > 0; i--) {
|
|
auto idxb = parser.structural_indexes[i];
|
|
switch (parser.buf[idxb]) {
|
|
case ':':
|
|
case ',':
|
|
continue;
|
|
case '}':
|
|
obj_cnt--;
|
|
continue;
|
|
case ']':
|
|
arr_cnt--;
|
|
continue;
|
|
case '{':
|
|
obj_cnt++;
|
|
break;
|
|
case '[':
|
|
arr_cnt++;
|
|
break;
|
|
}
|
|
auto idxa = parser.structural_indexes[i - 1];
|
|
switch (parser.buf[idxa]) {
|
|
case '{':
|
|
case '[':
|
|
case ':':
|
|
case ',':
|
|
continue;
|
|
}
|
|
// Last document is complete, so the next document will appear after!
|
|
if (!arr_cnt && !obj_cnt) {
|
|
return parser.n_structural_indexes;
|
|
}
|
|
// Last document is incomplete; mark the document at i + 1 as the next one
|
|
return i;
|
|
}
|
|
// If we made it to the end, we want to finish counting to see if we have a full document.
|
|
switch (parser.buf[parser.structural_indexes[0]]) {
|
|
case '}':
|
|
obj_cnt--;
|
|
break;
|
|
case ']':
|
|
arr_cnt--;
|
|
break;
|
|
case '{':
|
|
obj_cnt++;
|
|
break;
|
|
case '[':
|
|
arr_cnt++;
|
|
break;
|
|
}
|
|
if (!arr_cnt && !obj_cnt) {
|
|
// We have a complete document.
|
|
return parser.n_structural_indexes;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
} // namespace stage1
|
|
} // unnamed namespace
|
|
} // namespace ppc64
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE1_FIND_NEXT_DOCUMENT_INDEX_H
|
|
/* end file generic/stage1/find_next_document_index.h for ppc64 */
|
|
/* including generic/stage1/json_minifier.h for ppc64: #include <generic/stage1/json_minifier.h> */
|
|
/* begin file generic/stage1/json_minifier.h for ppc64 */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_JSON_MINIFIER_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_JSON_MINIFIER_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/json_scanner.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/buf_block_reader.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
// This file contains the common code every implementation uses in stage1
|
|
// It is intended to be included multiple times and compiled multiple times
|
|
// We assume the file in which it is included already includes
// "simdjson/stage1.h" (this simplifies amalgamation)
|
|
|
|
namespace simdjson {
|
|
namespace ppc64 {
|
|
namespace {
|
|
namespace stage1 {
|
|
|
|
class json_minifier {
|
|
public:
|
|
template<size_t STEP_SIZE>
|
|
static error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) noexcept;
|
|
|
|
private:
|
|
simdjson_inline json_minifier(uint8_t *_dst)
|
|
: dst{_dst}
|
|
{}
|
|
template<size_t STEP_SIZE>
|
|
simdjson_inline void step(const uint8_t *block_buf, buf_block_reader<STEP_SIZE> &reader) noexcept;
|
|
simdjson_inline void next(const simd::simd8x64<uint8_t>& in, const json_block& block);
|
|
simdjson_inline error_code finish(uint8_t *dst_start, size_t &dst_len);
|
|
json_scanner scanner{};
|
|
uint8_t *dst;
|
|
};
|
|
|
|
/**
 * Writes `in` to the output with the block's white-space positions removed,
 * and moves `dst` forward by the amount compress() reports.
 */
simdjson_inline void json_minifier::next(const simd::simd8x64<uint8_t>& in, const json_block& block) {
  const uint64_t skip_mask = block.whitespace();
  const auto written = in.compress(skip_mask, dst);
  dst += written;
}
|
|
|
|
/**
 * Completes minification.
 *
 * @param dst_start start of the output buffer
 * @param dst_len   receives the number of bytes written, or 0 if the scanner reports an error
 * @return the scanner's error, or SUCCESS
 */
simdjson_inline error_code json_minifier::finish(uint8_t *dst_start, size_t &dst_len) {
  const error_code scan_status = scanner.finish();
  if (scan_status) {
    dst_len = 0;
    return scan_status;
  }
  dst_len = dst - dst_start;
  return SUCCESS;
}
|
|
|
|
template<>
|
|
simdjson_inline void json_minifier::step<128>(const uint8_t *block_buf, buf_block_reader<128> &reader) noexcept {
|
|
simd::simd8x64<uint8_t> in_1(block_buf);
|
|
simd::simd8x64<uint8_t> in_2(block_buf+64);
|
|
json_block block_1 = scanner.next(in_1);
|
|
json_block block_2 = scanner.next(in_2);
|
|
this->next(in_1, block_1);
|
|
this->next(in_2, block_2);
|
|
reader.advance();
|
|
}
|
|
|
|
template<>
|
|
simdjson_inline void json_minifier::step<64>(const uint8_t *block_buf, buf_block_reader<64> &reader) noexcept {
|
|
simd::simd8x64<uint8_t> in_1(block_buf);
|
|
json_block block_1 = scanner.next(in_1);
|
|
this->next(block_buf, block_1);
|
|
reader.advance();
|
|
}
|
|
|
|
template<size_t STEP_SIZE>
|
|
error_code json_minifier::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) noexcept {
|
|
buf_block_reader<STEP_SIZE> reader(buf, len);
|
|
json_minifier minifier(dst);
|
|
|
|
// Index the first n-1 blocks
|
|
while (reader.has_full_block()) {
|
|
minifier.step<STEP_SIZE>(reader.full_block(), reader);
|
|
}
|
|
|
|
// Index the last (remainder) block, padded with spaces
|
|
uint8_t block[STEP_SIZE];
|
|
size_t remaining_bytes = reader.get_remainder(block);
|
|
if (remaining_bytes > 0) {
|
|
// We do not want to write directly to the output stream. Rather, we write
|
|
// to a local buffer (for safety).
|
|
uint8_t out_block[STEP_SIZE];
|
|
uint8_t * const guarded_dst{minifier.dst};
|
|
minifier.dst = out_block;
|
|
minifier.step<STEP_SIZE>(block, reader);
|
|
size_t to_write = minifier.dst - out_block;
|
|
// In some cases, we could be enticed to consider the padded spaces
|
|
// as part of the string. This is fine as long as we do not write more
|
|
// than we consumed.
|
|
if(to_write > remaining_bytes) { to_write = remaining_bytes; }
|
|
memcpy(guarded_dst, out_block, to_write);
|
|
minifier.dst = guarded_dst + to_write;
|
|
}
|
|
return minifier.finish(dst, dst_len);
|
|
}
|
|
|
|
} // namespace stage1
|
|
} // unnamed namespace
|
|
} // namespace ppc64
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE1_JSON_MINIFIER_H
|
|
/* end file generic/stage1/json_minifier.h for ppc64 */
|
|
/* including generic/stage1/json_structural_indexer.h for ppc64: #include <generic/stage1/json_structural_indexer.h> */
|
|
/* begin file generic/stage1/json_structural_indexer.h for ppc64 */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_JSON_STRUCTURAL_INDEXER_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_JSON_STRUCTURAL_INDEXER_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/utf8_lookup4_algorithm.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/buf_block_reader.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/json_string_scanner.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/json_scanner.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/json_minifier.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/find_next_document_index.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
// This file contains the common code every implementation uses in stage1
|
|
// It is intended to be included multiple times and compiled multiple times
|
|
// We assume the file in which it is included already includes
// "simdjson/stage1.h" (this simplifies amalgamation)
|
|
|
|
namespace simdjson {
|
|
namespace ppc64 {
|
|
namespace {
|
|
namespace stage1 {
|
|
|
|
class bit_indexer {
|
|
public:
|
|
uint32_t *tail;
|
|
|
|
simdjson_inline bit_indexer(uint32_t *index_buf) : tail(index_buf) {}
|
|
|
|
#if SIMDJSON_PREFER_REVERSE_BITS
|
|
/**
|
|
* ARM lacks a fast trailing zero instruction, but it has a fast
|
|
* bit reversal instruction and a fast leading zero instruction.
|
|
* Thus it may be profitable to reverse the bits (once) and then
|
|
* to rely on a sequence of instructions that call the leading
|
|
* zero instruction.
|
|
*
|
|
* Performance notes:
|
|
* The chosen routine is not optimal in terms of data dependency
|
|
* since zero_leading_bit might require two instructions. However,
|
|
* it tends to minimize the total number of instructions which is
|
|
* beneficial.
|
|
*/
|
|
simdjson_inline void write_index(uint32_t idx, uint64_t& rev_bits, int i) {
|
|
int lz = leading_zeroes(rev_bits);
|
|
this->tail[i] = static_cast<uint32_t>(idx) + lz;
|
|
rev_bits = zero_leading_bit(rev_bits, lz);
|
|
}
|
|
#else
|
|
/**
|
|
* Under recent x64 systems, we often have both a fast trailing zero
|
|
* instruction and a fast 'clear-lower-bit' instruction so the following
|
|
* algorithm can be competitive.
|
|
*/
|
|
|
|
simdjson_inline void write_index(uint32_t idx, uint64_t& bits, int i) {
|
|
this->tail[i] = idx + trailing_zeroes(bits);
|
|
bits = clear_lowest_bit(bits);
|
|
}
|
|
#endif // SIMDJSON_PREFER_REVERSE_BITS
|
|
|
|
template <int START, int N>
|
|
simdjson_inline int write_indexes(uint32_t idx, uint64_t& bits) {
|
|
write_index(idx, bits, START);
|
|
SIMDJSON_IF_CONSTEXPR (N > 1) {
|
|
write_indexes<(N-1>0?START+1:START), (N-1>=0?N-1:1)>(idx, bits);
|
|
}
|
|
return START+N;
|
|
}
|
|
|
|
template <int START, int END, int STEP>
|
|
simdjson_inline int write_indexes_stepped(uint32_t idx, uint64_t& bits, int cnt) {
|
|
write_indexes<START, STEP>(idx, bits);
|
|
SIMDJSON_IF_CONSTEXPR ((START+STEP) < END) {
|
|
if (simdjson_unlikely((START+STEP) < cnt)) {
|
|
write_indexes_stepped<(START+STEP<END?START+STEP:END), END, STEP>(idx, bits, cnt);
|
|
}
|
|
}
|
|
return ((END-START) % STEP) == 0 ? END : (END-START) - ((END-START) % STEP) + STEP;
|
|
}
|
|
|
|
// flatten out values in 'bits' assuming that they are are to have values of idx
|
|
// plus their position in the bitvector, and store these indexes at
|
|
// base_ptr[base] incrementing base as we go
|
|
// will potentially store extra values beyond end of valid bits, so base_ptr
|
|
// needs to be large enough to handle this
|
|
//
|
|
// If the kernel sets SIMDJSON_GENERIC_JSON_STRUCTURAL_INDEXER_CUSTOM_BIT_INDEXER, then it
|
|
// will provide its own version of the code.
|
|
#ifdef SIMDJSON_GENERIC_JSON_STRUCTURAL_INDEXER_CUSTOM_BIT_INDEXER
|
|
simdjson_inline void write(uint32_t idx, uint64_t bits);
|
|
#else
|
|
simdjson_inline void write(uint32_t idx, uint64_t bits) {
|
|
// In some instances, the next branch is expensive because it is mispredicted.
|
|
// Unfortunately, in other cases,
|
|
// it helps tremendously.
|
|
if (bits == 0)
|
|
return;
|
|
|
|
int cnt = static_cast<int>(count_ones(bits));
|
|
|
|
#if SIMDJSON_PREFER_REVERSE_BITS
|
|
bits = reverse_bits(bits);
|
|
#endif
|
|
#ifdef SIMDJSON_STRUCTURAL_INDEXER_STEP
|
|
static constexpr const int STEP = SIMDJSON_STRUCTURAL_INDEXER_STEP;
|
|
#else
|
|
static constexpr const int STEP = 4;
|
|
#endif
|
|
static constexpr const int STEP_UNTIL = 24;
|
|
|
|
write_indexes_stepped<0, STEP_UNTIL, STEP>(idx, bits, cnt);
|
|
SIMDJSON_IF_CONSTEXPR (STEP_UNTIL < 64) {
|
|
if (simdjson_unlikely(STEP_UNTIL < cnt)) {
|
|
for (int i=STEP_UNTIL; i<cnt; i++) {
|
|
write_index(idx, bits, i);
|
|
}
|
|
}
|
|
}
|
|
|
|
this->tail += cnt;
|
|
}
|
|
#endif // SIMDJSON_GENERIC_JSON_STRUCTURAL_INDEXER_CUSTOM_BIT_INDEXER
|
|
|
|
};
|
|
|
|
class json_structural_indexer {
|
|
public:
|
|
/**
|
|
* Find the important bits of JSON in a 128-byte chunk, and add them to structural_indexes.
|
|
*
|
|
* @param partial Setting the partial parameter to true allows the find_structural_bits to
|
|
* tolerate unclosed strings. The caller should still ensure that the input is valid UTF-8. If
|
|
* you are processing substrings, you may want to call on a function like trimmed_length_safe_utf8.
|
|
*/
|
|
template<size_t STEP_SIZE>
|
|
static error_code index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, stage1_mode partial) noexcept;
|
|
|
|
private:
|
|
simdjson_inline json_structural_indexer(uint32_t *structural_indexes);
|
|
template<size_t STEP_SIZE>
|
|
simdjson_inline void step(const uint8_t *block, buf_block_reader<STEP_SIZE> &reader) noexcept;
|
|
simdjson_inline void next(const simd::simd8x64<uint8_t>& in, const json_block& block, size_t idx);
|
|
simdjson_inline error_code finish(dom_parser_implementation &parser, size_t idx, size_t len, stage1_mode partial);
|
|
|
|
json_scanner scanner{};
|
|
utf8_checker checker{};
|
|
bit_indexer indexer;
|
|
uint64_t prev_structurals = 0;
|
|
uint64_t unescaped_chars_error = 0;
|
|
};
|
|
|
|
// Points the bit indexer at the start of the structural index buffer.
simdjson_inline json_structural_indexer::json_structural_indexer(uint32_t *structural_indexes)
  : indexer{structural_indexes}
{}
|
|
|
|
// Skip the last character if it is partial
|
|
simdjson_inline size_t trim_partial_utf8(const uint8_t *buf, size_t len) {
|
|
if (simdjson_unlikely(len < 3)) {
|
|
switch (len) {
|
|
case 2:
|
|
if (buf[len-1] >= 0xc0) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left
|
|
if (buf[len-2] >= 0xe0) { return len-2; } // 3- and 4-byte characters with only 2 bytes left
|
|
return len;
|
|
case 1:
|
|
if (buf[len-1] >= 0xc0) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left
|
|
return len;
|
|
case 0:
|
|
return len;
|
|
}
|
|
}
|
|
if (buf[len-1] >= 0xc0) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left
|
|
if (buf[len-2] >= 0xe0) { return len-2; } // 3- and 4-byte characters with only 1 byte left
|
|
if (buf[len-3] >= 0xf0) { return len-3; } // 4-byte characters with only 3 bytes left
|
|
return len;
|
|
}
|
|
|
|
//
|
|
// PERF NOTES:
|
|
// We pipe 2 inputs through these stages:
|
|
// 1. Load JSON into registers. This takes a long time and is highly parallelizable, so we load
|
|
// 2 inputs' worth at once so that by the time step 2 is looking for the input, it's available.
|
|
// 2. Scan the JSON for critical data: strings, scalars and operators. This is the critical path.
|
|
// The output of step 1 depends entirely on this information. These functions don't quite use
|
|
// up enough CPU: the second half of the functions is highly serial, only using 1 execution core
|
|
// at a time. The second input's scans has some dependency on the first ones finishing it, but
|
|
// they can make a lot of progress before they need that information.
|
|
// 3. Step 1 does not use enough capacity, so we run some extra stuff while we're waiting for that
|
|
// to finish: utf-8 checks and generating the output from the last iteration.
|
|
//
|
|
// The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all
|
|
// available capacity with just one input. Running 2 at a time seems to give the CPU a good enough
|
|
// workout.
|
|
//
|
|
template<size_t STEP_SIZE>
|
|
error_code json_structural_indexer::index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, stage1_mode partial) noexcept {
|
|
if (simdjson_unlikely(len > parser.capacity())) { return CAPACITY; }
|
|
// We guard the rest of the code so that we can assume that len > 0 throughout.
|
|
if (len == 0) { return EMPTY; }
|
|
if (is_streaming(partial)) {
|
|
len = trim_partial_utf8(buf, len);
|
|
// If you end up with an empty window after trimming
|
|
// the partial UTF-8 bytes, then chances are good that you
|
|
// have an UTF-8 formatting error.
|
|
if(len == 0) { return UTF8_ERROR; }
|
|
}
|
|
buf_block_reader<STEP_SIZE> reader(buf, len);
|
|
json_structural_indexer indexer(parser.structural_indexes.get());
|
|
|
|
// Read all but the last block
|
|
while (reader.has_full_block()) {
|
|
indexer.step<STEP_SIZE>(reader.full_block(), reader);
|
|
}
|
|
// Take care of the last block (will always be there unless file is empty which is
|
|
// not supposed to happen.)
|
|
uint8_t block[STEP_SIZE];
|
|
if (simdjson_unlikely(reader.get_remainder(block) == 0)) { return UNEXPECTED_ERROR; }
|
|
indexer.step<STEP_SIZE>(block, reader);
|
|
return indexer.finish(parser, reader.block_index(), len, partial);
|
|
}
|
|
|
|
template<>
|
|
simdjson_inline void json_structural_indexer::step<128>(const uint8_t *block, buf_block_reader<128> &reader) noexcept {
|
|
simd::simd8x64<uint8_t> in_1(block);
|
|
simd::simd8x64<uint8_t> in_2(block+64);
|
|
json_block block_1 = scanner.next(in_1);
|
|
json_block block_2 = scanner.next(in_2);
|
|
this->next(in_1, block_1, reader.block_index());
|
|
this->next(in_2, block_2, reader.block_index()+64);
|
|
reader.advance();
|
|
}
|
|
|
|
template<>
|
|
simdjson_inline void json_structural_indexer::step<64>(const uint8_t *block, buf_block_reader<64> &reader) noexcept {
|
|
simd::simd8x64<uint8_t> in_1(block);
|
|
json_block block_1 = scanner.next(in_1);
|
|
this->next(in_1, block_1, reader.block_index());
|
|
reader.advance();
|
|
}
|
|
|
|
// Consumes one scanned 64-byte input located at offset idx. The structurals of the *previous*
// input are written out here (hence idx-64); the current ones are stashed for the next call.
simdjson_inline void json_structural_indexer::next(const simd::simd8x64<uint8_t>& in, const json_block& block, size_t idx) {
  // Bytes <= 0x1F (ASCII control characters) in this input.
  const uint64_t control_chars = in.lteq(0x1F);
#if SIMDJSON_UTF8VALIDATION
  checker.check_next_input(in);
#endif
  // Output *last* iteration's structurals to the parser
  indexer.write(uint32_t(idx-64), prev_structurals);
  prev_structurals = block.structural_start();
  unescaped_chars_error |= block.non_quote_inside_string(control_chars);
}
|
|
|
|
// Final stage 1 bookkeeping: flushes the last block's structurals, converts scanner state into
// an error code, pads the structural index array, and (in streaming modes) truncates the index
// list to the end of the last complete document.
simdjson_inline error_code json_structural_indexer::finish(dom_parser_implementation &parser, size_t idx, size_t len, stage1_mode partial) {
  // Write out the final iteration's structurals
  indexer.write(uint32_t(idx-64), prev_structurals);

  const error_code scan_result = scanner.finish();
  const bool have_unclosed_string = (scan_result == UNCLOSED_STRING);
  // Any scanner error is fatal, except that when streaming we tolerate UNCLOSED_STRING.
  bool fatal_scan_error = (scan_result != SUCCESS);
  if (is_streaming(partial) && have_unclosed_string) { fatal_scan_error = false; }
  if (simdjson_unlikely(fatal_scan_error)) { return scan_result; }

  if (unescaped_chars_error) { return UNESCAPED_CHARS; }

  uint32_t *const indexes = parser.structural_indexes.get();
  const uint32_t end_of_input = uint32_t(len);
  parser.n_structural_indexes = uint32_t(indexer.tail - indexes);
  /***
   * The On Demand API requires special padding.
   *
   * This is related to https://github.com/simdjson/simdjson/issues/906
   * Basically, we want to make sure that if the parsing continues beyond the last (valid)
   * structural character, it quickly stops.
   * Only three structural characters can be repeated without triggering an error in JSON: [,] and }.
   * We repeat the padding character (at 'len'). We don't know what it is, but if the parsing
   * continues, then it must be [,] or }.
   * Suppose it is ] or }. We backtrack to the first character, what could it be that would
   * not trigger an error? It could be ] or } but no, because you can't start a document that way.
   * It can't be a comma, a colon or any simple value. So the only way we could continue is
   * if the repeated character is [. But if so, the document must start with [. But if the document
   * starts with [, it should end with ]. If we enforce that rule, then we would get
   * ][[ which is invalid.
   *
   * This is illustrated with the test array_iterate_unclosed_error() on the following input:
   * R"({ "a": [,,)"
   **/
  indexes[parser.n_structural_indexes] = end_of_input; // used later in partial == stage1_mode::streaming_final
  indexes[parser.n_structural_indexes + 1] = end_of_input;
  indexes[parser.n_structural_indexes + 2] = 0;
  parser.next_structural_index = 0;

  // a valid JSON file cannot have zero structural indexes - we should have found something
  if (simdjson_unlikely(parser.n_structural_indexes == 0u)) { return EMPTY; }
  if (simdjson_unlikely(indexes[parser.n_structural_indexes - 1] > len)) { return UNEXPECTED_ERROR; }

  switch (partial) {
    case stage1_mode::streaming_partial: {
      // If we have an unclosed string, then the last structural
      // will be the quote and we want to make sure to omit it.
      if (have_unclosed_string) {
        parser.n_structural_indexes--;
        // a valid JSON file cannot have zero structural indexes - we should have found something
        if (simdjson_unlikely(parser.n_structural_indexes == 0u)) { return CAPACITY; }
      }
      // We truncate the input to the end of the last complete document (or zero).
      auto new_structural_indexes = find_next_document_index(parser);
      if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) {
        if (indexes[0] == 0) {
          // If the buffer is partial and we started at index 0 but the document is
          // incomplete, it's too big to parse.
          return CAPACITY;
        }
        // It is possible that the document could be parsed, we just had a lot
        // of white space.
        parser.n_structural_indexes = 0;
        return EMPTY;
      }
      parser.n_structural_indexes = new_structural_indexes;
      break;
    }
    case stage1_mode::streaming_final: {
      if (have_unclosed_string) { parser.n_structural_indexes--; }
      // We truncate the input to the end of the last complete document (or zero).
      // Because partial == stage1_mode::streaming_final, it means that we may
      // silently ignore trailing garbage. Though it sounds bad, we do it
      // deliberately because many people who have streams of JSON documents
      // will truncate them for processing. E.g., imagine that you are uncompressing
      // the data from a size file or receiving it in chunks from the network. You
      // may not know where exactly the last document will be. Meanwhile the
      // document_stream instances allow people to know the JSON documents they are
      // parsing (see the iterator.source() method).
      parser.n_structural_indexes = find_next_document_index(parser);
      // We store the initial n_structural_indexes so that the client can see
      // whether we used truncation. If initial_n_structural_indexes == parser.n_structural_indexes,
      // then this will query parser.structural_indexes[parser.n_structural_indexes] which is len,
      // otherwise, it will copy some prior index.
      indexes[parser.n_structural_indexes + 1] = indexes[parser.n_structural_indexes];
      // This next line is critical, do not change it unless you understand what you are
      // doing.
      indexes[parser.n_structural_indexes] = end_of_input;
      if (simdjson_unlikely(parser.n_structural_indexes == 0u)) {
        // We tolerate an unclosed string at the very end of the stream. Indeed, users
        // often load their data in bulk without being careful and they want us to ignore
        // the trailing garbage.
        return EMPTY;
      }
      break;
    }
    default:
      // Non-streaming mode: nothing to truncate.
      break;
  }

  checker.check_eof();
  return checker.errors();
}
|
|
|
|
} // namespace stage1
|
|
} // unnamed namespace
|
|
} // namespace ppc64
|
|
} // namespace simdjson
|
|
|
|
// Clear CUSTOM_BIT_INDEXER so other implementations can set it if they need to.
|
|
#undef SIMDJSON_GENERIC_JSON_STRUCTURAL_INDEXER_CUSTOM_BIT_INDEXER
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE1_JSON_STRUCTURAL_INDEXER_H
|
|
/* end file generic/stage1/json_structural_indexer.h for ppc64 */
|
|
/* including generic/stage1/utf8_validator.h for ppc64: #include <generic/stage1/utf8_validator.h> */
|
|
/* begin file generic/stage1/utf8_validator.h for ppc64 */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_UTF8_VALIDATOR_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_UTF8_VALIDATOR_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/buf_block_reader.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/utf8_lookup4_algorithm.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace ppc64 {
|
|
namespace {
|
|
namespace stage1 {
|
|
|
|
/**
|
|
* Validates that the string is actual UTF-8.
|
|
*/
|
|
template<class checker>
|
|
bool generic_validate_utf8(const uint8_t * input, size_t length) {
|
|
checker c{};
|
|
buf_block_reader<64> reader(input, length);
|
|
while (reader.has_full_block()) {
|
|
simd::simd8x64<uint8_t> in(reader.full_block());
|
|
c.check_next_input(in);
|
|
reader.advance();
|
|
}
|
|
uint8_t block[64]{};
|
|
reader.get_remainder(block);
|
|
simd::simd8x64<uint8_t> in(block);
|
|
c.check_next_input(in);
|
|
reader.advance();
|
|
c.check_eof();
|
|
return c.errors() == error_code::SUCCESS;
|
|
}
|
|
|
|
bool generic_validate_utf8(const char * input, size_t length) {
|
|
return generic_validate_utf8<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
|
|
}
|
|
|
|
} // namespace stage1
|
|
} // unnamed namespace
|
|
} // namespace ppc64
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE1_UTF8_VALIDATOR_H
|
|
/* end file generic/stage1/utf8_validator.h for ppc64 */
|
|
/* end file generic/stage1/amalgamated.h for ppc64 */
|
|
/* including generic/stage2/amalgamated.h for ppc64: #include <generic/stage2/amalgamated.h> */
|
|
/* begin file generic/stage2/amalgamated.h for ppc64 */
|
|
// Stuff other things depend on
|
|
/* including generic/stage2/base.h for ppc64: #include <generic/stage2/base.h> */
|
|
/* begin file generic/stage2/base.h for ppc64 */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE2_BASE_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE2_BASE_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/base.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace ppc64 {
|
|
namespace {
|
|
namespace stage2 {
|
|
|
|
class json_iterator;
|
|
class structural_iterator;
|
|
struct tape_builder;
|
|
struct tape_writer;
|
|
|
|
} // namespace stage2
|
|
} // unnamed namespace
|
|
} // namespace ppc64
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE2_BASE_H
|
|
/* end file generic/stage2/base.h for ppc64 */
|
|
/* including generic/stage2/tape_writer.h for ppc64: #include <generic/stage2/tape_writer.h> */
|
|
/* begin file generic/stage2/tape_writer.h for ppc64 */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE2_TAPE_WRITER_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE2_TAPE_WRITER_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/internal/tape_type.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
#include <cstring>
|
|
|
|
namespace simdjson {
|
|
namespace ppc64 {
|
|
namespace {
|
|
namespace stage2 {
|
|
|
|
struct tape_writer {
|
|
/** The next place to write to tape */
|
|
uint64_t *next_tape_loc;
|
|
|
|
/** Write a signed 64-bit value to tape. */
|
|
simdjson_inline void append_s64(int64_t value) noexcept;
|
|
|
|
/** Write an unsigned 64-bit value to tape. */
|
|
simdjson_inline void append_u64(uint64_t value) noexcept;
|
|
|
|
/** Write a double value to tape. */
|
|
simdjson_inline void append_double(double value) noexcept;
|
|
|
|
/**
|
|
* Append a tape entry (an 8-bit type,and 56 bits worth of value).
|
|
*/
|
|
simdjson_inline void append(uint64_t val, internal::tape_type t) noexcept;
|
|
|
|
/**
|
|
* Skip the current tape entry without writing.
|
|
*
|
|
* Used to skip the start of the container, since we'll come back later to fill it in when the
|
|
* container ends.
|
|
*/
|
|
simdjson_inline void skip() noexcept;
|
|
|
|
/**
|
|
* Skip the number of tape entries necessary to write a large u64 or i64.
|
|
*/
|
|
simdjson_inline void skip_large_integer() noexcept;
|
|
|
|
/**
|
|
* Skip the number of tape entries necessary to write a double.
|
|
*/
|
|
simdjson_inline void skip_double() noexcept;
|
|
|
|
/**
|
|
* Write a value to a known location on tape.
|
|
*
|
|
* Used to go back and write out the start of a container after the container ends.
|
|
*/
|
|
simdjson_inline static void write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept;
|
|
|
|
private:
|
|
/**
|
|
* Append both the tape entry, and a supplementary value following it. Used for types that need
|
|
* all 64 bits, such as double and uint64_t.
|
|
*/
|
|
template<typename T>
|
|
simdjson_inline void append2(uint64_t val, T val2, internal::tape_type t) noexcept;
|
|
}; // struct tape_writer
|
|
|
|
// Emits an INT64 tag entry (payload bits zero) followed by the raw 64-bit signed value.
simdjson_inline void tape_writer::append_s64(int64_t value) noexcept {
  const uint64_t no_payload = 0;
  append2(no_payload, value, internal::tape_type::INT64);
}
|
|
|
|
// Emits a UINT64 tag entry (payload bits zero) followed by the raw 64-bit unsigned value.
// Goes through append2() like append_s64()/append_double(); copying a uint64_t into the
// uint64_t tape slot stores the same bits as a direct assignment.
simdjson_inline void tape_writer::append_u64(uint64_t value) noexcept {
  append2(0, value, internal::tape_type::UINT64);
}
|
|
|
|
/** Write a double value to tape. */
|
|
simdjson_inline void tape_writer::append_double(double value) noexcept {
|
|
append2(0, value, internal::tape_type::DOUBLE);
|
|
}
|
|
|
|
// Leaves one tape slot unwritten and moves the cursor past it.
simdjson_inline void tape_writer::skip() noexcept {
  ++next_tape_loc;
}
|
|
|
|
// A large integer occupies two tape slots (tag entry + value), so move the cursor by two.
simdjson_inline void tape_writer::skip_large_integer() noexcept {
  next_tape_loc = next_tape_loc + 2;
}
|
|
|
|
// A double occupies two tape slots (tag entry + value), so move the cursor by two.
simdjson_inline void tape_writer::skip_double() noexcept {
  next_tape_loc = next_tape_loc + 2;
}
|
|
|
|
// Stores one tape entry at the cursor (type tag in the top 8 bits, val in the rest) and
// advances the cursor by one slot.
simdjson_inline void tape_writer::append(uint64_t val, internal::tape_type t) noexcept {
  write(*next_tape_loc, val, t);
  next_tape_loc++;
}
|
|
|
|
template<typename T>
|
|
simdjson_inline void tape_writer::append2(uint64_t val, T val2, internal::tape_type t) noexcept {
|
|
append(val, t);
|
|
static_assert(sizeof(val2) == sizeof(*next_tape_loc), "Type is not 64 bits!");
|
|
memcpy(next_tape_loc, &val2, sizeof(val2));
|
|
next_tape_loc++;
|
|
}
|
|
|
|
// Overwrites tape_loc with an entry whose top 8 bits hold the type tag and whose remaining
// bits hold val. The cursor is not touched.
simdjson_inline void tape_writer::write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept {
  const uint64_t tag_bits = uint64_t(char(t)) << 56;
  tape_loc = val | tag_bits;
}
|
|
|
|
} // namespace stage2
|
|
} // unnamed namespace
|
|
} // namespace ppc64
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE2_TAPE_WRITER_H
|
|
/* end file generic/stage2/tape_writer.h for ppc64 */
|
|
/* including generic/stage2/logger.h for ppc64: #include <generic/stage2/logger.h> */
|
|
/* begin file generic/stage2/logger.h for ppc64 */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE2_LOGGER_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE2_LOGGER_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/base.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
#include <cstring>
|
|
|
|
|
|
// This is for an internal-only stage 2 specific logger.
|
|
// Define SIMDJSON_VERBOSE_LOGGING to a nonzero value (which sets LOG_ENABLED = true) to log what stage 2 is doing!
|
|
namespace simdjson {
|
|
namespace ppc64 {
|
|
namespace {
|
|
namespace logger {
|
|
|
|
static constexpr const char * DASHES = "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------";
|
|
|
|
// Compile-time switch for the logger, driven by the SIMDJSON_VERBOSE_LOGGING macro.
#if SIMDJSON_VERBOSE_LOGGING
static constexpr bool LOG_ENABLED = true;
#else
static constexpr bool LOG_ENABLED = false;
#endif

// Column widths (in characters) of the log table.
static constexpr int LOG_EVENT_LEN = 20;
static constexpr int LOG_BUFFER_LEN = 30;
static constexpr int LOG_SMALL_BUFFER_LEN = 10;
static constexpr int LOG_INDEX_LEN = 5;

// Current nesting depth, used to indent the event column.
static int log_depth; // Not threadsafe. Log only.
|
|
|
|
// Helper to turn unprintable or newline characters into spaces
|
|
static simdjson_inline char printable_char(char c) {
|
|
if (c >= 0x20) {
|
|
return c;
|
|
} else {
|
|
return ' ';
|
|
}
|
|
}
|
|
|
|
// Print the header and set up log_start
|
|
static simdjson_inline void log_start() {
|
|
if (LOG_ENABLED) {
|
|
log_depth = 0;
|
|
printf("\n");
|
|
printf("| %-*s | %-*s | %-*s | %-*s | Detail |\n", LOG_EVENT_LEN, "Event", LOG_BUFFER_LEN, "Buffer", LOG_SMALL_BUFFER_LEN, "Next", 5, "Next#");
|
|
printf("|%.*s|%.*s|%.*s|%.*s|--------|\n", LOG_EVENT_LEN+2, DASHES, LOG_BUFFER_LEN+2, DASHES, LOG_SMALL_BUFFER_LEN+2, DASHES, 5+2, DASHES);
|
|
}
|
|
}
|
|
|
|
// Print a free-form message on its own line. Does nothing unless LOG_ENABLED.
simdjson_unused static simdjson_inline void log_string(const char *message) {
  if (LOG_ENABLED) { printf("%s\n", message); }
}
|
|
|
|
// Logs a single line from the stage 2 DOM parser
|
|
template<typename S>
|
|
static simdjson_inline void log_line(S &structurals, const char *title_prefix, const char *title, const char *detail) {
|
|
if (LOG_ENABLED) {
|
|
printf("| %*s%s%-*s ", log_depth*2, "", title_prefix, LOG_EVENT_LEN - log_depth*2 - int(strlen(title_prefix)), title);
|
|
auto current_index = structurals.at_beginning() ? nullptr : structurals.next_structural-1;
|
|
auto next_index = structurals.next_structural;
|
|
auto current = current_index ? &structurals.buf[*current_index] : reinterpret_cast<const uint8_t*>(" ");
|
|
auto next = &structurals.buf[*next_index];
|
|
{
|
|
// Print the next N characters in the buffer.
|
|
printf("| ");
|
|
// Otherwise, print the characters starting from the buffer position.
|
|
// Print spaces for unprintable or newline characters.
|
|
for (int i=0;i<LOG_BUFFER_LEN;i++) {
|
|
printf("%c", printable_char(current[i]));
|
|
}
|
|
printf(" ");
|
|
// Print the next N characters in the buffer.
|
|
printf("| ");
|
|
// Otherwise, print the characters starting from the buffer position.
|
|
// Print spaces for unprintable or newline characters.
|
|
for (int i=0;i<LOG_SMALL_BUFFER_LEN;i++) {
|
|
printf("%c", printable_char(next[i]));
|
|
}
|
|
printf(" ");
|
|
}
|
|
if (current_index) {
|
|
printf("| %*u ", LOG_INDEX_LEN, *current_index);
|
|
} else {
|
|
printf("| %-*s ", LOG_INDEX_LEN, "");
|
|
}
|
|
// printf("| %*u ", LOG_INDEX_LEN, structurals.next_tape_index());
|
|
printf("| %-s ", detail);
|
|
printf("|\n");
|
|
}
|
|
}
|
|
|
|
} // namespace logger
|
|
} // unnamed namespace
|
|
} // namespace ppc64
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE2_LOGGER_H
|
|
/* end file generic/stage2/logger.h for ppc64 */
|
|
|
|
// All other declarations
|
|
/* including generic/stage2/json_iterator.h for ppc64: #include <generic/stage2/json_iterator.h> */
|
|
/* begin file generic/stage2/json_iterator.h for ppc64 */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE2_JSON_ITERATOR_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE2_JSON_ITERATOR_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/logger.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/generic/dom_parser_implementation.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace ppc64 {
|
|
namespace {
|
|
namespace stage2 {
|
|
|
|
class json_iterator {
|
|
public:
|
|
const uint8_t* const buf;
|
|
uint32_t *next_structural;
|
|
dom_parser_implementation &dom_parser;
|
|
uint32_t depth{0};
|
|
|
|
/**
|
|
* Walk the JSON document.
|
|
*
|
|
* The visitor receives callbacks when values are encountered. All callbacks pass the iterator as
|
|
* the first parameter; some callbacks have other parameters as well:
|
|
*
|
|
* - visit_document_start() - at the beginning.
|
|
* - visit_document_end() - at the end (if things were successful).
|
|
*
|
|
* - visit_array_start() - at the start `[` of a non-empty array.
|
|
* - visit_array_end() - at the end `]` of a non-empty array.
|
|
* - visit_empty_array() - when an empty array is encountered.
|
|
*
|
|
* - visit_object_end() - at the start `]` of a non-empty object.
|
|
* - visit_object_start() - at the end `]` of a non-empty object.
|
|
* - visit_empty_object() - when an empty object is encountered.
|
|
* - visit_key(const uint8_t *key) - when a key in an object field is encountered. key is
|
|
* guaranteed to point at the first quote of the string (`"key"`).
|
|
* - visit_primitive(const uint8_t *value) - when a value is a string, number, boolean or null.
|
|
* - visit_root_primitive(iter, uint8_t *value) - when the top-level value is a string, number, boolean or null.
|
|
*
|
|
* - increment_count(iter) - each time a value is found in an array or object.
|
|
*/
|
|
template<bool STREAMING, typename V>
|
|
simdjson_warn_unused simdjson_inline error_code walk_document(V &visitor) noexcept;
|
|
|
|
/**
|
|
* Create an iterator capable of walking a JSON document.
|
|
*
|
|
* The document must have already passed through stage 1.
|
|
*/
|
|
simdjson_inline json_iterator(dom_parser_implementation &_dom_parser, size_t start_structural_index);
|
|
|
|
/**
|
|
* Look at the next token.
|
|
*
|
|
* Tokens can be strings, numbers, booleans, null, or operators (`[{]},:`)).
|
|
*
|
|
* They may include invalid JSON as well (such as `1.2.3` or `ture`).
|
|
*/
|
|
simdjson_inline const uint8_t *peek() const noexcept;
|
|
/**
|
|
* Advance to the next token.
|
|
*
|
|
* Tokens can be strings, numbers, booleans, null, or operators (`[{]},:`)).
|
|
*
|
|
* They may include invalid JSON as well (such as `1.2.3` or `ture`).
|
|
*/
|
|
simdjson_inline const uint8_t *advance() noexcept;
|
|
/**
|
|
* Get the remaining length of the document, from the start of the current token.
|
|
*/
|
|
simdjson_inline size_t remaining_len() const noexcept;
|
|
/**
|
|
* Check if we are at the end of the document.
|
|
*
|
|
* If this is true, there are no more tokens.
|
|
*/
|
|
simdjson_inline bool at_eof() const noexcept;
|
|
/**
|
|
* Check if we are at the beginning of the document.
|
|
*/
|
|
simdjson_inline bool at_beginning() const noexcept;
|
|
simdjson_inline uint8_t last_structural() const noexcept;
|
|
|
|
/**
|
|
* Log that a value has been found.
|
|
*
|
|
* Set LOG_ENABLED=true in logger.h to see logging.
|
|
*/
|
|
simdjson_inline void log_value(const char *type) const noexcept;
|
|
/**
|
|
* Log the start of a multipart value.
|
|
*
|
|
* Set LOG_ENABLED=true in logger.h to see logging.
|
|
*/
|
|
simdjson_inline void log_start_value(const char *type) const noexcept;
|
|
/**
|
|
* Log the end of a multipart value.
|
|
*
|
|
* Set LOG_ENABLED=true in logger.h to see logging.
|
|
*/
|
|
simdjson_inline void log_end_value(const char *type) const noexcept;
|
|
/**
|
|
* Log an error.
|
|
*
|
|
* Set LOG_ENABLED=true in logger.h to see logging.
|
|
*/
|
|
simdjson_inline void log_error(const char *error) const noexcept;
|
|
|
|
template<typename V>
|
|
simdjson_warn_unused simdjson_inline error_code visit_root_primitive(V &visitor, const uint8_t *value) noexcept;
|
|
template<typename V>
|
|
simdjson_warn_unused simdjson_inline error_code visit_primitive(V &visitor, const uint8_t *value) noexcept;
|
|
};
|
|
|
|
template<bool STREAMING, typename V>
|
|
simdjson_warn_unused simdjson_inline error_code json_iterator::walk_document(V &visitor) noexcept {
|
|
logger::log_start();
|
|
|
|
//
|
|
// Start the document
|
|
//
|
|
if (at_eof()) { return EMPTY; }
|
|
log_start_value("document");
|
|
SIMDJSON_TRY( visitor.visit_document_start(*this) );
|
|
|
|
//
|
|
// Read first value
|
|
//
|
|
{
|
|
auto value = advance();
|
|
|
|
// Make sure the outer object or array is closed before continuing; otherwise, there are ways we
|
|
// could get into memory corruption. See https://github.com/simdjson/simdjson/issues/906
|
|
if (!STREAMING) {
|
|
switch (*value) {
|
|
case '{': if (last_structural() != '}') { log_value("starting brace unmatched"); return TAPE_ERROR; }; break;
|
|
case '[': if (last_structural() != ']') { log_value("starting bracket unmatched"); return TAPE_ERROR; }; break;
|
|
}
|
|
}
|
|
|
|
switch (*value) {
|
|
case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin;
|
|
case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin;
|
|
default: SIMDJSON_TRY( visitor.visit_root_primitive(*this, value) ); break;
|
|
}
|
|
}
|
|
goto document_end;
|
|
|
|
//
|
|
// Object parser states
|
|
//
|
|
object_begin:
|
|
log_start_value("object");
|
|
depth++;
|
|
if (depth >= dom_parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; }
|
|
dom_parser.is_array[depth] = false;
|
|
SIMDJSON_TRY( visitor.visit_object_start(*this) );
|
|
|
|
{
|
|
auto key = advance();
|
|
if (*key != '"') { log_error("Object does not start with a key"); return TAPE_ERROR; }
|
|
SIMDJSON_TRY( visitor.increment_count(*this) );
|
|
SIMDJSON_TRY( visitor.visit_key(*this, key) );
|
|
}
|
|
|
|
object_field:
|
|
if (simdjson_unlikely( *advance() != ':' )) { log_error("Missing colon after key in object"); return TAPE_ERROR; }
|
|
{
|
|
auto value = advance();
|
|
switch (*value) {
|
|
case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin;
|
|
case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin;
|
|
default: SIMDJSON_TRY( visitor.visit_primitive(*this, value) ); break;
|
|
}
|
|
}
|
|
|
|
object_continue:
|
|
switch (*advance()) {
|
|
case ',':
|
|
SIMDJSON_TRY( visitor.increment_count(*this) );
|
|
{
|
|
auto key = advance();
|
|
if (simdjson_unlikely( *key != '"' )) { log_error("Key string missing at beginning of field in object"); return TAPE_ERROR; }
|
|
SIMDJSON_TRY( visitor.visit_key(*this, key) );
|
|
}
|
|
goto object_field;
|
|
case '}': log_end_value("object"); SIMDJSON_TRY( visitor.visit_object_end(*this) ); goto scope_end;
|
|
default: log_error("No comma between object fields"); return TAPE_ERROR;
|
|
}
|
|
|
|
scope_end:
|
|
depth--;
|
|
if (depth == 0) { goto document_end; }
|
|
if (dom_parser.is_array[depth]) { goto array_continue; }
|
|
goto object_continue;
|
|
|
|
//
|
|
// Array parser states
|
|
//
|
|
array_begin:
|
|
log_start_value("array");
|
|
depth++;
|
|
if (depth >= dom_parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; }
|
|
dom_parser.is_array[depth] = true;
|
|
SIMDJSON_TRY( visitor.visit_array_start(*this) );
|
|
SIMDJSON_TRY( visitor.increment_count(*this) );
|
|
|
|
array_value:
|
|
{
|
|
auto value = advance();
|
|
switch (*value) {
|
|
case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin;
|
|
case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin;
|
|
default: SIMDJSON_TRY( visitor.visit_primitive(*this, value) ); break;
|
|
}
|
|
}
|
|
|
|
array_continue:
|
|
switch (*advance()) {
|
|
case ',': SIMDJSON_TRY( visitor.increment_count(*this) ); goto array_value;
|
|
case ']': log_end_value("array"); SIMDJSON_TRY( visitor.visit_array_end(*this) ); goto scope_end;
|
|
default: log_error("Missing comma between array values"); return TAPE_ERROR;
|
|
}
|
|
|
|
document_end:
|
|
log_end_value("document");
|
|
SIMDJSON_TRY( visitor.visit_document_end(*this) );
|
|
|
|
dom_parser.next_structural_index = uint32_t(next_structural - &dom_parser.structural_indexes[0]);
|
|
|
|
// If we didn't make it to the end, it's an error
|
|
if ( !STREAMING && dom_parser.next_structural_index != dom_parser.n_structural_indexes ) {
|
|
log_error("More than one JSON value at the root of the document, or extra characters at the end of the JSON!");
|
|
return TAPE_ERROR;
|
|
}
|
|
|
|
return SUCCESS;
|
|
|
|
} // walk_document()
|
|
|
|
simdjson_inline json_iterator::json_iterator(dom_parser_implementation &_dom_parser, size_t start_structural_index)
|
|
: buf{_dom_parser.buf},
|
|
next_structural{&_dom_parser.structural_indexes[start_structural_index]},
|
|
dom_parser{_dom_parser} {
|
|
}
|
|
|
|
/** Address in `buf` of the next structural, without consuming it. */
simdjson_inline const uint8_t *json_iterator::peek() const noexcept {
  const auto offset = *next_structural;
  return &buf[offset];
}
|
|
/** Address in `buf` of the next structural; also moves the cursor one entry forward. */
simdjson_inline const uint8_t *json_iterator::advance() noexcept {
  const auto offset = *next_structural;
  ++next_structural;
  return &buf[offset];
}
|
|
/**
 * Number of bytes from the most recently consumed structural (the entry just
 * before `next_structural`) to `dom_parser.len`.
 */
simdjson_inline size_t json_iterator::remaining_len() const noexcept {
  const auto current_offset = next_structural[-1];
  return dom_parser.len - current_offset;
}
|
|
|
|
/** True when the cursor sits one past the last of the `n_structural_indexes` entries. */
simdjson_inline bool json_iterator::at_eof() const noexcept {
  const auto *const past_last = &dom_parser.structural_indexes[dom_parser.n_structural_indexes];
  return next_structural == past_last;
}
|
|
/** True when the cursor sits on the very first entry of `structural_indexes`. */
simdjson_inline bool json_iterator::at_beginning() const noexcept {
  const auto *const first = dom_parser.structural_indexes.get();
  return next_structural == first;
}
|
|
/** Byte in `buf` addressed by the final entry of `structural_indexes`. */
simdjson_inline uint8_t json_iterator::last_structural() const noexcept {
  const auto last_offset = dom_parser.structural_indexes[dom_parser.n_structural_indexes - 1];
  return buf[last_offset];
}
|
|
|
|
/** Log a value of the given `type` with no prefix and no detail text. */
simdjson_inline void json_iterator::log_value(const char *type) const noexcept {
  const char *const no_prefix = "";
  const char *const no_detail = "";
  logger::log_line(*this, no_prefix, type, no_detail);
}
|
|
|
|
/**
 * Log the opening of a value of the given `type` (prefix "+"), then bump the
 * logger's depth when logging is enabled.
 */
simdjson_inline void json_iterator::log_start_value(const char *type) const noexcept {
  logger::log_line(*this, "+", type, "");
  if (!logger::LOG_ENABLED) {
    return;
  }
  ++logger::log_depth;
}
|
|
|
|
/**
 * Lower the logger's depth when logging is enabled, then log the closing of a
 * value of the given `type` (prefix "-").
 */
simdjson_inline void json_iterator::log_end_value(const char *type) const noexcept {
  if (logger::LOG_ENABLED) {
    --logger::log_depth;
  }
  logger::log_line(*this, "-", type, "");
}
|
|
|
|
/** Log an "ERROR" line carrying `error` as its detail text. */
simdjson_inline void json_iterator::log_error(const char *error) const noexcept {
  const char *const no_prefix = "";
  const char *const title = "ERROR";
  logger::log_line(*this, no_prefix, title, error);
}
|
|
|
|
template<typename V>
|
|
simdjson_warn_unused simdjson_inline error_code json_iterator::visit_root_primitive(V &visitor, const uint8_t *value) noexcept {
|
|
switch (*value) {
|
|
case '"': return visitor.visit_root_string(*this, value);
|
|
case 't': return visitor.visit_root_true_atom(*this, value);
|
|
case 'f': return visitor.visit_root_false_atom(*this, value);
|
|
case 'n': return visitor.visit_root_null_atom(*this, value);
|
|
case '-':
|
|
case '0': case '1': case '2': case '3': case '4':
|
|
case '5': case '6': case '7': case '8': case '9':
|
|
return visitor.visit_root_number(*this, value);
|
|
default:
|
|
log_error("Document starts with a non-value character");
|
|
return TAPE_ERROR;
|
|
}
|
|
}
|
|
template<typename V>
|
|
simdjson_warn_unused simdjson_inline error_code json_iterator::visit_primitive(V &visitor, const uint8_t *value) noexcept {
|
|
// Use the fact that most scalars are going to be either strings or numbers.
|
|
if(*value == '"') {
|
|
return visitor.visit_string(*this, value);
|
|
} else if (((*value - '0') < 10) || (*value == '-')) {
|
|
return visitor.visit_number(*this, value);
|
|
}
|
|
// true, false, null are uncommon.
|
|
switch (*value) {
|
|
case 't': return visitor.visit_true_atom(*this, value);
|
|
case 'f': return visitor.visit_false_atom(*this, value);
|
|
case 'n': return visitor.visit_null_atom(*this, value);
|
|
default:
|
|
log_error("Non-value found when value was expected!");
|
|
return TAPE_ERROR;
|
|
}
|
|
}
|
|
|
|
} // namespace stage2
|
|
} // unnamed namespace
|
|
} // namespace ppc64
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE2_JSON_ITERATOR_H
|
|
/* end file generic/stage2/json_iterator.h for ppc64 */
|
|
/* including generic/stage2/stringparsing.h for ppc64: #include <generic/stage2/stringparsing.h> */
|
|
/* begin file generic/stage2/stringparsing.h for ppc64 */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE2_STRINGPARSING_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE2_STRINGPARSING_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/generic/jsoncharutils.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
// This file contains the common code every implementation uses
|
|
// It is intended to be included multiple times and compiled multiple times
|
|
|
|
namespace simdjson {
|
|
namespace ppc64 {
|
|
namespace {
|
|
/// @private
|
|
namespace stringparsing {
|
|
|
|
// begin copypasta
|
|
// Lookup table indexed by the byte that follows a backslash.
//   "  \  /        -> the same character
//   b f n r t      -> backspace, form feed, newline, carriage return, horizontal tab
// Every other entry is 0. 'u' is also 0 here: \uXXXX escapes are handled
// separately because they are more complex.
static const uint8_t escape_map[256] = {
  // 0x0.
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  // 0x1.
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  // 0x2.  ('"' at 0x22, '/' at 0x2f)
  0, 0, '"', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, '/',
  // 0x3.
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

  // 0x4.
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  // 0x5.  ('\' at 0x5c)
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, '\\', 0, 0, 0,
  // 0x6.  ('b' at 0x62, 'f' at 0x66, 'n' at 0x6e)
  0, 0, '\b', 0, 0, 0, '\f', 0, 0, 0, 0, 0, 0, 0, '\n', 0,
  // 0x7.  ('r' at 0x72, 't' at 0x74)
  0, 0, '\r', 0, '\t', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

  // 0x8. .. 0xb.
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

  // 0xc. .. 0xf.
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
|
|
|
|
// handle a unicode codepoint
|
|
// write appropriate values into dest
|
|
// src will advance 6 bytes or 12 bytes
|
|
// dest will advance a variable amount (return via pointer)
|
|
// return true if the unicode codepoint was valid
|
|
// We work in little-endian then swap at write time
|
|
simdjson_warn_unused
|
|
simdjson_inline bool handle_unicode_codepoint(const uint8_t **src_ptr,
|
|
uint8_t **dst_ptr, bool allow_replacement) {
|
|
// Use the default Unicode Character 'REPLACEMENT CHARACTER' (U+FFFD)
|
|
constexpr uint32_t substitution_code_point = 0xfffd;
|
|
// jsoncharutils::hex_to_u32_nocheck fills high 16 bits of the return value with 1s if the
|
|
// conversion is not valid; we defer the check for this to inside the
|
|
// multilingual plane check.
|
|
uint32_t code_point = jsoncharutils::hex_to_u32_nocheck(*src_ptr + 2);
|
|
*src_ptr += 6;
|
|
|
|
// If we found a high surrogate, we must
|
|
// check for low surrogate for characters
|
|
// outside the Basic
|
|
// Multilingual Plane.
|
|
if (code_point >= 0xd800 && code_point < 0xdc00) {
|
|
const uint8_t *src_data = *src_ptr;
|
|
/* Compiler optimizations convert this to a single 16-bit load and compare on most platforms */
|
|
if (((src_data[0] << 8) | src_data[1]) != ((static_cast<uint8_t> ('\\') << 8) | static_cast<uint8_t> ('u'))) {
|
|
if(!allow_replacement) { return false; }
|
|
code_point = substitution_code_point;
|
|
} else {
|
|
uint32_t code_point_2 = jsoncharutils::hex_to_u32_nocheck(src_data + 2);
|
|
|
|
// We have already checked that the high surrogate is valid and
|
|
// (code_point - 0xd800) < 1024.
|
|
//
|
|
// Check that code_point_2 is in the range 0xdc00..0xdfff
|
|
// and that code_point_2 was parsed from valid hex.
|
|
uint32_t low_bit = code_point_2 - 0xdc00;
|
|
if (low_bit >> 10) {
|
|
if(!allow_replacement) { return false; }
|
|
code_point = substitution_code_point;
|
|
} else {
|
|
code_point = (((code_point - 0xd800) << 10) | low_bit) + 0x10000;
|
|
*src_ptr += 6;
|
|
}
|
|
|
|
}
|
|
} else if (code_point >= 0xdc00 && code_point <= 0xdfff) {
|
|
// If we encounter a low surrogate (not preceded by a high surrogate)
|
|
// then we have an error.
|
|
if(!allow_replacement) { return false; }
|
|
code_point = substitution_code_point;
|
|
}
|
|
size_t offset = jsoncharutils::codepoint_to_utf8(code_point, *dst_ptr);
|
|
*dst_ptr += offset;
|
|
return offset > 0;
|
|
}
|
|
|
|
|
|
// handle a unicode codepoint using the wobbly convention
|
|
// https://simonsapin.github.io/wtf-8/
|
|
// write appropriate values into dest
|
|
// src will advance 6 bytes or 12 bytes
|
|
// dest will advance a variable amount (return via pointer)
|
|
// return true if the unicode codepoint was valid
|
|
// We work in little-endian then swap at write time
|
|
simdjson_warn_unused
|
|
simdjson_inline bool handle_unicode_codepoint_wobbly(const uint8_t **src_ptr,
|
|
uint8_t **dst_ptr) {
|
|
// It is not ideal that this function is nearly identical to handle_unicode_codepoint.
|
|
//
|
|
// jsoncharutils::hex_to_u32_nocheck fills high 16 bits of the return value with 1s if the
|
|
// conversion is not valid; we defer the check for this to inside the
|
|
// multilingual plane check.
|
|
uint32_t code_point = jsoncharutils::hex_to_u32_nocheck(*src_ptr + 2);
|
|
*src_ptr += 6;
|
|
// If we found a high surrogate, we must
|
|
// check for low surrogate for characters
|
|
// outside the Basic
|
|
// Multilingual Plane.
|
|
if (code_point >= 0xd800 && code_point < 0xdc00) {
|
|
const uint8_t *src_data = *src_ptr;
|
|
/* Compiler optimizations convert this to a single 16-bit load and compare on most platforms */
|
|
if (((src_data[0] << 8) | src_data[1]) == ((static_cast<uint8_t> ('\\') << 8) | static_cast<uint8_t> ('u'))) {
|
|
uint32_t code_point_2 = jsoncharutils::hex_to_u32_nocheck(src_data + 2);
|
|
uint32_t low_bit = code_point_2 - 0xdc00;
|
|
if ((low_bit >> 10) == 0) {
|
|
code_point =
|
|
(((code_point - 0xd800) << 10) | low_bit) + 0x10000;
|
|
*src_ptr += 6;
|
|
}
|
|
}
|
|
}
|
|
|
|
size_t offset = jsoncharutils::codepoint_to_utf8(code_point, *dst_ptr);
|
|
*dst_ptr += offset;
|
|
return offset > 0;
|
|
}
|
|
|
|
|
|
/**
|
|
* Unescape a valid UTF-8 string from src to dst, stopping at a final unescaped quote. There
|
|
* must be an unescaped quote terminating the string. It returns the final output
|
|
* position as pointer. In case of error (e.g., the string has bad escaped codes),
|
|
* then null_nullptrptr is returned. It is assumed that the output buffer is large
|
|
* enough. E.g., if src points at 'joe"', then dst needs to have four free bytes +
|
|
* SIMDJSON_PADDING bytes.
|
|
*/
|
|
simdjson_warn_unused simdjson_inline uint8_t *parse_string(const uint8_t *src, uint8_t *dst, bool allow_replacement) {
|
|
while (1) {
|
|
// Copy the next n bytes, and find the backslash and quote in them.
|
|
auto bs_quote = backslash_and_quote::copy_and_find(src, dst);
|
|
// If the next thing is the end quote, copy and return
|
|
if (bs_quote.has_quote_first()) {
|
|
// we encountered quotes first. Move dst to point to quotes and exit
|
|
return dst + bs_quote.quote_index();
|
|
}
|
|
if (bs_quote.has_backslash()) {
|
|
/* find out where the backspace is */
|
|
auto bs_dist = bs_quote.backslash_index();
|
|
uint8_t escape_char = src[bs_dist + 1];
|
|
/* we encountered backslash first. Handle backslash */
|
|
if (escape_char == 'u') {
|
|
/* move src/dst up to the start; they will be further adjusted
|
|
within the unicode codepoint handling code. */
|
|
src += bs_dist;
|
|
dst += bs_dist;
|
|
if (!handle_unicode_codepoint(&src, &dst, allow_replacement)) {
|
|
return nullptr;
|
|
}
|
|
} else {
|
|
/* simple 1:1 conversion. Will eat bs_dist+2 characters in input and
|
|
* write bs_dist+1 characters to output
|
|
* note this may reach beyond the part of the buffer we've actually
|
|
* seen. I think this is ok */
|
|
uint8_t escape_result = escape_map[escape_char];
|
|
if (escape_result == 0u) {
|
|
return nullptr; /* bogus escape value is an error */
|
|
}
|
|
dst[bs_dist] = escape_result;
|
|
src += bs_dist + 2;
|
|
dst += bs_dist + 1;
|
|
}
|
|
} else {
|
|
/* they are the same. Since they can't co-occur, it means we
|
|
* encountered neither. */
|
|
src += backslash_and_quote::BYTES_PROCESSED;
|
|
dst += backslash_and_quote::BYTES_PROCESSED;
|
|
}
|
|
}
|
|
}
|
|
|
|
/**
 * Same loop as parse_string, except that \u escapes go through
 * handle_unicode_codepoint_wobbly. Returns the final output position, or
 * nullptr on a bad escape.
 */
simdjson_warn_unused simdjson_inline uint8_t *parse_wobbly_string(const uint8_t *src, uint8_t *dst) {
  for (;;) {
    // Copy the next chunk and locate the first backslash and quote in it.
    auto found = backslash_and_quote::copy_and_find(src, dst);

    if (found.has_quote_first()) {
      // The closing quote comes first: point at where it landed and stop.
      return dst + found.quote_index();
    }

    if (!found.has_backslash()) {
      // Neither a quote nor a backslash in this chunk: move on to the next one.
      src += backslash_and_quote::BYTES_PROCESSED;
      dst += backslash_and_quote::BYTES_PROCESSED;
      continue;
    }

    // A backslash comes first. Find out where it is and what follows it.
    auto bs_at = found.backslash_index();
    const uint8_t escaped = src[bs_at + 1];

    if (escaped == 'u') {
      // Move src/dst to the backslash; the codepoint handler adjusts them further.
      src += bs_at;
      dst += bs_at;
      if (!handle_unicode_codepoint_wobbly(&src, &dst)) {
        return nullptr;
      }
      continue;
    }

    // Simple 1:1 conversion: consumes bs_at+2 input bytes and produces
    // bs_at+1 output bytes. Per the original note, this may reach beyond the
    // part of the buffer seen so far, which was judged acceptable.
    const uint8_t unescaped = escape_map[escaped];
    if (unescaped == 0u) {
      return nullptr; // bogus escape value is an error
    }
    dst[bs_at] = unescaped;
    src += bs_at + 2;
    dst += bs_at + 1;
  }
}
|
|
|
|
} // namespace stringparsing
|
|
} // unnamed namespace
|
|
} // namespace ppc64
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE2_STRINGPARSING_H
|
|
/* end file generic/stage2/stringparsing.h for ppc64 */
|
|
/* including generic/stage2/structural_iterator.h for ppc64: #include <generic/stage2/structural_iterator.h> */
|
|
/* begin file generic/stage2/structural_iterator.h for ppc64 */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE2_STRUCTURAL_ITERATOR_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE2_STRUCTURAL_ITERATOR_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/generic/dom_parser_implementation.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace ppc64 {
|
|
namespace {
|
|
namespace stage2 {
|
|
|
|
class structural_iterator {
|
|
public:
|
|
const uint8_t* const buf;
|
|
uint32_t *next_structural;
|
|
dom_parser_implementation &dom_parser;
|
|
|
|
// Start a structural
|
|
simdjson_inline structural_iterator(dom_parser_implementation &_dom_parser, size_t start_structural_index)
|
|
: buf{_dom_parser.buf},
|
|
next_structural{&_dom_parser.structural_indexes[start_structural_index]},
|
|
dom_parser{_dom_parser} {
|
|
}
|
|
// Get the buffer position of the current structural character
|
|
simdjson_inline const uint8_t* current() {
|
|
return &buf[*(next_structural-1)];
|
|
}
|
|
// Get the current structural character
|
|
simdjson_inline char current_char() {
|
|
return buf[*(next_structural-1)];
|
|
}
|
|
// Get the next structural character without advancing
|
|
simdjson_inline char peek_next_char() {
|
|
return buf[*next_structural];
|
|
}
|
|
simdjson_inline const uint8_t* peek() {
|
|
return &buf[*next_structural];
|
|
}
|
|
simdjson_inline const uint8_t* advance() {
|
|
return &buf[*(next_structural++)];
|
|
}
|
|
simdjson_inline char advance_char() {
|
|
return buf[*(next_structural++)];
|
|
}
|
|
simdjson_inline size_t remaining_len() {
|
|
return dom_parser.len - *(next_structural-1);
|
|
}
|
|
|
|
simdjson_inline bool at_end() {
|
|
return next_structural == &dom_parser.structural_indexes[dom_parser.n_structural_indexes];
|
|
}
|
|
simdjson_inline bool at_beginning() {
|
|
return next_structural == dom_parser.structural_indexes.get();
|
|
}
|
|
};
|
|
|
|
} // namespace stage2
|
|
} // unnamed namespace
|
|
} // namespace ppc64
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE2_STRUCTURAL_ITERATOR_H
|
|
/* end file generic/stage2/structural_iterator.h for ppc64 */
|
|
/* including generic/stage2/tape_builder.h for ppc64: #include <generic/stage2/tape_builder.h> */
|
|
/* begin file generic/stage2/tape_builder.h for ppc64 */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE2_TAPE_BUILDER_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE2_TAPE_BUILDER_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/json_iterator.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/stringparsing.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/tape_writer.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/dom/document.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/generic/atomparsing.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/generic/dom_parser_implementation.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/generic/numberparsing.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
|
|
namespace simdjson {
|
|
namespace ppc64 {
|
|
namespace {
|
|
namespace stage2 {
|
|
|
|
struct tape_builder {
|
|
template<bool STREAMING>
|
|
simdjson_warn_unused static simdjson_inline error_code parse_document(
|
|
dom_parser_implementation &dom_parser,
|
|
dom::document &doc) noexcept;
|
|
|
|
/** Called when a non-empty document starts. */
|
|
simdjson_warn_unused simdjson_inline error_code visit_document_start(json_iterator &iter) noexcept;
|
|
/** Called when a non-empty document ends without error. */
|
|
simdjson_warn_unused simdjson_inline error_code visit_document_end(json_iterator &iter) noexcept;
|
|
|
|
/** Called when a non-empty array starts. */
|
|
simdjson_warn_unused simdjson_inline error_code visit_array_start(json_iterator &iter) noexcept;
|
|
/** Called when a non-empty array ends. */
|
|
simdjson_warn_unused simdjson_inline error_code visit_array_end(json_iterator &iter) noexcept;
|
|
/** Called when an empty array is found. */
|
|
simdjson_warn_unused simdjson_inline error_code visit_empty_array(json_iterator &iter) noexcept;
|
|
|
|
/** Called when a non-empty object starts. */
|
|
simdjson_warn_unused simdjson_inline error_code visit_object_start(json_iterator &iter) noexcept;
|
|
/**
|
|
* Called when a key in a field is encountered.
|
|
*
|
|
* primitive, visit_object_start, visit_empty_object, visit_array_start, or visit_empty_array
|
|
* will be called after this with the field value.
|
|
*/
|
|
simdjson_warn_unused simdjson_inline error_code visit_key(json_iterator &iter, const uint8_t *key) noexcept;
|
|
/** Called when a non-empty object ends. */
|
|
simdjson_warn_unused simdjson_inline error_code visit_object_end(json_iterator &iter) noexcept;
|
|
/** Called when an empty object is found. */
|
|
simdjson_warn_unused simdjson_inline error_code visit_empty_object(json_iterator &iter) noexcept;
|
|
|
|
/**
|
|
* Called when a string, number, boolean or null is found.
|
|
*/
|
|
simdjson_warn_unused simdjson_inline error_code visit_primitive(json_iterator &iter, const uint8_t *value) noexcept;
|
|
/**
|
|
* Called when a string, number, boolean or null is found at the top level of a document (i.e.
|
|
* when there is no array or object and the entire document is a single string, number, boolean or
|
|
* null.
|
|
*
|
|
* This is separate from primitive() because simdjson's normal primitive parsing routines assume
|
|
* there is at least one more token after the value, which is only true in an array or object.
|
|
*/
|
|
simdjson_warn_unused simdjson_inline error_code visit_root_primitive(json_iterator &iter, const uint8_t *value) noexcept;
|
|
|
|
simdjson_warn_unused simdjson_inline error_code visit_string(json_iterator &iter, const uint8_t *value, bool key = false) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code visit_number(json_iterator &iter, const uint8_t *value) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code visit_true_atom(json_iterator &iter, const uint8_t *value) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code visit_false_atom(json_iterator &iter, const uint8_t *value) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code visit_null_atom(json_iterator &iter, const uint8_t *value) noexcept;
|
|
|
|
simdjson_warn_unused simdjson_inline error_code visit_root_string(json_iterator &iter, const uint8_t *value) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code visit_root_number(json_iterator &iter, const uint8_t *value) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code visit_root_true_atom(json_iterator &iter, const uint8_t *value) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code visit_root_false_atom(json_iterator &iter, const uint8_t *value) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code visit_root_null_atom(json_iterator &iter, const uint8_t *value) noexcept;
|
|
|
|
/** Called each time a new field or element in an array or object is found. */
|
|
simdjson_warn_unused simdjson_inline error_code increment_count(json_iterator &iter) noexcept;
|
|
|
|
/** Next location to write to tape */
|
|
tape_writer tape;
|
|
private:
|
|
/** Next write location in the string buf for stage 2 parsing */
|
|
uint8_t *current_string_buf_loc;
|
|
|
|
simdjson_inline tape_builder(dom::document &doc) noexcept;
|
|
|
|
simdjson_inline uint32_t next_tape_index(json_iterator &iter) const noexcept;
|
|
simdjson_inline void start_container(json_iterator &iter) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code end_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code empty_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept;
|
|
simdjson_inline uint8_t *on_start_string(json_iterator &iter) noexcept;
|
|
simdjson_inline void on_end_string(uint8_t *dst) noexcept;
|
|
}; // struct tape_builder
|
|
|
|
template<bool STREAMING>
|
|
simdjson_warn_unused simdjson_inline error_code tape_builder::parse_document(
|
|
dom_parser_implementation &dom_parser,
|
|
dom::document &doc) noexcept {
|
|
dom_parser.doc = &doc;
|
|
json_iterator iter(dom_parser, STREAMING ? dom_parser.next_structural_index : 0);
|
|
tape_builder builder(doc);
|
|
return iter.walk_document<STREAMING>(builder);
|
|
}
|
|
|
|
/** Let the iterator classify a root-level scalar and call back into this builder. */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_primitive(json_iterator &iter, const uint8_t *value) noexcept {
  tape_builder &self = *this;
  return iter.visit_root_primitive(self, value);
}
|
|
/** Let the iterator classify a nested scalar and call back into this builder. */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_primitive(json_iterator &iter, const uint8_t *value) noexcept {
  tape_builder &self = *this;
  return iter.visit_primitive(self, value);
}
|
|
/** Write the START_OBJECT / END_OBJECT pair for an empty object. */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_empty_object(json_iterator &iter) noexcept {
  const auto opening = internal::tape_type::START_OBJECT;
  const auto closing = internal::tape_type::END_OBJECT;
  return empty_container(iter, opening, closing);
}
|
|
/** Write the START_ARRAY / END_ARRAY pair for an empty array. */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_empty_array(json_iterator &iter) noexcept {
  const auto opening = internal::tape_type::START_ARRAY;
  const auto closing = internal::tape_type::END_ARRAY;
  return empty_container(iter, opening, closing);
}
|
|
|
|
/** Open the document as a container; always succeeds. */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_document_start(json_iterator &iter) noexcept {
  start_container(iter);
  return SUCCESS;
}
|
|
/** Open a non-empty object as a container; always succeeds. */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_object_start(json_iterator &iter) noexcept {
  start_container(iter);
  return SUCCESS;
}
|
|
/** Open a non-empty array as a container; always succeeds. */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_array_start(json_iterator &iter) noexcept {
  start_container(iter);
  return SUCCESS;
}
|
|
|
|
/** Close a non-empty object: delegates to end_container with the object tape types. */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_object_end(json_iterator &iter) noexcept {
  const auto opening = internal::tape_type::START_OBJECT;
  const auto closing = internal::tape_type::END_OBJECT;
  return end_container(iter, opening, closing);
}
|
|
/** Close a non-empty array: delegates to end_container with the array tape types. */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_array_end(json_iterator &iter) noexcept {
  const auto opening = internal::tape_type::START_ARRAY;
  const auto closing = internal::tape_type::END_ARRAY;
  return end_container(iter, opening, closing);
}
|
|
/**
 * Finish the document: append a closing ROOT element that refers to tape
 * index 0, then fill in tape slot 0 with a ROOT element that refers to the
 * index just past that closing element.
 */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_document_end(json_iterator &iter) noexcept {
  constexpr uint32_t root_index = 0;

  tape.append(root_index, internal::tape_type::ROOT);

  // Must be read after the append above so that it includes the closing element.
  const uint32_t end_index = next_tape_index(iter);
  tape_writer::write(iter.dom_parser.doc->tape[root_index], end_index, internal::tape_type::ROOT);

  return SUCCESS;
}
|
|
/** A key is recorded as a string, with the `key` flag set. */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_key(json_iterator &iter, const uint8_t *key) noexcept {
  const bool is_key = true;
  return visit_string(iter, key, is_key);
}
|
|
|
|
/** Add one to the element/field count of the open container at `iter.depth`. */
simdjson_warn_unused simdjson_inline error_code tape_builder::increment_count(json_iterator &iter) noexcept {
  auto &container = iter.dom_parser.open_containers[iter.depth];
  container.count++;
  return SUCCESS;
}
|
|
|
|
/** Start writing at the beginning of `doc`'s tape and string buffer. */
simdjson_inline tape_builder::tape_builder(dom::document &doc) noexcept
  : tape{doc.tape.get()},
    current_string_buf_loc{doc.string_buf.get()}
{
}
|
|
|
|
/**
 * Unescape the string that starts at `value` (which points at the opening
 * quote) into the string buffer and record it on the tape.
 *
 * @param key  only affects the log label ("key" vs "string")
 * @return SUCCESS, or STRING_ERROR if parse_string reports a failure
 */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_string(json_iterator &iter, const uint8_t *value, bool key) noexcept {
  const char *const label = key ? "key" : "string";
  iter.log_value(label);

  uint8_t *const out_begin = on_start_string(iter);
  // Replacement is not allowed here: invalid escapes are reported as errors.
  uint8_t *const out_end = stringparsing::parse_string(value + 1, out_begin, false);
  if (out_end == nullptr) {
    iter.log_error("Invalid escape in string");
    return STRING_ERROR;
  }

  on_end_string(out_end);
  return SUCCESS;
}
|
|
|
|
/** A root-level string is handled exactly like a nested, non-key string. */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_string(json_iterator &iter, const uint8_t *value) noexcept {
  const error_code result = visit_string(iter, value);
  return result;
}
|
|
|
|
/** Log the value, then hand the number at `value` to numberparsing::parse_number with this builder's tape. */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_number(json_iterator &iter, const uint8_t *value) noexcept {
  iter.log_value("number");
  const error_code result = numberparsing::parse_number(value, tape);
  return result;
}
|
|
|
|
/**
 * Parse a number that is the whole document.
 *
 * The remaining input is copied into a temporary buffer followed by
 * SIMDJSON_PADDING spaces, and the copy is parsed instead of the original.
 *
 * Why (from the original note): the input is already padded up to
 * len + SIMDJSON_PADDING, but there is no control here over what it was padded
 * with. An input string often carries an extra null character (C string).
 * 9\0 must not be accepted inside a JSON document, yet the string "9\0" by
 * itself is fine, so a single root element gets space padding. The copy is
 * relatively expensive, but should rarely matter unless many documents consist
 * of a single atom.
 *
 * @return MEMALLOC if the temporary buffer cannot be allocated, otherwise the
 *         result of visit_number on the copy.
 */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_number(json_iterator &iter, const uint8_t *value) noexcept {
  const size_t remaining = iter.remaining_len();

  std::unique_ptr<uint8_t[]> padded(new (std::nothrow) uint8_t[remaining + SIMDJSON_PADDING]);
  if (padded.get() == nullptr) {
    return MEMALLOC;
  }

  uint8_t *const digits = padded.get();
  std::memcpy(digits, value, remaining);
  std::memset(digits + remaining, ' ', SIMDJSON_PADDING);

  return visit_number(iter, digits);
}
|
|
|
|
/** Validate a nested `true` atom and append TRUE_VALUE to the tape; T_ATOM_ERROR if invalid. */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_true_atom(json_iterator &iter, const uint8_t *value) noexcept {
  iter.log_value("true");
  const bool valid = atomparsing::is_valid_true_atom(value);
  if (valid) {
    tape.append(0, internal::tape_type::TRUE_VALUE);
    return SUCCESS;
  }
  return T_ATOM_ERROR;
}
|
|
|
|
/**
 * Validate a root-level `true` atom, passing the remaining input length to the
 * validator, and append TRUE_VALUE to the tape; T_ATOM_ERROR if invalid.
 */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_true_atom(json_iterator &iter, const uint8_t *value) noexcept {
  iter.log_value("true");
  const bool valid = atomparsing::is_valid_true_atom(value, iter.remaining_len());
  if (valid) {
    tape.append(0, internal::tape_type::TRUE_VALUE);
    return SUCCESS;
  }
  return T_ATOM_ERROR;
}
|
|
|
|
/** Validate a nested `false` atom and append FALSE_VALUE to the tape; F_ATOM_ERROR if invalid. */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_false_atom(json_iterator &iter, const uint8_t *value) noexcept {
  iter.log_value("false");
  const bool valid = atomparsing::is_valid_false_atom(value);
  if (valid) {
    tape.append(0, internal::tape_type::FALSE_VALUE);
    return SUCCESS;
  }
  return F_ATOM_ERROR;
}
|
|
|
|
/**
 * Validate a root-level `false` atom, passing the remaining input length to
 * the validator, and append FALSE_VALUE to the tape; F_ATOM_ERROR if invalid.
 */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_false_atom(json_iterator &iter, const uint8_t *value) noexcept {
  iter.log_value("false");
  const bool valid = atomparsing::is_valid_false_atom(value, iter.remaining_len());
  if (valid) {
    tape.append(0, internal::tape_type::FALSE_VALUE);
    return SUCCESS;
  }
  return F_ATOM_ERROR;
}
|
|
|
|
/** Validate a nested `null` atom and append NULL_VALUE to the tape; N_ATOM_ERROR if invalid. */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_null_atom(json_iterator &iter, const uint8_t *value) noexcept {
  iter.log_value("null");
  const bool valid = atomparsing::is_valid_null_atom(value);
  if (valid) {
    tape.append(0, internal::tape_type::NULL_VALUE);
    return SUCCESS;
  }
  return N_ATOM_ERROR;
}
|
|
|
|
/**
 * Validate a root-level `null` atom, passing the remaining input length to the
 * validator, and append NULL_VALUE to the tape; N_ATOM_ERROR if invalid.
 */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_null_atom(json_iterator &iter, const uint8_t *value) noexcept {
  iter.log_value("null");
  const bool valid = atomparsing::is_valid_null_atom(value, iter.remaining_len());
  if (valid) {
    tape.append(0, internal::tape_type::NULL_VALUE);
    return SUCCESS;
  }
  return N_ATOM_ERROR;
}
|
|
|
|
// private:
|
|
|
|
/** Index of the next tape slot to be written, measured from the start of the document's tape. */
simdjson_inline uint32_t tape_builder::next_tape_index(json_iterator &iter) const noexcept {
  auto *const tape_begin = iter.dom_parser.doc->tape.get();
  const auto distance = tape.next_tape_loc - tape_begin;
  return uint32_t(distance);
}
|
|
|
|
/**
 * Write an empty container as two consecutive tape elements: a `start`
 * element whose payload is the index just past the pair, and an `end`
 * element whose payload is the index of the `start` element.
 */
simdjson_warn_unused simdjson_inline error_code tape_builder::empty_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept {
  const uint32_t first = next_tape_index(iter);
  const uint32_t past_pair = first + 2;

  tape.append(past_pair, start);
  tape.append(first, end);
  return SUCCESS;
}
|
|
|
|
/**
 * Record where the container at `iter.depth` begins on the tape, reset its
 * element count, and leave one tape slot free for the start element.
 */
simdjson_inline void tape_builder::start_container(json_iterator &iter) noexcept {
  auto &container = iter.dom_parser.open_containers[iter.depth];
  container.tape_index = next_tape_index(iter);
  container.count = 0;
  // The start element itself is only written once the container ends.
  tape.skip();
}
|
|
|
|
/**
 * Close the container at `iter.depth`.
 *
 * Appends the `end` element (pointing back at the start slot), then fills in
 * the start slot with the index just past the end element combined with the
 * element count in the bits above bit 32.
 */
simdjson_warn_unused simdjson_inline error_code tape_builder::end_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept {
  const auto &container = iter.dom_parser.open_containers[iter.depth];

  // End element: points at the start location.
  const uint32_t start_slot = container.tape_index;
  tape.append(start_slot, end);

  // The count could exceed 24 bits, so it saturates: by convention a stored
  // count of 0xffffff means "0xffffff or more", i.e. undetermined.
  const uint32_t count = container.count;
  const uint32_t saturated = count > 0xFFFFFF ? 0xFFFFFF : count;

  // Start element: points at the end location and carries the count.
  // next_tape_index must be read after the append above.
  const uint64_t payload = next_tape_index(iter) | (uint64_t(saturated) << 32);
  tape_writer::write(iter.dom_parser.doc->tape[start_slot], payload, start);
  return SUCCESS;
}
|
|
|
|
// Begins a string: appends a STRING tape element holding the offset of the
// current string-buffer location from the start of the document's string
// buffer, and returns the address where the string bytes should be written
// (just past the space reserved for the uint32_t length prefix).
simdjson_inline uint8_t *tape_builder::on_start_string(json_iterator &iter) noexcept {
  const auto buffer_offset = current_string_buf_loc - iter.dom_parser.doc->string_buf.get();
  tape.append(buffer_offset, internal::tape_type::STRING);
  return current_string_buf_loc + sizeof(uint32_t);
}
|
|
|
|
// Finishes a string whose bytes end at `dst`: stores the 32-bit length in the
// prefix slot reserved at current_string_buf_loc, writes a terminating zero
// byte at `dst`, and advances current_string_buf_loc past that terminator.
simdjson_inline void tape_builder::on_end_string(uint8_t *dst) noexcept {
  uint8_t *const payload_begin = current_string_buf_loc + sizeof(uint32_t);
  const uint32_t str_length = uint32_t(dst - payload_begin);
  // TODO check for overflow in case someone has a crazy string (>=4GB?)
  // But only add the overflow check when the document itself exceeds 4GB.
  // Currently unneeded because documents of 4GB or more are refused.
  memcpy(current_string_buf_loc, &str_length, sizeof(uint32_t));
  // NULL termination is handy for callers that expect NUL-terminated
  // strings; it comes at a small cost.
  *dst = 0;
  current_string_buf_loc = dst + 1;
}
|
|
|
|
} // namespace stage2
|
|
} // unnamed namespace
|
|
} // namespace ppc64
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE2_TAPE_BUILDER_H
|
|
/* end file generic/stage2/tape_builder.h for ppc64 */
|
|
/* end file generic/stage2/amalgamated.h for ppc64 */
|
|
|
|
//
|
|
// Stage 1
|
|
//
|
|
namespace simdjson {
|
|
namespace ppc64 {
|
|
|
|
// Allocates a ppc64 dom_parser_implementation into `dst` (without throwing)
// and configures its capacity and maximum depth.
// Returns MEMALLOC if the allocation fails, the first error reported by
// set_capacity / set_max_depth if either fails, and SUCCESS otherwise.
simdjson_warn_unused error_code implementation::create_dom_parser_implementation(
  size_t capacity,
  size_t max_depth,
  std::unique_ptr<internal::dom_parser_implementation>& dst
) const noexcept {
  dst.reset( new (std::nothrow) dom_parser_implementation() );
  if (!dst) {
    return MEMALLOC;
  }

  auto capacity_error = dst->set_capacity(capacity);
  if (capacity_error) {
    return capacity_error;
  }

  auto depth_error = dst->set_max_depth(max_depth);
  if (depth_error) {
    return depth_error;
  }

  return SUCCESS;
}
|
|
|
|
namespace {
|
|
|
|
using namespace simd;
|
|
|
|
// Classifies the 64 input bytes. Each byte is looked up twice, once by its
// low nibble and once by its high nibble, and the two table results are ANDed.
// A byte whose combined result has any of the bits 0x18 set is reported in the
// whitespace bitmask; any of the bits 0x7 set reports it in the op bitmask.
simdjson_inline json_character_block json_character_block::classify(const simd::simd8x64<uint8_t>& in) {
  const simd8<uint8_t> low_nibble_lut(16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0);
  const simd8<uint8_t> high_nibble_lut(8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0);

  simd8x64<uint8_t> classes(
    (in.chunks[0] & 0xf).lookup_16(low_nibble_lut) & (in.chunks[0].shr<4>()).lookup_16(high_nibble_lut),
    (in.chunks[1] & 0xf).lookup_16(low_nibble_lut) & (in.chunks[1].shr<4>()).lookup_16(high_nibble_lut),
    (in.chunks[2] & 0xf).lookup_16(low_nibble_lut) & (in.chunks[2].shr<4>()).lookup_16(high_nibble_lut),
    (in.chunks[3] & 0xf).lookup_16(low_nibble_lut) & (in.chunks[3].shr<4>()).lookup_16(high_nibble_lut)
  );

  const uint64_t whitespace = simd8x64<bool>(
    classes.chunks[0].any_bits_set(0x18),
    classes.chunks[1].any_bits_set(0x18),
    classes.chunks[2].any_bits_set(0x18),
    classes.chunks[3].any_bits_set(0x18)
  ).to_bitmask();

  const uint64_t op = simd8x64<bool>(
    classes.chunks[0].any_bits_set(0x7),
    classes.chunks[1].any_bits_set(0x7),
    classes.chunks[2].any_bits_set(0x7),
    classes.chunks[3].any_bits_set(0x7)
  ).to_bitmask();

  return { whitespace, op };
}
|
|
|
|
// True when no byte of `input` exceeds 0x7f. The chunks are OR-reduced first;
// subtracting 0x7f with saturation then leaves a nonzero lane only for bytes
// of 0x80 and above (careful: 0x80 itself is not ASCII).
simdjson_inline bool is_ascii(const simd8x64<uint8_t>& input) {
  const auto folded = input.reduce_or();
  return folded.saturating_sub(0x7fu).bits_not_set_anywhere();
}
|
|
|
|
// Returns a per-lane mask that is true where prev1 >= 0xc0, prev2 >= 0xe0 or
// prev3 >= 0xf0, i.e. where one of the preceding bytes is a UTF-8 lead byte
// that requires a continuation byte at this position.
simdjson_unused simdjson_inline simd8<bool> must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
  // Saturating subtraction leaves a lane nonzero only at or above the threshold.
  simd8<uint8_t> after_2plus_lead = prev1.saturating_sub(0xc0u-1); // Only 11______ will be > 0
  simd8<uint8_t> after_3plus_lead = prev2.saturating_sub(0xe0u-1); // Only 111_____ will be > 0
  simd8<uint8_t> after_4_lead     = prev3.saturating_sub(0xf0u-1); // Only 1111____ will be > 0
  // The caller requires a bool (all 1's). Every value produced by the
  // subtractions is <= 64, so a signed comparison is fine.
  return simd8<int8_t>(after_2plus_lead | after_3plus_lead | after_4_lead) > int8_t(0);
}
|
|
|
|
// Returns, per lane, a byte with the high bit (0x80) set when prev2 >= 0xe0 or
// prev3 >= 0xf0. The result is a bit pattern, not a canonical all-ones mask.
simdjson_inline simd8<uint8_t> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
  simd8<uint8_t> from_prev2 = prev2.saturating_sub(0xe0u-0x80); // Only 111_____ will be >= 0x80
  simd8<uint8_t> from_prev3 = prev3.saturating_sub(0xf0u-0x80); // Only 1111____ will be >= 0x80
  return from_prev2 | from_prev3;
}
|
|
|
|
} // unnamed namespace
|
|
} // namespace ppc64
|
|
} // namespace simdjson
|
|
|
|
//
|
|
// Stage 2
|
|
//
|
|
|
|
//
|
|
// Implementation-specific overrides
|
|
//
|
|
namespace simdjson {
|
|
namespace ppc64 {
|
|
|
|
simdjson_warn_unused error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept {
|
|
return ppc64::stage1::json_minifier::minify<64>(buf, len, dst, dst_len);
|
|
}
|
|
|
|
simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, stage1_mode streaming) noexcept {
|
|
this->buf = _buf;
|
|
this->len = _len;
|
|
return ppc64::stage1::json_structural_indexer::index<64>(buf, len, *this, streaming);
|
|
}
|
|
|
|
simdjson_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
|
|
return ppc64::stage1::generic_validate_utf8(buf,len);
|
|
}
|
|
|
|
simdjson_warn_unused error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept {
|
|
return stage2::tape_builder::parse_document<false>(*this, _doc);
|
|
}
|
|
|
|
simdjson_warn_unused error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept {
|
|
return stage2::tape_builder::parse_document<true>(*this, _doc);
|
|
}
|
|
|
|
simdjson_warn_unused uint8_t *dom_parser_implementation::parse_string(const uint8_t *src, uint8_t *dst, bool replacement_char) const noexcept {
|
|
return ppc64::stringparsing::parse_string(src, dst, replacement_char);
|
|
}
|
|
|
|
simdjson_warn_unused uint8_t *dom_parser_implementation::parse_wobbly_string(const uint8_t *src, uint8_t *dst) const noexcept {
|
|
return ppc64::stringparsing::parse_wobbly_string(src, dst);
|
|
}
|
|
|
|
simdjson_warn_unused error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept {
|
|
auto error = stage1(_buf, _len, stage1_mode::regular);
|
|
if (error) { return error; }
|
|
return stage2(_doc);
|
|
}
|
|
|
|
} // namespace ppc64
|
|
} // namespace simdjson
|
|
|
|
/* including simdjson/ppc64/end.h: #include <simdjson/ppc64/end.h> */
|
|
/* begin file simdjson/ppc64/end.h */
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/ppc64/base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
#undef SIMDJSON_SKIP_BACKSLASH_SHORT_CIRCUIT
|
|
/* undefining SIMDJSON_IMPLEMENTATION from "ppc64" */
|
|
#undef SIMDJSON_IMPLEMENTATION
|
|
/* end file simdjson/ppc64/end.h */
|
|
|
|
#endif // SIMDJSON_SRC_PPC64_CPP
|
|
/* end file ppc64.cpp */
|
|
#endif
|
|
#if SIMDJSON_IMPLEMENTATION_WESTMERE
|
|
/* including westmere.cpp: #include <westmere.cpp> */
|
|
/* begin file westmere.cpp */
|
|
#ifndef SIMDJSON_SRC_WESTMERE_CPP
|
|
#define SIMDJSON_SRC_WESTMERE_CPP
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include <base.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
/* including simdjson/westmere.h: #include <simdjson/westmere.h> */
|
|
/* begin file simdjson/westmere.h */
|
|
#ifndef SIMDJSON_WESTMERE_H
|
|
#define SIMDJSON_WESTMERE_H
|
|
|
|
/* including simdjson/westmere/begin.h: #include "simdjson/westmere/begin.h" */
|
|
/* begin file simdjson/westmere/begin.h */
|
|
/* defining SIMDJSON_IMPLEMENTATION to "westmere" */
|
|
#define SIMDJSON_IMPLEMENTATION westmere
|
|
/* including simdjson/westmere/base.h: #include "simdjson/westmere/base.h" */
|
|
/* begin file simdjson/westmere/base.h */
|
|
#ifndef SIMDJSON_WESTMERE_BASE_H
|
|
#define SIMDJSON_WESTMERE_BASE_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
// The constructor may be executed on any host, so we take care not to use SIMDJSON_TARGET_WESTMERE
|
|
namespace simdjson {
/**
 * Implementation for Westmere (Intel SSE4.2).
 */
namespace westmere {

// Forward declaration of the westmere implementation class.
class implementation;

namespace {
namespace simd {

// Forward declarations of the SIMD wrapper templates.
template <class T> struct simd8x64;
template <class T> struct simd8;

} // namespace simd
} // unnamed namespace

} // namespace westmere
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_WESTMERE_BASE_H
|
|
/* end file simdjson/westmere/base.h */
|
|
/* including simdjson/westmere/intrinsics.h: #include "simdjson/westmere/intrinsics.h" */
|
|
/* begin file simdjson/westmere/intrinsics.h */
|
|
#ifndef SIMDJSON_WESTMERE_INTRINSICS_H
|
|
#define SIMDJSON_WESTMERE_INTRINSICS_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/westmere/base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
#if SIMDJSON_VISUAL_STUDIO
|
|
// under clang within visual studio, this will include <x86intrin.h>
|
|
#include <intrin.h> // visual studio or clang
|
|
#else
|
|
#include <x86intrin.h> // elsewhere
|
|
#endif // SIMDJSON_VISUAL_STUDIO
|
|
|
|
|
|
#if SIMDJSON_CLANG_VISUAL_STUDIO
|
|
/**
|
|
* You are not supposed, normally, to include these
|
|
* headers directly. Instead you should either include intrin.h
|
|
* or x86intrin.h. However, when compiling with clang
|
|
* under Windows (i.e., when _MSC_VER is set), these headers
|
|
* only get included *if* the corresponding features are detected
|
|
* from macros:
|
|
*/
|
|
#include <smmintrin.h> // for _mm_alignr_epi8
|
|
#include <wmmintrin.h> // for _mm_clmulepi64_si128
|
|
#endif
|
|
|
|
static_assert(sizeof(__m128i) <= simdjson::SIMDJSON_PADDING, "insufficient padding for westmere");
|
|
|
|
#endif // SIMDJSON_WESTMERE_INTRINSICS_H
|
|
/* end file simdjson/westmere/intrinsics.h */
|
|
|
|
#if !SIMDJSON_CAN_ALWAYS_RUN_WESTMERE
|
|
SIMDJSON_TARGET_REGION("sse4.2,pclmul,popcnt")
|
|
#endif
|
|
|
|
/* including simdjson/westmere/bitmanipulation.h: #include "simdjson/westmere/bitmanipulation.h" */
|
|
/* begin file simdjson/westmere/bitmanipulation.h */
|
|
#ifndef SIMDJSON_WESTMERE_BITMANIPULATION_H
|
|
#define SIMDJSON_WESTMERE_BITMANIPULATION_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/westmere/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/westmere/intrinsics.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace westmere {
|
|
namespace {
|
|
|
|
// We sometimes call trailing_zero on inputs that are zero,
|
|
// but the algorithms do not end up using the returned value.
|
|
// Sadly, sanitizers are not smart enough to figure it out.
|
|
SIMDJSON_NO_SANITIZE_UNDEFINED
|
|
// This function can be used safely even if not all bytes have been
|
|
// initialized.
|
|
// See issue https://github.com/simdjson/simdjson/issues/1965
|
|
SIMDJSON_NO_SANITIZE_MEMORY
|
|
simdjson_inline int trailing_zeroes(uint64_t input_num) {
|
|
#if SIMDJSON_REGULAR_VISUAL_STUDIO
|
|
unsigned long ret;
|
|
// Search the mask data from least significant bit (LSB)
|
|
// to the most significant bit (MSB) for a set bit (1).
|
|
_BitScanForward64(&ret, input_num);
|
|
return (int)ret;
|
|
#else // SIMDJSON_REGULAR_VISUAL_STUDIO
|
|
return __builtin_ctzll(input_num);
|
|
#endif // SIMDJSON_REGULAR_VISUAL_STUDIO
|
|
}
|
|
|
|
/* result might be undefined when input_num is zero */
|
|
simdjson_inline uint64_t clear_lowest_bit(uint64_t input_num) {
|
|
return input_num & (input_num-1);
|
|
}
|
|
|
|
/* result might be undefined when input_num is zero */
|
|
simdjson_inline int leading_zeroes(uint64_t input_num) {
|
|
#if SIMDJSON_REGULAR_VISUAL_STUDIO
|
|
unsigned long leading_zero = 0;
|
|
// Search the mask data from most significant bit (MSB)
|
|
// to least significant bit (LSB) for a set bit (1).
|
|
if (_BitScanReverse64(&leading_zero, input_num))
|
|
return (int)(63 - leading_zero);
|
|
else
|
|
return 64;
|
|
#else
|
|
return __builtin_clzll(input_num);
|
|
#endif// SIMDJSON_REGULAR_VISUAL_STUDIO
|
|
}
|
|
|
|
#if SIMDJSON_REGULAR_VISUAL_STUDIO
|
|
simdjson_inline unsigned __int64 count_ones(uint64_t input_num) {
|
|
// note: we do not support legacy 32-bit Windows in this kernel
|
|
return __popcnt64(input_num);// Visual Studio wants two underscores
|
|
}
|
|
#else
|
|
simdjson_inline long long int count_ones(uint64_t input_num) {
|
|
return _popcnt64(input_num);
|
|
}
|
|
#endif
|
|
|
|
// Adds value1 and value2, stores the sum through `result`, and returns the
// carry/overflow indication reported by the compiler intrinsic.
simdjson_inline bool add_overflow(uint64_t value1, uint64_t value2,
                                  uint64_t *result) {
#if SIMDJSON_REGULAR_VISUAL_STUDIO
  auto *sum_out = reinterpret_cast<unsigned __int64 *>(result);
  return _addcarry_u64(0, value1, value2, sum_out);
#else
  auto *sum_out = reinterpret_cast<unsigned long long *>(result);
  return __builtin_uaddll_overflow(value1, value2, sum_out);
#endif
}
|
|
|
|
} // unnamed namespace
|
|
} // namespace westmere
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_WESTMERE_BITMANIPULATION_H
|
|
/* end file simdjson/westmere/bitmanipulation.h */
|
|
/* including simdjson/westmere/bitmask.h: #include "simdjson/westmere/bitmask.h" */
|
|
/* begin file simdjson/westmere/bitmask.h */
|
|
#ifndef SIMDJSON_WESTMERE_BITMASK_H
|
|
#define SIMDJSON_WESTMERE_BITMASK_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/westmere/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/westmere/intrinsics.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace westmere {
|
|
namespace {
|
|
|
|
//
|
|
// Perform a "cumulative bitwise xor," flipping bits each time a 1 is encountered.
|
|
//
|
|
// For example, prefix_xor(00100100) == 00011100
|
|
//
|
|
simdjson_inline uint64_t prefix_xor(const uint64_t bitmask) {
|
|
// There should be no such thing with a processing supporting avx2
|
|
// but not clmul.
|
|
__m128i all_ones = _mm_set1_epi8('\xFF');
|
|
__m128i result = _mm_clmulepi64_si128(_mm_set_epi64x(0ULL, bitmask), all_ones, 0);
|
|
return _mm_cvtsi128_si64(result);
|
|
}
|
|
|
|
} // unnamed namespace
|
|
} // namespace westmere
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_WESTMERE_BITMASK_H
|
|
/* end file simdjson/westmere/bitmask.h */
|
|
/* including simdjson/westmere/numberparsing_defs.h: #include "simdjson/westmere/numberparsing_defs.h" */
|
|
/* begin file simdjson/westmere/numberparsing_defs.h */
|
|
#ifndef SIMDJSON_WESTMERE_NUMBERPARSING_DEFS_H
|
|
#define SIMDJSON_WESTMERE_NUMBERPARSING_DEFS_H
|
|
|
|
/* including simdjson/westmere/base.h: #include "simdjson/westmere/base.h" */
|
|
/* begin file simdjson/westmere/base.h */
|
|
#ifndef SIMDJSON_WESTMERE_BASE_H
|
|
#define SIMDJSON_WESTMERE_BASE_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
// The constructor may be executed on any host, so we take care not to use SIMDJSON_TARGET_WESTMERE
|
|
namespace simdjson {
/**
 * Implementation for Westmere (Intel SSE4.2).
 */
namespace westmere {

// Forward declaration of the westmere implementation class.
class implementation;

namespace {
namespace simd {

// Forward declarations of the SIMD wrapper templates.
template <class T> struct simd8x64;
template <class T> struct simd8;

} // namespace simd
} // unnamed namespace

} // namespace westmere
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_WESTMERE_BASE_H
|
|
/* end file simdjson/westmere/base.h */
|
|
/* including simdjson/westmere/intrinsics.h: #include "simdjson/westmere/intrinsics.h" */
|
|
/* begin file simdjson/westmere/intrinsics.h */
|
|
#ifndef SIMDJSON_WESTMERE_INTRINSICS_H
|
|
#define SIMDJSON_WESTMERE_INTRINSICS_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/westmere/base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
#if SIMDJSON_VISUAL_STUDIO
|
|
// under clang within visual studio, this will include <x86intrin.h>
|
|
#include <intrin.h> // visual studio or clang
|
|
#else
|
|
#include <x86intrin.h> // elsewhere
|
|
#endif // SIMDJSON_VISUAL_STUDIO
|
|
|
|
|
|
#if SIMDJSON_CLANG_VISUAL_STUDIO
|
|
/**
|
|
* You are not supposed, normally, to include these
|
|
* headers directly. Instead you should either include intrin.h
|
|
* or x86intrin.h. However, when compiling with clang
|
|
* under Windows (i.e., when _MSC_VER is set), these headers
|
|
* only get included *if* the corresponding features are detected
|
|
* from macros:
|
|
*/
|
|
#include <smmintrin.h> // for _mm_alignr_epi8
|
|
#include <wmmintrin.h> // for _mm_clmulepi64_si128
|
|
#endif
|
|
|
|
static_assert(sizeof(__m128i) <= simdjson::SIMDJSON_PADDING, "insufficient padding for westmere");
|
|
|
|
#endif // SIMDJSON_WESTMERE_INTRINSICS_H
|
|
/* end file simdjson/westmere/intrinsics.h */
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/numberparsing_tables.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace westmere {
|
|
namespace numberparsing {
|
|
|
|
/** @private */
|
|
static simdjson_inline uint32_t parse_eight_digits_unrolled(const uint8_t *chars) {
|
|
// this actually computes *16* values so we are being wasteful.
|
|
const __m128i ascii0 = _mm_set1_epi8('0');
|
|
const __m128i mul_1_10 =
|
|
_mm_setr_epi8(10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1);
|
|
const __m128i mul_1_100 = _mm_setr_epi16(100, 1, 100, 1, 100, 1, 100, 1);
|
|
const __m128i mul_1_10000 =
|
|
_mm_setr_epi16(10000, 1, 10000, 1, 10000, 1, 10000, 1);
|
|
const __m128i input = _mm_sub_epi8(
|
|
_mm_loadu_si128(reinterpret_cast<const __m128i *>(chars)), ascii0);
|
|
const __m128i t1 = _mm_maddubs_epi16(input, mul_1_10);
|
|
const __m128i t2 = _mm_madd_epi16(t1, mul_1_100);
|
|
const __m128i t3 = _mm_packus_epi32(t2, t2);
|
|
const __m128i t4 = _mm_madd_epi16(t3, mul_1_10000);
|
|
return _mm_cvtsi128_si32(
|
|
t4); // only captures the sum of the first 8 digits, drop the rest
|
|
}
|
|
|
|
/** @private */
|
|
simdjson_inline internal::value128 full_multiplication(uint64_t value1, uint64_t value2) {
|
|
internal::value128 answer;
|
|
#if SIMDJSON_REGULAR_VISUAL_STUDIO || SIMDJSON_IS_32BITS
|
|
#if SIMDJSON_IS_ARM64
|
|
// ARM64 has native support for 64-bit multiplications, no need to emultate
|
|
answer.high = __umulh(value1, value2);
|
|
answer.low = value1 * value2;
|
|
#else
|
|
answer.low = _umul128(value1, value2, &answer.high); // _umul128 not available on ARM64
|
|
#endif // SIMDJSON_IS_ARM64
|
|
#else // SIMDJSON_REGULAR_VISUAL_STUDIO || SIMDJSON_IS_32BITS
|
|
__uint128_t r = (static_cast<__uint128_t>(value1)) * value2;
|
|
answer.low = uint64_t(r);
|
|
answer.high = uint64_t(r >> 64);
|
|
#endif
|
|
return answer;
|
|
}
|
|
|
|
} // namespace numberparsing
|
|
} // namespace westmere
|
|
} // namespace simdjson
|
|
|
|
#define SIMDJSON_SWAR_NUMBER_PARSING 1
|
|
|
|
#endif // SIMDJSON_WESTMERE_NUMBERPARSING_DEFS_H
|
|
/* end file simdjson/westmere/numberparsing_defs.h */
|
|
/* including simdjson/westmere/simd.h: #include "simdjson/westmere/simd.h" */
|
|
/* begin file simdjson/westmere/simd.h */
|
|
#ifndef SIMDJSON_WESTMERE_SIMD_H
|
|
#define SIMDJSON_WESTMERE_SIMD_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/westmere/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/westmere/bitmanipulation.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/simdprune_tables.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace westmere {
|
|
namespace {
|
|
namespace simd {
|
|
|
|
template<typename Child>
|
|
struct base {
|
|
__m128i value;
|
|
|
|
// Zero constructor
|
|
simdjson_inline base() : value{__m128i()} {}
|
|
|
|
// Conversion from SIMD register
|
|
simdjson_inline base(const __m128i _value) : value(_value) {}
|
|
|
|
// Conversion to SIMD register
|
|
simdjson_inline operator const __m128i&() const { return this->value; }
|
|
simdjson_inline operator __m128i&() { return this->value; }
|
|
|
|
// Bit operations
|
|
simdjson_inline Child operator|(const Child other) const { return _mm_or_si128(*this, other); }
|
|
simdjson_inline Child operator&(const Child other) const { return _mm_and_si128(*this, other); }
|
|
simdjson_inline Child operator^(const Child other) const { return _mm_xor_si128(*this, other); }
|
|
simdjson_inline Child bit_andnot(const Child other) const { return _mm_andnot_si128(other, *this); }
|
|
simdjson_inline Child& operator|=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast | other; return *this_cast; }
|
|
simdjson_inline Child& operator&=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast & other; return *this_cast; }
|
|
simdjson_inline Child& operator^=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast ^ other; return *this_cast; }
|
|
};
|
|
|
|
template<typename T, typename Mask=simd8<bool>>
|
|
struct base8: base<simd8<T>> {
|
|
typedef uint16_t bitmask_t;
|
|
typedef uint32_t bitmask2_t;
|
|
|
|
simdjson_inline base8() : base<simd8<T>>() {}
|
|
simdjson_inline base8(const __m128i _value) : base<simd8<T>>(_value) {}
|
|
|
|
friend simdjson_inline Mask operator==(const simd8<T> lhs, const simd8<T> rhs) { return _mm_cmpeq_epi8(lhs, rhs); }
|
|
|
|
static const int SIZE = sizeof(base<simd8<T>>::value);
|
|
|
|
template<int N=1>
|
|
simdjson_inline simd8<T> prev(const simd8<T> prev_chunk) const {
|
|
return _mm_alignr_epi8(*this, prev_chunk, 16 - N);
|
|
}
|
|
};
|
|
|
|
// SIMD byte mask type (returned by things like eq and gt)
|
|
template<>
|
|
struct simd8<bool>: base8<bool> {
|
|
static simdjson_inline simd8<bool> splat(bool _value) { return _mm_set1_epi8(uint8_t(-(!!_value))); }
|
|
|
|
simdjson_inline simd8() : base8() {}
|
|
simdjson_inline simd8(const __m128i _value) : base8<bool>(_value) {}
|
|
// Splat constructor
|
|
simdjson_inline simd8(bool _value) : base8<bool>(splat(_value)) {}
|
|
|
|
simdjson_inline int to_bitmask() const { return _mm_movemask_epi8(*this); }
|
|
simdjson_inline bool any() const { return !_mm_testz_si128(*this, *this); }
|
|
simdjson_inline simd8<bool> operator~() const { return *this ^ true; }
|
|
};
|
|
|
|
template<typename T>
|
|
struct base8_numeric: base8<T> {
|
|
static simdjson_inline simd8<T> splat(T _value) { return _mm_set1_epi8(_value); }
|
|
static simdjson_inline simd8<T> zero() { return _mm_setzero_si128(); }
|
|
static simdjson_inline simd8<T> load(const T values[16]) {
|
|
return _mm_loadu_si128(reinterpret_cast<const __m128i *>(values));
|
|
}
|
|
// Repeat 16 values as many times as necessary (usually for lookup tables)
|
|
static simdjson_inline simd8<T> repeat_16(
|
|
T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7,
|
|
T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15
|
|
) {
|
|
return simd8<T>(
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15
|
|
);
|
|
}
|
|
|
|
simdjson_inline base8_numeric() : base8<T>() {}
|
|
simdjson_inline base8_numeric(const __m128i _value) : base8<T>(_value) {}
|
|
|
|
// Store to array
|
|
simdjson_inline void store(T dst[16]) const { return _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), *this); }
|
|
|
|
// Override to distinguish from bool version
|
|
simdjson_inline simd8<T> operator~() const { return *this ^ 0xFFu; }
|
|
|
|
// Addition/subtraction are the same for signed and unsigned
|
|
simdjson_inline simd8<T> operator+(const simd8<T> other) const { return _mm_add_epi8(*this, other); }
|
|
simdjson_inline simd8<T> operator-(const simd8<T> other) const { return _mm_sub_epi8(*this, other); }
|
|
simdjson_inline simd8<T>& operator+=(const simd8<T> other) { *this = *this + other; return *static_cast<simd8<T>*>(this); }
|
|
simdjson_inline simd8<T>& operator-=(const simd8<T> other) { *this = *this - other; return *static_cast<simd8<T>*>(this); }
|
|
|
|
// Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values)
|
|
template<typename L>
|
|
simdjson_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
|
|
return _mm_shuffle_epi8(lookup_table, *this);
|
|
}
|
|
|
|
// Copies to 'output" all bytes corresponding to a 0 in the mask (interpreted as a bitset).
|
|
// Passing a 0 value for mask would be equivalent to writing out every byte to output.
|
|
// Only the first 16 - count_ones(mask) bytes of the result are significant but 16 bytes
|
|
// get written.
|
|
// Design consideration: it seems like a function with the
|
|
// signature simd8<L> compress(uint32_t mask) would be
|
|
// sensible, but the AVX ISA makes this kind of approach difficult.
|
|
template<typename L>
|
|
simdjson_inline void compress(uint16_t mask, L * output) const {
|
|
using internal::thintable_epi8;
|
|
using internal::BitsSetTable256mul2;
|
|
using internal::pshufb_combine_table;
|
|
// this particular implementation was inspired by work done by @animetosho
|
|
// we do it in two steps, first 8 bytes and then second 8 bytes
|
|
uint8_t mask1 = uint8_t(mask); // least significant 8 bits
|
|
uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits
|
|
// next line just loads the 64-bit values thintable_epi8[mask1] and
|
|
// thintable_epi8[mask2] into a 128-bit register, using only
|
|
// two instructions on most compilers.
|
|
__m128i shufmask = _mm_set_epi64x(thintable_epi8[mask2], thintable_epi8[mask1]);
|
|
// we increment by 0x08 the second half of the mask
|
|
shufmask =
|
|
_mm_add_epi8(shufmask, _mm_set_epi32(0x08080808, 0x08080808, 0, 0));
|
|
// this is the version "nearly pruned"
|
|
__m128i pruned = _mm_shuffle_epi8(*this, shufmask);
|
|
// we still need to put the two halves together.
|
|
// we compute the popcount of the first half:
|
|
int pop1 = BitsSetTable256mul2[mask1];
|
|
// then load the corresponding mask, what it does is to write
|
|
// only the first pop1 bytes from the first 8 bytes, and then
|
|
// it fills in with the bytes from the second 8 bytes + some filling
|
|
// at the end.
|
|
__m128i compactmask =
|
|
_mm_loadu_si128(reinterpret_cast<const __m128i *>(pshufb_combine_table + pop1 * 8));
|
|
__m128i answer = _mm_shuffle_epi8(pruned, compactmask);
|
|
_mm_storeu_si128(reinterpret_cast<__m128i *>(output), answer);
|
|
}
|
|
|
|
template<typename L>
|
|
simdjson_inline simd8<L> lookup_16(
|
|
L replace0, L replace1, L replace2, L replace3,
|
|
L replace4, L replace5, L replace6, L replace7,
|
|
L replace8, L replace9, L replace10, L replace11,
|
|
L replace12, L replace13, L replace14, L replace15) const {
|
|
return lookup_16(simd8<L>::repeat_16(
|
|
replace0, replace1, replace2, replace3,
|
|
replace4, replace5, replace6, replace7,
|
|
replace8, replace9, replace10, replace11,
|
|
replace12, replace13, replace14, replace15
|
|
));
|
|
}
|
|
};
|
|
|
|
// Signed bytes
|
|
template<>
|
|
struct simd8<int8_t> : base8_numeric<int8_t> {
|
|
simdjson_inline simd8() : base8_numeric<int8_t>() {}
|
|
simdjson_inline simd8(const __m128i _value) : base8_numeric<int8_t>(_value) {}
|
|
// Splat constructor
|
|
simdjson_inline simd8(int8_t _value) : simd8(splat(_value)) {}
|
|
// Array constructor
|
|
simdjson_inline simd8(const int8_t* values) : simd8(load(values)) {}
|
|
// Member-by-member initialization
|
|
simdjson_inline simd8(
|
|
int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
|
|
int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
|
|
) : simd8(_mm_setr_epi8(
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15
|
|
)) {}
|
|
// Repeat 16 values as many times as necessary (usually for lookup tables)
|
|
simdjson_inline static simd8<int8_t> repeat_16(
|
|
int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
|
|
int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
|
|
) {
|
|
return simd8<int8_t>(
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15
|
|
);
|
|
}
|
|
|
|
// Order-sensitive comparisons
|
|
simdjson_inline simd8<int8_t> max_val(const simd8<int8_t> other) const { return _mm_max_epi8(*this, other); }
|
|
simdjson_inline simd8<int8_t> min_val(const simd8<int8_t> other) const { return _mm_min_epi8(*this, other); }
|
|
simdjson_inline simd8<bool> operator>(const simd8<int8_t> other) const { return _mm_cmpgt_epi8(*this, other); }
|
|
simdjson_inline simd8<bool> operator<(const simd8<int8_t> other) const { return _mm_cmpgt_epi8(other, *this); }
|
|
};
|
|
|
|
// Unsigned bytes
|
|
template<>
|
|
struct simd8<uint8_t>: base8_numeric<uint8_t> {
|
|
simdjson_inline simd8() : base8_numeric<uint8_t>() {}
|
|
simdjson_inline simd8(const __m128i _value) : base8_numeric<uint8_t>(_value) {}
|
|
// Splat constructor
|
|
simdjson_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
|
|
// Array constructor
|
|
simdjson_inline simd8(const uint8_t* values) : simd8(load(values)) {}
|
|
// Member-by-member initialization
|
|
simdjson_inline simd8(
|
|
uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
|
|
uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
|
|
) : simd8(_mm_setr_epi8(
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15
|
|
)) {}
|
|
// Repeat 16 values as many times as necessary (usually for lookup tables)
|
|
simdjson_inline static simd8<uint8_t> repeat_16(
|
|
uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
|
|
uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
|
|
) {
|
|
return simd8<uint8_t>(
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15
|
|
);
|
|
}
|
|
|
|
// Saturated math
|
|
simdjson_inline simd8<uint8_t> saturating_add(const simd8<uint8_t> other) const { return _mm_adds_epu8(*this, other); }
|
|
simdjson_inline simd8<uint8_t> saturating_sub(const simd8<uint8_t> other) const { return _mm_subs_epu8(*this, other); }
|
|
|
|
// Order-specific operations
|
|
simdjson_inline simd8<uint8_t> max_val(const simd8<uint8_t> other) const { return _mm_max_epu8(*this, other); }
|
|
simdjson_inline simd8<uint8_t> min_val(const simd8<uint8_t> other) const { return _mm_min_epu8(*this, other); }
|
|
// Same as >, but only guarantees true is nonzero (< guarantees true = -1)
|
|
simdjson_inline simd8<uint8_t> gt_bits(const simd8<uint8_t> other) const { return this->saturating_sub(other); }
|
|
// Same as <, but only guarantees true is nonzero (< guarantees true = -1)
|
|
simdjson_inline simd8<uint8_t> lt_bits(const simd8<uint8_t> other) const { return other.saturating_sub(*this); }
|
|
simdjson_inline simd8<bool> operator<=(const simd8<uint8_t> other) const { return other.max_val(*this) == other; }
|
|
simdjson_inline simd8<bool> operator>=(const simd8<uint8_t> other) const { return other.min_val(*this) == other; }
|
|
simdjson_inline simd8<bool> operator>(const simd8<uint8_t> other) const { return this->gt_bits(other).any_bits_set(); }
|
|
simdjson_inline simd8<bool> operator<(const simd8<uint8_t> other) const { return this->gt_bits(other).any_bits_set(); }
|
|
|
|
// Bit-specific operations
|
|
simdjson_inline simd8<bool> bits_not_set() const { return *this == uint8_t(0); }
|
|
simdjson_inline simd8<bool> bits_not_set(simd8<uint8_t> bits) const { return (*this & bits).bits_not_set(); }
|
|
simdjson_inline simd8<bool> any_bits_set() const { return ~this->bits_not_set(); }
|
|
simdjson_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const { return ~this->bits_not_set(bits); }
|
|
simdjson_inline bool is_ascii() const { return _mm_movemask_epi8(*this) == 0; }
|
|
simdjson_inline bool bits_not_set_anywhere() const { return _mm_testz_si128(*this, *this); }
|
|
simdjson_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); }
|
|
simdjson_inline bool bits_not_set_anywhere(simd8<uint8_t> bits) const { return _mm_testz_si128(*this, bits); }
|
|
simdjson_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const { return !bits_not_set_anywhere(bits); }
|
|
template<int N>
|
|
simdjson_inline simd8<uint8_t> shr() const { return simd8<uint8_t>(_mm_srli_epi16(*this, N)) & uint8_t(0xFFu >> N); }
|
|
template<int N>
|
|
simdjson_inline simd8<uint8_t> shl() const { return simd8<uint8_t>(_mm_slli_epi16(*this, N)) & uint8_t(0xFFu << N); }
|
|
// Get one of the bits and make a bitmask out of it.
|
|
// e.g. value.get_bit<7>() gets the high bit
|
|
template<int N>
|
|
simdjson_inline int get_bit() const { return _mm_movemask_epi8(_mm_slli_epi16(*this, 7-N)); }
|
|
};
|
|
|
|
template<typename T>
|
|
struct simd8x64 {
|
|
static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
|
|
static_assert(NUM_CHUNKS == 4, "Westmere kernel should use four registers per 64-byte block.");
|
|
const simd8<T> chunks[NUM_CHUNKS];
|
|
|
|
simd8x64(const simd8x64<T>& o) = delete; // no copy allowed
|
|
simd8x64<T>& operator=(const simd8<T>& other) = delete; // no assignment allowed
|
|
simd8x64() = delete; // no default constructor allowed
|
|
|
|
simdjson_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1, const simd8<T> chunk2, const simd8<T> chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {}
|
|
simdjson_inline simd8x64(const T ptr[64]) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+16), simd8<T>::load(ptr+32), simd8<T>::load(ptr+48)} {}
|
|
|
|
simdjson_inline void store(T ptr[64]) const {
|
|
this->chunks[0].store(ptr+sizeof(simd8<T>)*0);
|
|
this->chunks[1].store(ptr+sizeof(simd8<T>)*1);
|
|
this->chunks[2].store(ptr+sizeof(simd8<T>)*2);
|
|
this->chunks[3].store(ptr+sizeof(simd8<T>)*3);
|
|
}
|
|
|
|
simdjson_inline simd8<T> reduce_or() const {
|
|
return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]);
|
|
}
|
|
|
|
simdjson_inline uint64_t compress(uint64_t mask, T * output) const {
|
|
this->chunks[0].compress(uint16_t(mask), output);
|
|
this->chunks[1].compress(uint16_t(mask >> 16), output + 16 - count_ones(mask & 0xFFFF));
|
|
this->chunks[2].compress(uint16_t(mask >> 32), output + 32 - count_ones(mask & 0xFFFFFFFF));
|
|
this->chunks[3].compress(uint16_t(mask >> 48), output + 48 - count_ones(mask & 0xFFFFFFFFFFFF));
|
|
return 64 - count_ones(mask);
|
|
}
|
|
|
|
simdjson_inline uint64_t to_bitmask() const {
|
|
uint64_t r0 = uint32_t(this->chunks[0].to_bitmask() );
|
|
uint64_t r1 = this->chunks[1].to_bitmask() ;
|
|
uint64_t r2 = this->chunks[2].to_bitmask() ;
|
|
uint64_t r3 = this->chunks[3].to_bitmask() ;
|
|
return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48);
|
|
}
|
|
|
|
simdjson_inline uint64_t eq(const T m) const {
|
|
const simd8<T> mask = simd8<T>::splat(m);
|
|
return simd8x64<bool>(
|
|
this->chunks[0] == mask,
|
|
this->chunks[1] == mask,
|
|
this->chunks[2] == mask,
|
|
this->chunks[3] == mask
|
|
).to_bitmask();
|
|
}
|
|
|
|
simdjson_inline uint64_t eq(const simd8x64<uint8_t> &other) const {
|
|
return simd8x64<bool>(
|
|
this->chunks[0] == other.chunks[0],
|
|
this->chunks[1] == other.chunks[1],
|
|
this->chunks[2] == other.chunks[2],
|
|
this->chunks[3] == other.chunks[3]
|
|
).to_bitmask();
|
|
}
|
|
|
|
simdjson_inline uint64_t lteq(const T m) const {
|
|
const simd8<T> mask = simd8<T>::splat(m);
|
|
return simd8x64<bool>(
|
|
this->chunks[0] <= mask,
|
|
this->chunks[1] <= mask,
|
|
this->chunks[2] <= mask,
|
|
this->chunks[3] <= mask
|
|
).to_bitmask();
|
|
}
|
|
}; // struct simd8x64<T>
|
|
|
|
} // namespace simd
|
|
} // unnamed namespace
|
|
} // namespace westmere
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_WESTMERE_SIMD_H
|
|
/* end file simdjson/westmere/simd.h */
|
|
/* including simdjson/westmere/stringparsing_defs.h: #include "simdjson/westmere/stringparsing_defs.h" */
|
|
/* begin file simdjson/westmere/stringparsing_defs.h */
|
|
#ifndef SIMDJSON_WESTMERE_STRINGPARSING_DEFS_H
|
|
#define SIMDJSON_WESTMERE_STRINGPARSING_DEFS_H
|
|
|
|
/* including simdjson/westmere/bitmanipulation.h: #include "simdjson/westmere/bitmanipulation.h" */
|
|
/* begin file simdjson/westmere/bitmanipulation.h */
|
|
#ifndef SIMDJSON_WESTMERE_BITMANIPULATION_H
|
|
#define SIMDJSON_WESTMERE_BITMANIPULATION_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/westmere/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/westmere/intrinsics.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace westmere {
|
|
namespace {
|
|
|
|
// We sometimes call trailing_zero on inputs that are zero,
|
|
// but the algorithms do not end up using the returned value.
|
|
// Sadly, sanitizers are not smart enough to figure it out.
|
|
SIMDJSON_NO_SANITIZE_UNDEFINED
|
|
// This function can be used safely even if not all bytes have been
|
|
// initialized.
|
|
// See issue https://github.com/simdjson/simdjson/issues/1965
|
|
SIMDJSON_NO_SANITIZE_MEMORY
|
|
simdjson_inline int trailing_zeroes(uint64_t input_num) {
|
|
#if SIMDJSON_REGULAR_VISUAL_STUDIO
|
|
unsigned long ret;
|
|
// Search the mask data from least significant bit (LSB)
|
|
// to the most significant bit (MSB) for a set bit (1).
|
|
_BitScanForward64(&ret, input_num);
|
|
return (int)ret;
|
|
#else // SIMDJSON_REGULAR_VISUAL_STUDIO
|
|
return __builtin_ctzll(input_num);
|
|
#endif // SIMDJSON_REGULAR_VISUAL_STUDIO
|
|
}
|
|
|
|
/* result might be undefined when input_num is zero */
|
|
simdjson_inline uint64_t clear_lowest_bit(uint64_t input_num) {
|
|
return input_num & (input_num-1);
|
|
}
|
|
|
|
/* result might be undefined when input_num is zero */
|
|
simdjson_inline int leading_zeroes(uint64_t input_num) {
|
|
#if SIMDJSON_REGULAR_VISUAL_STUDIO
|
|
unsigned long leading_zero = 0;
|
|
// Search the mask data from most significant bit (MSB)
|
|
// to least significant bit (LSB) for a set bit (1).
|
|
if (_BitScanReverse64(&leading_zero, input_num))
|
|
return (int)(63 - leading_zero);
|
|
else
|
|
return 64;
|
|
#else
|
|
return __builtin_clzll(input_num);
|
|
#endif// SIMDJSON_REGULAR_VISUAL_STUDIO
|
|
}
|
|
|
|
#if SIMDJSON_REGULAR_VISUAL_STUDIO
|
|
simdjson_inline unsigned __int64 count_ones(uint64_t input_num) {
|
|
// note: we do not support legacy 32-bit Windows in this kernel
|
|
return __popcnt64(input_num);// Visual Studio wants two underscores
|
|
}
|
|
#else
|
|
simdjson_inline long long int count_ones(uint64_t input_num) {
|
|
return _popcnt64(input_num);
|
|
}
|
|
#endif
|
|
|
|
// Adds value1 and value2, storing the sum in *result.
// Returns true when the addition carried out of 64 bits.
simdjson_inline bool add_overflow(uint64_t value1, uint64_t value2,
                                  uint64_t *result) {
#if SIMDJSON_REGULAR_VISUAL_STUDIO
  unsigned __int64 * const sum_out = reinterpret_cast<unsigned __int64 *>(result);
  return _addcarry_u64(0, value1, value2, sum_out);
#else
  unsigned long long * const sum_out = reinterpret_cast<unsigned long long *>(result);
  return __builtin_uaddll_overflow(value1, value2, sum_out);
#endif
}
|
|
|
|
} // unnamed namespace
|
|
} // namespace westmere
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_WESTMERE_BITMANIPULATION_H
|
|
/* end file simdjson/westmere/bitmanipulation.h */
|
|
/* including simdjson/westmere/simd.h: #include "simdjson/westmere/simd.h" */
|
|
/* begin file simdjson/westmere/simd.h */
|
|
#ifndef SIMDJSON_WESTMERE_SIMD_H
|
|
#define SIMDJSON_WESTMERE_SIMD_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/westmere/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/westmere/bitmanipulation.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/simdprune_tables.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace westmere {
|
|
namespace {
|
|
namespace simd {
|
|
|
|
template<typename Child>
|
|
struct base {
|
|
__m128i value;
|
|
|
|
// Zero constructor
|
|
simdjson_inline base() : value{__m128i()} {}
|
|
|
|
// Conversion from SIMD register
|
|
simdjson_inline base(const __m128i _value) : value(_value) {}
|
|
|
|
// Conversion to SIMD register
|
|
simdjson_inline operator const __m128i&() const { return this->value; }
|
|
simdjson_inline operator __m128i&() { return this->value; }
|
|
|
|
// Bit operations
|
|
simdjson_inline Child operator|(const Child other) const { return _mm_or_si128(*this, other); }
|
|
simdjson_inline Child operator&(const Child other) const { return _mm_and_si128(*this, other); }
|
|
simdjson_inline Child operator^(const Child other) const { return _mm_xor_si128(*this, other); }
|
|
simdjson_inline Child bit_andnot(const Child other) const { return _mm_andnot_si128(other, *this); }
|
|
simdjson_inline Child& operator|=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast | other; return *this_cast; }
|
|
simdjson_inline Child& operator&=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast & other; return *this_cast; }
|
|
simdjson_inline Child& operator^=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast ^ other; return *this_cast; }
|
|
};
|
|
|
|
template<typename T, typename Mask=simd8<bool>>
|
|
struct base8: base<simd8<T>> {
|
|
typedef uint16_t bitmask_t;
|
|
typedef uint32_t bitmask2_t;
|
|
|
|
simdjson_inline base8() : base<simd8<T>>() {}
|
|
simdjson_inline base8(const __m128i _value) : base<simd8<T>>(_value) {}
|
|
|
|
friend simdjson_inline Mask operator==(const simd8<T> lhs, const simd8<T> rhs) { return _mm_cmpeq_epi8(lhs, rhs); }
|
|
|
|
static const int SIZE = sizeof(base<simd8<T>>::value);
|
|
|
|
template<int N=1>
|
|
simdjson_inline simd8<T> prev(const simd8<T> prev_chunk) const {
|
|
return _mm_alignr_epi8(*this, prev_chunk, 16 - N);
|
|
}
|
|
};
|
|
|
|
// SIMD byte mask type (returned by things like eq and gt)
|
|
template<>
|
|
struct simd8<bool>: base8<bool> {
|
|
static simdjson_inline simd8<bool> splat(bool _value) { return _mm_set1_epi8(uint8_t(-(!!_value))); }
|
|
|
|
simdjson_inline simd8() : base8() {}
|
|
simdjson_inline simd8(const __m128i _value) : base8<bool>(_value) {}
|
|
// Splat constructor
|
|
simdjson_inline simd8(bool _value) : base8<bool>(splat(_value)) {}
|
|
|
|
simdjson_inline int to_bitmask() const { return _mm_movemask_epi8(*this); }
|
|
simdjson_inline bool any() const { return !_mm_testz_si128(*this, *this); }
|
|
simdjson_inline simd8<bool> operator~() const { return *this ^ true; }
|
|
};
|
|
|
|
template<typename T>
|
|
struct base8_numeric: base8<T> {
|
|
static simdjson_inline simd8<T> splat(T _value) { return _mm_set1_epi8(_value); }
|
|
static simdjson_inline simd8<T> zero() { return _mm_setzero_si128(); }
|
|
static simdjson_inline simd8<T> load(const T values[16]) {
|
|
return _mm_loadu_si128(reinterpret_cast<const __m128i *>(values));
|
|
}
|
|
// Repeat 16 values as many times as necessary (usually for lookup tables)
|
|
static simdjson_inline simd8<T> repeat_16(
|
|
T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7,
|
|
T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15
|
|
) {
|
|
return simd8<T>(
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15
|
|
);
|
|
}
|
|
|
|
simdjson_inline base8_numeric() : base8<T>() {}
|
|
simdjson_inline base8_numeric(const __m128i _value) : base8<T>(_value) {}
|
|
|
|
// Store to array
|
|
simdjson_inline void store(T dst[16]) const { return _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), *this); }
|
|
|
|
// Override to distinguish from bool version
|
|
simdjson_inline simd8<T> operator~() const { return *this ^ 0xFFu; }
|
|
|
|
// Addition/subtraction are the same for signed and unsigned
|
|
simdjson_inline simd8<T> operator+(const simd8<T> other) const { return _mm_add_epi8(*this, other); }
|
|
simdjson_inline simd8<T> operator-(const simd8<T> other) const { return _mm_sub_epi8(*this, other); }
|
|
simdjson_inline simd8<T>& operator+=(const simd8<T> other) { *this = *this + other; return *static_cast<simd8<T>*>(this); }
|
|
simdjson_inline simd8<T>& operator-=(const simd8<T> other) { *this = *this - other; return *static_cast<simd8<T>*>(this); }
|
|
|
|
// Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values)
|
|
template<typename L>
|
|
simdjson_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
|
|
return _mm_shuffle_epi8(lookup_table, *this);
|
|
}
|
|
|
|
// Copies to 'output" all bytes corresponding to a 0 in the mask (interpreted as a bitset).
|
|
// Passing a 0 value for mask would be equivalent to writing out every byte to output.
|
|
// Only the first 16 - count_ones(mask) bytes of the result are significant but 16 bytes
|
|
// get written.
|
|
// Design consideration: it seems like a function with the
|
|
// signature simd8<L> compress(uint32_t mask) would be
|
|
// sensible, but the AVX ISA makes this kind of approach difficult.
|
|
template<typename L>
|
|
simdjson_inline void compress(uint16_t mask, L * output) const {
|
|
using internal::thintable_epi8;
|
|
using internal::BitsSetTable256mul2;
|
|
using internal::pshufb_combine_table;
|
|
// this particular implementation was inspired by work done by @animetosho
|
|
// we do it in two steps, first 8 bytes and then second 8 bytes
|
|
uint8_t mask1 = uint8_t(mask); // least significant 8 bits
|
|
uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits
|
|
// next line just loads the 64-bit values thintable_epi8[mask1] and
|
|
// thintable_epi8[mask2] into a 128-bit register, using only
|
|
// two instructions on most compilers.
|
|
__m128i shufmask = _mm_set_epi64x(thintable_epi8[mask2], thintable_epi8[mask1]);
|
|
// we increment by 0x08 the second half of the mask
|
|
shufmask =
|
|
_mm_add_epi8(shufmask, _mm_set_epi32(0x08080808, 0x08080808, 0, 0));
|
|
// this is the version "nearly pruned"
|
|
__m128i pruned = _mm_shuffle_epi8(*this, shufmask);
|
|
// we still need to put the two halves together.
|
|
// we compute the popcount of the first half:
|
|
int pop1 = BitsSetTable256mul2[mask1];
|
|
// then load the corresponding mask, what it does is to write
|
|
// only the first pop1 bytes from the first 8 bytes, and then
|
|
// it fills in with the bytes from the second 8 bytes + some filling
|
|
// at the end.
|
|
__m128i compactmask =
|
|
_mm_loadu_si128(reinterpret_cast<const __m128i *>(pshufb_combine_table + pop1 * 8));
|
|
__m128i answer = _mm_shuffle_epi8(pruned, compactmask);
|
|
_mm_storeu_si128(reinterpret_cast<__m128i *>(output), answer);
|
|
}
|
|
|
|
template<typename L>
|
|
simdjson_inline simd8<L> lookup_16(
|
|
L replace0, L replace1, L replace2, L replace3,
|
|
L replace4, L replace5, L replace6, L replace7,
|
|
L replace8, L replace9, L replace10, L replace11,
|
|
L replace12, L replace13, L replace14, L replace15) const {
|
|
return lookup_16(simd8<L>::repeat_16(
|
|
replace0, replace1, replace2, replace3,
|
|
replace4, replace5, replace6, replace7,
|
|
replace8, replace9, replace10, replace11,
|
|
replace12, replace13, replace14, replace15
|
|
));
|
|
}
|
|
};
|
|
|
|
// Signed bytes
|
|
template<>
|
|
struct simd8<int8_t> : base8_numeric<int8_t> {
|
|
simdjson_inline simd8() : base8_numeric<int8_t>() {}
|
|
simdjson_inline simd8(const __m128i _value) : base8_numeric<int8_t>(_value) {}
|
|
// Splat constructor
|
|
simdjson_inline simd8(int8_t _value) : simd8(splat(_value)) {}
|
|
// Array constructor
|
|
simdjson_inline simd8(const int8_t* values) : simd8(load(values)) {}
|
|
// Member-by-member initialization
|
|
simdjson_inline simd8(
|
|
int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
|
|
int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
|
|
) : simd8(_mm_setr_epi8(
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15
|
|
)) {}
|
|
// Repeat 16 values as many times as necessary (usually for lookup tables)
|
|
simdjson_inline static simd8<int8_t> repeat_16(
|
|
int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
|
|
int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
|
|
) {
|
|
return simd8<int8_t>(
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15
|
|
);
|
|
}
|
|
|
|
// Order-sensitive comparisons
|
|
simdjson_inline simd8<int8_t> max_val(const simd8<int8_t> other) const { return _mm_max_epi8(*this, other); }
|
|
simdjson_inline simd8<int8_t> min_val(const simd8<int8_t> other) const { return _mm_min_epi8(*this, other); }
|
|
simdjson_inline simd8<bool> operator>(const simd8<int8_t> other) const { return _mm_cmpgt_epi8(*this, other); }
|
|
simdjson_inline simd8<bool> operator<(const simd8<int8_t> other) const { return _mm_cmpgt_epi8(other, *this); }
|
|
};
|
|
|
|
// Unsigned bytes
|
|
template<>
|
|
struct simd8<uint8_t>: base8_numeric<uint8_t> {
|
|
simdjson_inline simd8() : base8_numeric<uint8_t>() {}
|
|
simdjson_inline simd8(const __m128i _value) : base8_numeric<uint8_t>(_value) {}
|
|
// Splat constructor
|
|
simdjson_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
|
|
// Array constructor
|
|
simdjson_inline simd8(const uint8_t* values) : simd8(load(values)) {}
|
|
// Member-by-member initialization
|
|
simdjson_inline simd8(
|
|
uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
|
|
uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
|
|
) : simd8(_mm_setr_epi8(
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15
|
|
)) {}
|
|
// Repeat 16 values as many times as necessary (usually for lookup tables)
|
|
simdjson_inline static simd8<uint8_t> repeat_16(
|
|
uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
|
|
uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
|
|
) {
|
|
return simd8<uint8_t>(
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15
|
|
);
|
|
}
|
|
|
|
// Saturated math
|
|
simdjson_inline simd8<uint8_t> saturating_add(const simd8<uint8_t> other) const { return _mm_adds_epu8(*this, other); }
|
|
simdjson_inline simd8<uint8_t> saturating_sub(const simd8<uint8_t> other) const { return _mm_subs_epu8(*this, other); }
|
|
|
|
// Order-specific operations
|
|
simdjson_inline simd8<uint8_t> max_val(const simd8<uint8_t> other) const { return _mm_max_epu8(*this, other); }
|
|
simdjson_inline simd8<uint8_t> min_val(const simd8<uint8_t> other) const { return _mm_min_epu8(*this, other); }
|
|
// Same as >, but only guarantees true is nonzero (< guarantees true = -1)
|
|
simdjson_inline simd8<uint8_t> gt_bits(const simd8<uint8_t> other) const { return this->saturating_sub(other); }
|
|
// Same as <, but only guarantees true is nonzero (< guarantees true = -1)
|
|
simdjson_inline simd8<uint8_t> lt_bits(const simd8<uint8_t> other) const { return other.saturating_sub(*this); }
|
|
simdjson_inline simd8<bool> operator<=(const simd8<uint8_t> other) const { return other.max_val(*this) == other; }
|
|
simdjson_inline simd8<bool> operator>=(const simd8<uint8_t> other) const { return other.min_val(*this) == other; }
|
|
simdjson_inline simd8<bool> operator>(const simd8<uint8_t> other) const { return this->gt_bits(other).any_bits_set(); }
|
|
simdjson_inline simd8<bool> operator<(const simd8<uint8_t> other) const { return this->gt_bits(other).any_bits_set(); }
|
|
|
|
// Bit-specific operations
|
|
simdjson_inline simd8<bool> bits_not_set() const { return *this == uint8_t(0); }
|
|
simdjson_inline simd8<bool> bits_not_set(simd8<uint8_t> bits) const { return (*this & bits).bits_not_set(); }
|
|
simdjson_inline simd8<bool> any_bits_set() const { return ~this->bits_not_set(); }
|
|
simdjson_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const { return ~this->bits_not_set(bits); }
|
|
simdjson_inline bool is_ascii() const { return _mm_movemask_epi8(*this) == 0; }
|
|
simdjson_inline bool bits_not_set_anywhere() const { return _mm_testz_si128(*this, *this); }
|
|
simdjson_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); }
|
|
simdjson_inline bool bits_not_set_anywhere(simd8<uint8_t> bits) const { return _mm_testz_si128(*this, bits); }
|
|
simdjson_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const { return !bits_not_set_anywhere(bits); }
|
|
template<int N>
|
|
simdjson_inline simd8<uint8_t> shr() const { return simd8<uint8_t>(_mm_srli_epi16(*this, N)) & uint8_t(0xFFu >> N); }
|
|
template<int N>
|
|
simdjson_inline simd8<uint8_t> shl() const { return simd8<uint8_t>(_mm_slli_epi16(*this, N)) & uint8_t(0xFFu << N); }
|
|
// Get one of the bits and make a bitmask out of it.
|
|
// e.g. value.get_bit<7>() gets the high bit
|
|
template<int N>
|
|
simdjson_inline int get_bit() const { return _mm_movemask_epi8(_mm_slli_epi16(*this, 7-N)); }
|
|
};
|
|
|
|
template<typename T>
|
|
struct simd8x64 {
|
|
static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
|
|
static_assert(NUM_CHUNKS == 4, "Westmere kernel should use four registers per 64-byte block.");
|
|
const simd8<T> chunks[NUM_CHUNKS];
|
|
|
|
simd8x64(const simd8x64<T>& o) = delete; // no copy allowed
|
|
simd8x64<T>& operator=(const simd8<T>& other) = delete; // no assignment allowed
|
|
simd8x64() = delete; // no default constructor allowed
|
|
|
|
simdjson_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1, const simd8<T> chunk2, const simd8<T> chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {}
|
|
simdjson_inline simd8x64(const T ptr[64]) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+16), simd8<T>::load(ptr+32), simd8<T>::load(ptr+48)} {}
|
|
|
|
simdjson_inline void store(T ptr[64]) const {
|
|
this->chunks[0].store(ptr+sizeof(simd8<T>)*0);
|
|
this->chunks[1].store(ptr+sizeof(simd8<T>)*1);
|
|
this->chunks[2].store(ptr+sizeof(simd8<T>)*2);
|
|
this->chunks[3].store(ptr+sizeof(simd8<T>)*3);
|
|
}
|
|
|
|
simdjson_inline simd8<T> reduce_or() const {
|
|
return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]);
|
|
}
|
|
|
|
simdjson_inline uint64_t compress(uint64_t mask, T * output) const {
|
|
this->chunks[0].compress(uint16_t(mask), output);
|
|
this->chunks[1].compress(uint16_t(mask >> 16), output + 16 - count_ones(mask & 0xFFFF));
|
|
this->chunks[2].compress(uint16_t(mask >> 32), output + 32 - count_ones(mask & 0xFFFFFFFF));
|
|
this->chunks[3].compress(uint16_t(mask >> 48), output + 48 - count_ones(mask & 0xFFFFFFFFFFFF));
|
|
return 64 - count_ones(mask);
|
|
}
|
|
|
|
simdjson_inline uint64_t to_bitmask() const {
|
|
uint64_t r0 = uint32_t(this->chunks[0].to_bitmask() );
|
|
uint64_t r1 = this->chunks[1].to_bitmask() ;
|
|
uint64_t r2 = this->chunks[2].to_bitmask() ;
|
|
uint64_t r3 = this->chunks[3].to_bitmask() ;
|
|
return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48);
|
|
}
|
|
|
|
simdjson_inline uint64_t eq(const T m) const {
|
|
const simd8<T> mask = simd8<T>::splat(m);
|
|
return simd8x64<bool>(
|
|
this->chunks[0] == mask,
|
|
this->chunks[1] == mask,
|
|
this->chunks[2] == mask,
|
|
this->chunks[3] == mask
|
|
).to_bitmask();
|
|
}
|
|
|
|
simdjson_inline uint64_t eq(const simd8x64<uint8_t> &other) const {
|
|
return simd8x64<bool>(
|
|
this->chunks[0] == other.chunks[0],
|
|
this->chunks[1] == other.chunks[1],
|
|
this->chunks[2] == other.chunks[2],
|
|
this->chunks[3] == other.chunks[3]
|
|
).to_bitmask();
|
|
}
|
|
|
|
simdjson_inline uint64_t lteq(const T m) const {
|
|
const simd8<T> mask = simd8<T>::splat(m);
|
|
return simd8x64<bool>(
|
|
this->chunks[0] <= mask,
|
|
this->chunks[1] <= mask,
|
|
this->chunks[2] <= mask,
|
|
this->chunks[3] <= mask
|
|
).to_bitmask();
|
|
}
|
|
}; // struct simd8x64<T>
|
|
|
|
} // namespace simd
|
|
} // unnamed namespace
|
|
} // namespace westmere
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_WESTMERE_SIMD_H
|
|
/* end file simdjson/westmere/simd.h */
|
|
|
|
namespace simdjson {
|
|
namespace westmere {
|
|
namespace {
|
|
|
|
using namespace simd;
|
|
|
|
// Holds backslashes and quotes locations.
|
|
struct backslash_and_quote {
|
|
public:
|
|
static constexpr uint32_t BYTES_PROCESSED = 32;
|
|
simdjson_inline static backslash_and_quote copy_and_find(const uint8_t *src, uint8_t *dst);
|
|
|
|
simdjson_inline bool has_quote_first() { return ((bs_bits - 1) & quote_bits) != 0; }
|
|
simdjson_inline bool has_backslash() { return bs_bits != 0; }
|
|
simdjson_inline int quote_index() { return trailing_zeroes(quote_bits); }
|
|
simdjson_inline int backslash_index() { return trailing_zeroes(bs_bits); }
|
|
|
|
uint32_t bs_bits;
|
|
uint32_t quote_bits;
|
|
}; // struct backslash_and_quote
|
|
|
|
// Copies 32 bytes from src to dst and returns the backslash/quote bitmasks
// for those bytes.
simdjson_inline backslash_and_quote backslash_and_quote::copy_and_find(const uint8_t *src, uint8_t *dst) {
  // this can read up to 31 bytes beyond the buffer size, but we require
  // SIMDJSON_PADDING of padding
  static_assert(SIMDJSON_PADDING >= (BYTES_PROCESSED - 1), "backslash and quote finder must process fewer than SIMDJSON_PADDING bytes");
  simd8<uint8_t> lower(src);
  simd8<uint8_t> upper(src + 16);
  lower.store(dst);
  upper.store(dst + 16);
  // Pack four 16-bit masks into one word: backslashes in the low 32 bits,
  // quotes in the high 32 bits.
  const simd8<bool> bs_lower = lower == '\\';
  const simd8<bool> bs_upper = upper == '\\';
  const simd8<bool> quote_lower = lower == '"';
  const simd8<bool> quote_upper = upper == '"';
  const uint64_t packed = simd8x64<bool>(bs_lower, bs_upper, quote_lower, quote_upper).to_bitmask();
  const uint32_t backslash_mask = uint32_t(packed);
  const uint32_t quote_mask = uint32_t(packed >> 32);
  return {
    backslash_mask, // bs_bits
    quote_mask      // quote_bits
  };
}
|
|
|
|
} // unnamed namespace
|
|
} // namespace westmere
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_WESTMERE_STRINGPARSING_DEFS_H
|
|
/* end file simdjson/westmere/stringparsing_defs.h */
|
|
/* end file simdjson/westmere/begin.h */
|
|
/* including simdjson/generic/amalgamated.h for westmere: #include "simdjson/generic/amalgamated.h" */
|
|
/* begin file simdjson/generic/amalgamated.h for westmere */
|
|
#if defined(SIMDJSON_CONDITIONAL_INCLUDE) && !defined(SIMDJSON_GENERIC_DEPENDENCIES_H)
|
|
#error simdjson/generic/dependencies.h must be included before simdjson/generic/amalgamated.h!
|
|
#endif
|
|
|
|
/* including simdjson/generic/base.h for westmere: #include "simdjson/generic/base.h" */
|
|
/* begin file simdjson/generic/base.h for westmere */
|
|
#ifndef SIMDJSON_GENERIC_BASE_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_GENERIC_BASE_H */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/base.h" */
|
|
/* amalgamation skipped (editor-only): // If we haven't got an implementation yet, we're in the editor, editing a generic file! Just */
|
|
/* amalgamation skipped (editor-only): // use the most advanced one we can so the most possible stuff can be tested. */
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_IMPLEMENTATION */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/implementation_detection.h" */
|
|
/* amalgamation skipped (editor-only): #if SIMDJSON_IMPLEMENTATION_ICELAKE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/icelake/begin.h" */
|
|
/* amalgamation skipped (editor-only): #elif SIMDJSON_IMPLEMENTATION_HASWELL */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/haswell/begin.h" */
|
|
/* amalgamation skipped (editor-only): #elif SIMDJSON_IMPLEMENTATION_WESTMERE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/westmere/begin.h" */
|
|
/* amalgamation skipped (editor-only): #elif SIMDJSON_IMPLEMENTATION_ARM64 */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/arm64/begin.h" */
|
|
/* amalgamation skipped (editor-only): #elif SIMDJSON_IMPLEMENTATION_PPC64 */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/ppc64/begin.h" */
|
|
/* amalgamation skipped (editor-only): #elif SIMDJSON_IMPLEMENTATION_LSX */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/lsx/begin.h" */
|
|
/* amalgamation skipped (editor-only): #elif SIMDJSON_IMPLEMENTATION_LASX */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/lasx/begin.h" */
|
|
/* amalgamation skipped (editor-only): #elif SIMDJSON_IMPLEMENTATION_FALLBACK */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/fallback/begin.h" */
|
|
/* amalgamation skipped (editor-only): #else */
|
|
/* amalgamation skipped (editor-only): #error "All possible implementations (including fallback) have been disabled! simdjson will not run." */
|
|
/* amalgamation skipped (editor-only): #endif */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_IMPLEMENTATION */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace westmere {
|
|
|
|
struct open_container;
|
|
class dom_parser_implementation;
|
|
|
|
/**
 * The type of a JSON number
 */
enum class number_type {
  /// a binary64 number
  floating_point_number = 1,
  /// a signed integer that fits in a 64-bit word using two's complement
  signed_integer = 2,
  /// a positive integer larger or equal to 1<<63
  unsigned_integer = 3,
  /// a big integer that does not fit in a 64-bit word
  big_integer = 4
};
|
|
|
|
} // namespace westmere
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_GENERIC_BASE_H
|
|
/* end file simdjson/generic/base.h for westmere */
|
|
/* including simdjson/generic/jsoncharutils.h for westmere: #include "simdjson/generic/jsoncharutils.h" */
|
|
/* begin file simdjson/generic/jsoncharutils.h for westmere */
|
|
#ifndef SIMDJSON_GENERIC_JSONCHARUTILS_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_GENERIC_JSONCHARUTILS_H */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/generic/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/jsoncharutils_tables.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/numberparsing_tables.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace westmere {
|
|
namespace {
|
|
namespace jsoncharutils {
|
|
|
|
// return non-zero if not a structural or whitespace char
|
|
// zero otherwise
|
|
simdjson_inline uint32_t is_not_structural_or_whitespace(uint8_t c) {
|
|
return internal::structural_or_whitespace_negated[c];
|
|
}
|
|
|
|
// Table lookup in internal::structural_or_whitespace, indexed by the byte c.
simdjson_inline uint32_t is_structural_or_whitespace(uint8_t c) {
  const uint32_t flag = internal::structural_or_whitespace[c];
  return flag;
}
|
|
|
|
// returns a value with the high 16 bits set if not valid
|
|
// otherwise returns the conversion of the 4 hex digits at src into the bottom
|
|
// 16 bits of the 32-bit return register
|
|
//
|
|
// see
|
|
// https://lemire.me/blog/2019/04/17/parsing-short-hexadecimal-strings-efficiently/
|
|
static inline uint32_t hex_to_u32_nocheck(
|
|
const uint8_t *src) { // strictly speaking, static inline is a C-ism
|
|
uint32_t v1 = internal::digit_to_val32[630 + src[0]];
|
|
uint32_t v2 = internal::digit_to_val32[420 + src[1]];
|
|
uint32_t v3 = internal::digit_to_val32[210 + src[2]];
|
|
uint32_t v4 = internal::digit_to_val32[0 + src[3]];
|
|
return v1 | v2 | v3 | v4;
|
|
}
|
|
|
|
// given a code point cp, writes to c
|
|
// the utf-8 code, outputting the length in
|
|
// bytes, if the length is zero, the code point
|
|
// is invalid
|
|
//
|
|
// This can possibly be made faster using pdep
|
|
// and clz and table lookups, but JSON documents
|
|
// have few escaped code points, and the following
|
|
// function looks cheap.
|
|
//
|
|
// Note: we assume that surrogates are treated separately
|
|
//
|
|
simdjson_inline size_t codepoint_to_utf8(uint32_t cp, uint8_t *c) {
|
|
if (cp <= 0x7F) {
|
|
c[0] = uint8_t(cp);
|
|
return 1; // ascii
|
|
}
|
|
if (cp <= 0x7FF) {
|
|
c[0] = uint8_t((cp >> 6) + 192);
|
|
c[1] = uint8_t((cp & 63) + 128);
|
|
return 2; // universal plane
|
|
// Surrogates are treated elsewhere...
|
|
//} //else if (0xd800 <= cp && cp <= 0xdfff) {
|
|
// return 0; // surrogates // could put assert here
|
|
} else if (cp <= 0xFFFF) {
|
|
c[0] = uint8_t((cp >> 12) + 224);
|
|
c[1] = uint8_t(((cp >> 6) & 63) + 128);
|
|
c[2] = uint8_t((cp & 63) + 128);
|
|
return 3;
|
|
} else if (cp <= 0x10FFFF) { // if you know you have a valid code point, this
|
|
// is not needed
|
|
c[0] = uint8_t((cp >> 18) + 240);
|
|
c[1] = uint8_t(((cp >> 12) & 63) + 128);
|
|
c[2] = uint8_t(((cp >> 6) & 63) + 128);
|
|
c[3] = uint8_t((cp & 63) + 128);
|
|
return 4;
|
|
}
|
|
// will return 0 when the code point was too large.
|
|
return 0; // bad r
|
|
}
|
|
|
|
#if SIMDJSON_IS_32BITS // _umul128 for x86, arm
// this is a slow emulation routine for 32-bit
//
// Full 32x32 -> 64-bit unsigned product.
static simdjson_inline uint64_t __emulu(uint32_t x, uint32_t y) {
  return x * (uint64_t)y;
}
// Full 64x64 -> 128-bit unsigned product built from four 32x32 partial
// products. Writing ab = (a:b) and cd = (c:d) as 32-bit halves, the low
// 64 bits of ab * cd are returned and the high 64 bits are stored in *hi.
static simdjson_inline uint64_t _umul128(uint64_t ab, uint64_t cd, uint64_t *hi) {
  uint64_t ad = __emulu((uint32_t)(ab >> 32), (uint32_t)cd);
  uint64_t bd = __emulu((uint32_t)ab, (uint32_t)cd);
  uint64_t adbc = ad + __emulu((uint32_t)ab, (uint32_t)(cd >> 32));
  // the sum of the two middle products may wrap around 64 bits
  uint64_t adbc_carry = !!(adbc < ad);
  uint64_t lo = bd + (adbc << 32);
  // the last term is the carry out of the low word (lo wrapped iff lo < bd)
  *hi = __emulu((uint32_t)(ab >> 32), (uint32_t)(cd >> 32)) + (adbc >> 32) +
        (adbc_carry << 32) + !!(lo < bd);
  return lo;
}
#endif
|
|
|
|
} // namespace jsoncharutils
|
|
} // unnamed namespace
|
|
} // namespace westmere
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_GENERIC_JSONCHARUTILS_H
|
|
/* end file simdjson/generic/jsoncharutils.h for westmere */
|
|
/* including simdjson/generic/atomparsing.h for westmere: #include "simdjson/generic/atomparsing.h" */
|
|
/* begin file simdjson/generic/atomparsing.h for westmere */
|
|
#ifndef SIMDJSON_GENERIC_ATOMPARSING_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_GENERIC_ATOMPARSING_H */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/generic/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/generic/jsoncharutils.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
#include <cstring>
|
|
|
|
namespace simdjson {
|
|
namespace westmere {
|
|
namespace {
|
|
/// @private
|
|
namespace atomparsing {
|
|
|
|
// The string_to_uint32 is exclusively used to map literal strings to 32-bit values.
|
|
// We use memcpy instead of a pointer cast to avoid undefined behaviors since we cannot
|
|
// be certain that the character pointer will be properly aligned.
|
|
// You might think that using memcpy makes this function expensive, but you'd be wrong.
|
|
// All decent optimizing compilers (GCC, clang, Visual Studio) will compile string_to_uint32("false");
|
|
// to the compile-time constant 1936482662.
|
|
simdjson_inline uint32_t string_to_uint32(const char* str) { uint32_t val; std::memcpy(&val, str, sizeof(uint32_t)); return val; }
|
|
|
|
|
|
// Again in str4ncmp we use a memcpy to avoid undefined behavior. The memcpy may appear expensive.
|
|
// Yet all decent optimizing compilers will compile memcpy to a single instruction, just about.
|
|
simdjson_warn_unused
|
|
simdjson_inline uint32_t str4ncmp(const uint8_t *src, const char* atom) {
|
|
uint32_t srcval; // we want to avoid unaligned 32-bit loads (undefined in C/C++)
|
|
static_assert(sizeof(uint32_t) <= SIMDJSON_PADDING, "SIMDJSON_PADDING must be larger than 4 bytes");
|
|
std::memcpy(&srcval, src, sizeof(uint32_t));
|
|
return srcval ^ string_to_uint32(atom);
|
|
}
|
|
|
|
// Returns true when src starts with "true" and the byte that follows (src[4])
// is a structural or whitespace character. Reads 5 bytes from src.
// Both checks are OR-ed together so the test is a single comparison with zero.
simdjson_warn_unused
simdjson_inline bool is_valid_true_atom(const uint8_t *src) {
  return (str4ncmp(src, "true") | jsoncharutils::is_not_structural_or_whitespace(src[4])) == 0;
}
|
|
|
|
// Length-aware variant of is_valid_true_atom.
//  - len > 4:  defers to the single-argument overload, which also inspects src[4].
//  - len == 4: only the four bytes are compared with "true".
//  - len < 4:  cannot be "true"; returns false.
simdjson_warn_unused
simdjson_inline bool is_valid_true_atom(const uint8_t *src, size_t len) {
  if (len > 4) { return is_valid_true_atom(src); }
  else if (len == 4) { return !str4ncmp(src, "true"); }
  else { return false; }
}
|
|
|
|
// Returns true when src[1..4] is "alse" and the byte that follows (src[5])
// is a structural or whitespace character. Reads src[1]..src[5].
// src[0] is not examined here.
// NOTE(review): presumably the caller has already matched the leading 'f' -- confirm.
simdjson_warn_unused
simdjson_inline bool is_valid_false_atom(const uint8_t *src) {
  return (str4ncmp(src+1, "alse") | jsoncharutils::is_not_structural_or_whitespace(src[5])) == 0;
}
|
|
|
|
// Length-aware variant of is_valid_false_atom.
//  - len > 5:  defers to the single-argument overload, which also inspects src[5].
//  - len == 5: only src[1..4] is compared with "alse" (src[0] is not examined).
//  - len < 5:  cannot be "false"; returns false.
simdjson_warn_unused
simdjson_inline bool is_valid_false_atom(const uint8_t *src, size_t len) {
  if (len > 5) { return is_valid_false_atom(src); }
  else if (len == 5) { return !str4ncmp(src+1, "alse"); }
  else { return false; }
}
|
|
|
|
// Returns true when src starts with "null" and the byte that follows (src[4])
// is a structural or whitespace character. Reads 5 bytes from src.
simdjson_warn_unused
simdjson_inline bool is_valid_null_atom(const uint8_t *src) {
  return (str4ncmp(src, "null") | jsoncharutils::is_not_structural_or_whitespace(src[4])) == 0;
}
|
|
|
|
// Length-aware variant of is_valid_null_atom.
//  - len > 4:  defers to the single-argument overload, which also inspects src[4].
//  - len == 4: only the four bytes are compared with "null".
//  - len < 4:  cannot be "null"; returns false.
simdjson_warn_unused
simdjson_inline bool is_valid_null_atom(const uint8_t *src, size_t len) {
  if (len > 4) { return is_valid_null_atom(src); }
  else if (len == 4) { return !str4ncmp(src, "null"); }
  else { return false; }
}
|
|
|
|
} // namespace atomparsing
|
|
} // unnamed namespace
|
|
} // namespace westmere
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_GENERIC_ATOMPARSING_H
|
|
/* end file simdjson/generic/atomparsing.h for westmere */
|
|
/* including simdjson/generic/dom_parser_implementation.h for westmere: #include "simdjson/generic/dom_parser_implementation.h" */
|
|
/* begin file simdjson/generic/dom_parser_implementation.h for westmere */
|
|
#ifndef SIMDJSON_GENERIC_DOM_PARSER_IMPLEMENTATION_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_GENERIC_DOM_PARSER_IMPLEMENTATION_H */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/generic/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/dom_parser_implementation.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace westmere {
|
|
|
|
// Bookkeeping record for one currently open scope ([ or {).
// expectation: sizeof(open_container) = 64/8.
struct open_container {
  uint32_t tape_index; // where, on the tape, the scope ([ or {) begins
  uint32_t count;      // how many elements are in the scope
}; // struct open_container

static_assert(sizeof(open_container) == 64/8, "Open container must be 64 bits");
|
|
|
|
class dom_parser_implementation final : public internal::dom_parser_implementation {
|
|
public:
|
|
/** Tape location of each open { or [ */
|
|
std::unique_ptr<open_container[]> open_containers{};
|
|
/** Whether each open container is a [ or { */
|
|
std::unique_ptr<bool[]> is_array{};
|
|
/** Buffer passed to stage 1 */
|
|
const uint8_t *buf{};
|
|
/** Length passed to stage 1 */
|
|
size_t len{0};
|
|
/** Document passed to stage 2 */
|
|
dom::document *doc{};
|
|
|
|
inline dom_parser_implementation() noexcept;
|
|
inline dom_parser_implementation(dom_parser_implementation &&other) noexcept;
|
|
inline dom_parser_implementation &operator=(dom_parser_implementation &&other) noexcept;
|
|
dom_parser_implementation(const dom_parser_implementation &) = delete;
|
|
dom_parser_implementation &operator=(const dom_parser_implementation &) = delete;
|
|
|
|
simdjson_warn_unused error_code parse(const uint8_t *buf, size_t len, dom::document &doc) noexcept final;
|
|
simdjson_warn_unused error_code stage1(const uint8_t *buf, size_t len, stage1_mode partial) noexcept final;
|
|
simdjson_warn_unused error_code stage2(dom::document &doc) noexcept final;
|
|
simdjson_warn_unused error_code stage2_next(dom::document &doc) noexcept final;
|
|
simdjson_warn_unused uint8_t *parse_string(const uint8_t *src, uint8_t *dst, bool allow_replacement) const noexcept final;
|
|
simdjson_warn_unused uint8_t *parse_wobbly_string(const uint8_t *src, uint8_t *dst) const noexcept final;
|
|
inline simdjson_warn_unused error_code set_capacity(size_t capacity) noexcept final;
|
|
inline simdjson_warn_unused error_code set_max_depth(size_t max_depth) noexcept final;
|
|
private:
|
|
simdjson_inline simdjson_warn_unused error_code set_capacity_stage1(size_t capacity);
|
|
|
|
};
|
|
|
|
} // namespace westmere
|
|
} // namespace simdjson
|
|
|
|
namespace simdjson {
|
|
namespace westmere {
|
|
|
|
// Out-of-class defaulted special members: default construction, move
// construction and move assignment use the compiler-generated member-wise behavior.
inline dom_parser_implementation::dom_parser_implementation() noexcept = default;
inline dom_parser_implementation::dom_parser_implementation(dom_parser_implementation &&other) noexcept = default;
inline dom_parser_implementation &dom_parser_implementation::operator=(dom_parser_implementation &&other) noexcept = default;
|
|
|
|
// Leaving these here so they can be inlined if so desired
|
|
inline simdjson_warn_unused error_code dom_parser_implementation::set_capacity(size_t capacity) noexcept {
|
|
if(capacity > SIMDJSON_MAXSIZE_BYTES) { return CAPACITY; }
|
|
// Stage 1 index output
|
|
size_t max_structures = SIMDJSON_ROUNDUP_N(capacity, 64) + 2 + 7;
|
|
structural_indexes.reset( new (std::nothrow) uint32_t[max_structures] );
|
|
if (!structural_indexes) { _capacity = 0; return MEMALLOC; }
|
|
structural_indexes[0] = 0;
|
|
n_structural_indexes = 0;
|
|
|
|
_capacity = capacity;
|
|
return SUCCESS;
|
|
}
|
|
|
|
/**
 * (Re)allocates the stage 2 stacks, one entry per nesting level.
 *
 * @param max_depth Number of entries to allocate in open_containers and is_array.
 * @return MEMALLOC if either allocation fails (in which case _max_depth is
 *         reset to 0); SUCCESS otherwise.
 */
inline simdjson_warn_unused error_code dom_parser_implementation::set_max_depth(size_t max_depth) noexcept {
  // Stage 2 stacks
  // nothrow new: failure is reported as a null pointer rather than an exception
  open_containers.reset(new (std::nothrow) open_container[max_depth]);
  is_array.reset(new (std::nothrow) bool[max_depth]);
  if (!is_array || !open_containers) { _max_depth = 0; return MEMALLOC; }

  _max_depth = max_depth;
  return SUCCESS;
}
|
|
|
|
} // namespace westmere
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_GENERIC_DOM_PARSER_IMPLEMENTATION_H
|
|
/* end file simdjson/generic/dom_parser_implementation.h for westmere */
|
|
/* including simdjson/generic/implementation_simdjson_result_base.h for westmere: #include "simdjson/generic/implementation_simdjson_result_base.h" */
|
|
/* begin file simdjson/generic/implementation_simdjson_result_base.h for westmere */
|
|
#ifndef SIMDJSON_GENERIC_IMPLEMENTATION_SIMDJSON_RESULT_BASE_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_GENERIC_IMPLEMENTATION_SIMDJSON_RESULT_BASE_H */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/generic/base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace westmere {
|
|
|
|
// This is a near copy of include/error.h's implementation_simdjson_result_base, except it doesn't use std::pair
|
|
// so we can avoid inlining errors
|
|
// TODO reconcile these!
|
|
/**
|
|
* The result of a simdjson operation that could fail.
|
|
*
|
|
* Gives the option of reading error codes, or throwing an exception by casting to the desired result.
|
|
*
|
|
* This is a base class for implementations that want to add functions to the result type for
|
|
* chaining.
|
|
*
|
|
* Override like:
|
|
*
|
|
* struct simdjson_result<T> : public internal::implementation_simdjson_result_base<T> {
|
|
* simdjson_result() noexcept : internal::implementation_simdjson_result_base<T>() {}
|
|
* simdjson_result(error_code error) noexcept : internal::implementation_simdjson_result_base<T>(error) {}
|
|
* simdjson_result(T &&value) noexcept : internal::implementation_simdjson_result_base<T>(std::forward(value)) {}
|
|
* simdjson_result(T &&value, error_code error) noexcept : internal::implementation_simdjson_result_base<T>(value, error) {}
|
|
* // Your extra methods here
|
|
* }
|
|
*
|
|
* Then any method returning simdjson_result<T> will be chainable with your methods.
|
|
*/
|
|
template<typename T>
|
|
struct implementation_simdjson_result_base {
|
|
|
|
/**
|
|
* Create a new empty result with error = UNINITIALIZED.
|
|
*/
|
|
simdjson_inline implementation_simdjson_result_base() noexcept = default;
|
|
|
|
/**
|
|
* Create a new error result.
|
|
*/
|
|
simdjson_inline implementation_simdjson_result_base(error_code error) noexcept;
|
|
|
|
/**
|
|
* Create a new successful result.
|
|
*/
|
|
simdjson_inline implementation_simdjson_result_base(T &&value) noexcept;
|
|
|
|
/**
|
|
* Create a new result with both things (use if you don't want to branch when creating the result).
|
|
*/
|
|
simdjson_inline implementation_simdjson_result_base(T &&value, error_code error) noexcept;
|
|
|
|
/**
|
|
* Move the value and the error to the provided variables.
|
|
*
|
|
* @param value The variable to assign the value to. May not be set if there is an error.
|
|
* @param error The variable to assign the error to. Set to SUCCESS if there is no error.
|
|
*/
|
|
simdjson_inline void tie(T &value, error_code &error) && noexcept;
|
|
|
|
/**
|
|
* Move the value to the provided variable.
|
|
*
|
|
* @param value The variable to assign the value to. May not be set if there is an error.
|
|
*/
|
|
simdjson_inline error_code get(T &value) && noexcept;
|
|
|
|
/**
|
|
* The error.
|
|
*/
|
|
simdjson_inline error_code error() const noexcept;
|
|
|
|
#if SIMDJSON_EXCEPTIONS
|
|
|
|
/**
|
|
* Get the result value.
|
|
*
|
|
* @throw simdjson_error if there was an error.
|
|
*/
|
|
simdjson_inline T& value() & noexcept(false);
|
|
|
|
/**
|
|
* Take the result value (move it).
|
|
*
|
|
* @throw simdjson_error if there was an error.
|
|
*/
|
|
simdjson_inline T&& value() && noexcept(false);
|
|
|
|
/**
|
|
* Take the result value (move it).
|
|
*
|
|
* @throw simdjson_error if there was an error.
|
|
*/
|
|
simdjson_inline T&& take_value() && noexcept(false);
|
|
|
|
/**
|
|
* Cast to the value (will throw on error).
|
|
*
|
|
* @throw simdjson_error if there was an error.
|
|
*/
|
|
simdjson_inline operator T&&() && noexcept(false);
|
|
|
|
|
|
#endif // SIMDJSON_EXCEPTIONS
|
|
|
|
/**
|
|
* Get the result value. This function is safe if and only
|
|
* the error() method returns a value that evaluates to false.
|
|
*/
|
|
simdjson_inline const T& value_unsafe() const& noexcept;
|
|
/**
|
|
* Get the result value. This function is safe if and only
|
|
* the error() method returns a value that evaluates to false.
|
|
*/
|
|
simdjson_inline T& value_unsafe() & noexcept;
|
|
/**
|
|
* Take the result value (move it). This function is safe if and only
|
|
* the error() method returns a value that evaluates to false.
|
|
*/
|
|
simdjson_inline T&& value_unsafe() && noexcept;
|
|
protected:
|
|
/** users should never directly access first and second. **/
|
|
T first{}; /** Users should never directly access 'first'. **/
|
|
error_code second{UNINITIALIZED}; /** Users should never directly access 'second'. **/
|
|
}; // struct implementation_simdjson_result_base
|
|
|
|
} // namespace westmere
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_GENERIC_IMPLEMENTATION_SIMDJSON_RESULT_BASE_H
|
|
/* end file simdjson/generic/implementation_simdjson_result_base.h for westmere */
|
|
/* including simdjson/generic/numberparsing.h for westmere: #include "simdjson/generic/numberparsing.h" */
|
|
/* begin file simdjson/generic/numberparsing.h for westmere */
|
|
#ifndef SIMDJSON_GENERIC_NUMBERPARSING_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_GENERIC_NUMBERPARSING_H */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/generic/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/generic/jsoncharutils.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/numberparsing_tables.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
#include <limits>
|
|
#include <ostream>
|
|
#include <cstring>
|
|
|
|
namespace simdjson {
|
|
namespace westmere {
|
|
namespace numberparsing {
|
|
|
|
// Reporting/writing hooks used by the number parser.
// With JSON_TEST_NUMBERS defined, each macro first calls the matching
// found_* callback (found_invalid_number, found_integer, found_unsigned_integer,
// found_float) and then, via the comma operator, yields the normal result.
// Without it, the macros expand to the normal result only.
#ifdef JSON_TEST_NUMBERS
#define INVALID_NUMBER(SRC) (found_invalid_number((SRC)), NUMBER_ERROR)
#define WRITE_INTEGER(VALUE, SRC, WRITER) (found_integer((VALUE), (SRC)), (WRITER).append_s64((VALUE)))
#define WRITE_UNSIGNED(VALUE, SRC, WRITER) (found_unsigned_integer((VALUE), (SRC)), (WRITER).append_u64((VALUE)))
#define WRITE_DOUBLE(VALUE, SRC, WRITER) (found_float((VALUE), (SRC)), (WRITER).append_double((VALUE)))
#define BIGINT_NUMBER(SRC) (found_invalid_number((SRC)), BIGINT_ERROR)
#else
#define INVALID_NUMBER(SRC) (NUMBER_ERROR)
#define WRITE_INTEGER(VALUE, SRC, WRITER) (WRITER).append_s64((VALUE))
#define WRITE_UNSIGNED(VALUE, SRC, WRITER) (WRITER).append_u64((VALUE))
#define WRITE_DOUBLE(VALUE, SRC, WRITER) (WRITER).append_double((VALUE))
#define BIGINT_NUMBER(SRC) (BIGINT_ERROR)
#endif
|
|
|
|
namespace {
|
|
|
|
// Convert a mantissa, an exponent and a sign bit into an ieee64 double.
|
|
// The real_exponent needs to be in [0, 2046] (technically real_exponent = 2047 would be acceptable).
|
|
// The mantissa should be in [0,1<<53). The bit at index (1ULL << 52) while be zeroed.
|
|
simdjson_inline double to_double(uint64_t mantissa, uint64_t real_exponent, bool negative) {
|
|
double d;
|
|
mantissa &= ~(1ULL << 52);
|
|
mantissa |= real_exponent << 52;
|
|
mantissa |= ((static_cast<uint64_t>(negative)) << 63);
|
|
std::memcpy(&d, &mantissa, sizeof(d));
|
|
return d;
|
|
}
|
|
|
|
// Attempts to compute i * 10^(power) exactly; and if "negative" is
|
|
// true, negate the result.
|
|
// This function will only work in some cases, when it does not work, success is
|
|
// set to false. This should work *most of the time* (like 99% of the time).
|
|
// We assume that power is in the [smallest_power,
|
|
// largest_power] interval: the caller is responsible for this check.
|
|
simdjson_inline bool compute_float_64(int64_t power, uint64_t i, bool negative, double &d) {
|
|
// we start with a fast path
|
|
// It was described in
|
|
// Clinger WD. How to read floating point numbers accurately.
|
|
// ACM SIGPLAN Notices. 1990
|
|
#ifndef FLT_EVAL_METHOD
|
|
#error "FLT_EVAL_METHOD should be defined, please include cfloat."
|
|
#endif
|
|
#if (FLT_EVAL_METHOD != 1) && (FLT_EVAL_METHOD != 0)
|
|
// We cannot be certain that x/y is rounded to nearest.
|
|
if (0 <= power && power <= 22 && i <= 9007199254740991)
|
|
#else
|
|
if (-22 <= power && power <= 22 && i <= 9007199254740991)
|
|
#endif
|
|
{
|
|
// convert the integer into a double. This is lossless since
|
|
// 0 <= i <= 2^53 - 1.
|
|
d = double(i);
|
|
//
|
|
// The general idea is as follows.
|
|
// If 0 <= s < 2^53 and if 10^0 <= p <= 10^22 then
|
|
// 1) Both s and p can be represented exactly as 64-bit floating-point
|
|
// values
|
|
// (binary64).
|
|
// 2) Because s and p can be represented exactly as floating-point values,
|
|
// then s * p
|
|
// and s / p will produce correctly rounded values.
|
|
//
|
|
if (power < 0) {
|
|
d = d / simdjson::internal::power_of_ten[-power];
|
|
} else {
|
|
d = d * simdjson::internal::power_of_ten[power];
|
|
}
|
|
if (negative) {
|
|
d = -d;
|
|
}
|
|
return true;
|
|
}
|
|
// When 22 < power && power < 22 + 16, we could
|
|
// hope for another, secondary fast path. It was
|
|
// described by David M. Gay in "Correctly rounded
|
|
// binary-decimal and decimal-binary conversions." (1990)
|
|
// If you need to compute i * 10^(22 + x) for x < 16,
|
|
// first compute i * 10^x, if you know that result is exact
|
|
// (e.g., when i * 10^x < 2^53),
|
|
// then you can still proceed and do (i * 10^x) * 10^22.
|
|
// Is this worth your time?
|
|
// You need 22 < power *and* power < 22 + 16 *and* (i * 10^(x-22) < 2^53)
|
|
// for this second fast path to work.
|
|
// If you you have 22 < power *and* power < 22 + 16, and then you
|
|
// optimistically compute "i * 10^(x-22)", there is still a chance that you
|
|
// have wasted your time if i * 10^(x-22) >= 2^53. It makes the use cases of
|
|
// this optimization maybe less common than we would like. Source:
|
|
// http://www.exploringbinary.com/fast-path-decimal-to-floating-point-conversion/
|
|
// also used in RapidJSON: https://rapidjson.org/strtod_8h_source.html
|
|
|
|
// The fast path has now failed, so we are failing back on the slower path.
|
|
|
|
// In the slow path, we need to adjust i so that it is > 1<<63 which is always
|
|
// possible, except if i == 0, so we handle i == 0 separately.
|
|
if(i == 0) {
|
|
d = negative ? -0.0 : 0.0;
|
|
return true;
|
|
}
|
|
|
|
|
|
// The exponent is 1024 + 63 + power
|
|
// + floor(log(5**power)/log(2)).
|
|
// The 1024 comes from the ieee64 standard.
|
|
// The 63 comes from the fact that we use a 64-bit word.
|
|
//
|
|
// Computing floor(log(5**power)/log(2)) could be
|
|
// slow. Instead we use a fast function.
|
|
//
|
|
// For power in (-400,350), we have that
|
|
// (((152170 + 65536) * power ) >> 16);
|
|
// is equal to
|
|
// floor(log(5**power)/log(2)) + power when power >= 0
|
|
// and it is equal to
|
|
// ceil(log(5**-power)/log(2)) + power when power < 0
|
|
//
|
|
// The 65536 is (1<<16) and corresponds to
|
|
// (65536 * power) >> 16 ---> power
|
|
//
|
|
// ((152170 * power ) >> 16) is equal to
|
|
// floor(log(5**power)/log(2))
|
|
//
|
|
// Note that this is not magic: 152170/(1<<16) is
|
|
// approximatively equal to log(5)/log(2).
|
|
// The 1<<16 value is a power of two; we could use a
|
|
// larger power of 2 if we wanted to.
|
|
//
|
|
int64_t exponent = (((152170 + 65536) * power) >> 16) + 1024 + 63;
|
|
|
|
|
|
// We want the most significant bit of i to be 1. Shift if needed.
|
|
int lz = leading_zeroes(i);
|
|
i <<= lz;
|
|
|
|
|
|
// We are going to need to do some 64-bit arithmetic to get a precise product.
|
|
// We use a table lookup approach.
|
|
// It is safe because
|
|
// power >= smallest_power
|
|
// and power <= largest_power
|
|
// We recover the mantissa of the power, it has a leading 1. It is always
|
|
// rounded down.
|
|
//
|
|
// We want the most significant 64 bits of the product. We know
|
|
// this will be non-zero because the most significant bit of i is
|
|
// 1.
|
|
const uint32_t index = 2 * uint32_t(power - simdjson::internal::smallest_power);
|
|
// Optimization: It may be that materializing the index as a variable might confuse some compilers and prevent effective complex-addressing loads. (Done for code clarity.)
|
|
//
|
|
// The full_multiplication function computes the 128-bit product of two 64-bit words
|
|
// with a returned value of type value128 with a "low component" corresponding to the
|
|
// 64-bit least significant bits of the product and with a "high component" corresponding
|
|
// to the 64-bit most significant bits of the product.
|
|
simdjson::internal::value128 firstproduct = full_multiplication(i, simdjson::internal::power_of_five_128[index]);
|
|
// Both i and power_of_five_128[index] have their most significant bit set to 1 which
|
|
// implies that the either the most or the second most significant bit of the product
|
|
// is 1. We pack values in this manner for efficiency reasons: it maximizes the use
|
|
// we make of the product. It also makes it easy to reason about the product: there
|
|
// is 0 or 1 leading zero in the product.
|
|
|
|
// Unless the least significant 9 bits of the high (64-bit) part of the full
|
|
// product are all 1s, then we know that the most significant 55 bits are
|
|
// exact and no further work is needed. Having 55 bits is necessary because
|
|
// we need 53 bits for the mantissa but we have to have one rounding bit and
|
|
// we can waste a bit if the most significant bit of the product is zero.
|
|
if((firstproduct.high & 0x1FF) == 0x1FF) {
|
|
// We want to compute i * 5^q, but only care about the top 55 bits at most.
|
|
// Consider the scenario where q>=0. Then 5^q may not fit in 64-bits. Doing
|
|
// the full computation is wasteful. So we do what is called a "truncated
|
|
// multiplication".
|
|
// We take the most significant 64-bits, and we put them in
|
|
// power_of_five_128[index]. Usually, that's good enough to approximate i * 5^q
|
|
// to the desired approximation using one multiplication. Sometimes it does not suffice.
|
|
// Then we store the next most significant 64 bits in power_of_five_128[index + 1], and
|
|
// then we get a better approximation to i * 5^q.
|
|
//
|
|
// That's for when q>=0. The logic for q<0 is somewhat similar but it is somewhat
|
|
// more complicated.
|
|
//
|
|
// There is an extra layer of complexity in that we need more than 55 bits of
|
|
// accuracy in the round-to-even scenario.
|
|
//
|
|
// The full_multiplication function computes the 128-bit product of two 64-bit words
|
|
// with a returned value of type value128 with a "low component" corresponding to the
|
|
// 64-bit least significant bits of the product and with a "high component" corresponding
|
|
// to the 64-bit most significant bits of the product.
|
|
simdjson::internal::value128 secondproduct = full_multiplication(i, simdjson::internal::power_of_five_128[index + 1]);
|
|
firstproduct.low += secondproduct.high;
|
|
if(secondproduct.high > firstproduct.low) { firstproduct.high++; }
|
|
// As it has been proven by Noble Mushtak and Daniel Lemire in "Fast Number Parsing Without
|
|
// Fallback" (https://arxiv.org/abs/2212.06644), at this point we are sure that the product
|
|
// is sufficiently accurate, and more computation is not needed.
|
|
}
|
|
uint64_t lower = firstproduct.low;
|
|
uint64_t upper = firstproduct.high;
|
|
// The final mantissa should be 53 bits with a leading 1.
|
|
// We shift it so that it occupies 54 bits with a leading 1.
|
|
///////
|
|
uint64_t upperbit = upper >> 63;
|
|
uint64_t mantissa = upper >> (upperbit + 9);
|
|
lz += int(1 ^ upperbit);
|
|
|
|
// Here we have mantissa < (1<<54).
|
|
int64_t real_exponent = exponent - lz;
|
|
if (simdjson_unlikely(real_exponent <= 0)) { // we have a subnormal?
|
|
// Here have that real_exponent <= 0 so -real_exponent >= 0
|
|
if(-real_exponent + 1 >= 64) { // if we have more than 64 bits below the minimum exponent, you have a zero for sure.
|
|
d = negative ? -0.0 : 0.0;
|
|
return true;
|
|
}
|
|
// next line is safe because -real_exponent + 1 < 0
|
|
mantissa >>= -real_exponent + 1;
|
|
// Thankfully, we can't have both "round-to-even" and subnormals because
|
|
// "round-to-even" only occurs for powers close to 0.
|
|
mantissa += (mantissa & 1); // round up
|
|
mantissa >>= 1;
|
|
// There is a weird scenario where we don't have a subnormal but just.
|
|
// Suppose we start with 2.2250738585072013e-308, we end up
|
|
// with 0x3fffffffffffff x 2^-1023-53 which is technically subnormal
|
|
// whereas 0x40000000000000 x 2^-1023-53 is normal. Now, we need to round
|
|
// up 0x3fffffffffffff x 2^-1023-53 and once we do, we are no longer
|
|
// subnormal, but we can only know this after rounding.
|
|
// So we only declare a subnormal if we are smaller than the threshold.
|
|
real_exponent = (mantissa < (uint64_t(1) << 52)) ? 0 : 1;
|
|
d = to_double(mantissa, real_exponent, negative);
|
|
return true;
|
|
}
|
|
// We have to round to even. The "to even" part
|
|
// is only a problem when we are right in between two floats
|
|
// which we guard against.
|
|
// If we have lots of trailing zeros, we may fall right between two
|
|
// floating-point values.
|
|
//
|
|
// The round-to-even cases take the form of a number 2m+1 which is in (2^53,2^54]
|
|
// times a power of two. That is, it is right between a number with binary significand
|
|
// m and another number with binary significand m+1; and it must be the case
|
|
// that it cannot be represented by a float itself.
|
|
//
|
|
// We must have that w * 10 ^q == (2m+1) * 2^p for some power of two 2^p.
|
|
// Recall that 10^q = 5^q * 2^q.
|
|
// When q >= 0, we must have that (2m+1) is divible by 5^q, so 5^q <= 2^54. We have that
|
|
// 5^23 <= 2^54 and it is the last power of five to qualify, so q <= 23.
|
|
// When q<0, we have w >= (2m+1) x 5^{-q}. We must have that w<2^{64} so
|
|
// (2m+1) x 5^{-q} < 2^{64}. We have that 2m+1>2^{53}. Hence, we must have
|
|
// 2^{53} x 5^{-q} < 2^{64}.
|
|
// Hence we have 5^{-q} < 2^{11}$ or q>= -4.
|
|
//
|
|
// We require lower <= 1 and not lower == 0 because we could not prove that
|
|
// that lower == 0 is implied; but we could prove that lower <= 1 is a necessary and sufficient test.
|
|
if (simdjson_unlikely((lower <= 1) && (power >= -4) && (power <= 23) && ((mantissa & 3) == 1))) {
|
|
if((mantissa << (upperbit + 64 - 53 - 2)) == upper) {
|
|
mantissa &= ~1; // flip it so that we do not round up
|
|
}
|
|
}
|
|
|
|
mantissa += mantissa & 1;
|
|
mantissa >>= 1;
|
|
|
|
// Here we have mantissa < (1<<53), unless there was an overflow
|
|
if (mantissa >= (1ULL << 53)) {
|
|
//////////
|
|
// This will happen when parsing values such as 7.2057594037927933e+16
|
|
////////
|
|
mantissa = (1ULL << 52);
|
|
real_exponent++;
|
|
}
|
|
mantissa &= ~(1ULL << 52);
|
|
// we have to check that real_exponent is in range, otherwise we bail out
|
|
if (simdjson_unlikely(real_exponent > 2046)) {
|
|
// We have an infinite value!!! We could actually throw an error here if we could.
|
|
return false;
|
|
}
|
|
d = to_double(mantissa, real_exponent, negative);
|
|
return true;
|
|
}
|
|
|
|
// We call a fallback floating-point parser that might be slow. Note
|
|
// it will accept JSON numbers, but the JSON spec. is more restrictive so
|
|
// before you call parse_float_fallback, you need to have validated the input
|
|
// string with the JSON grammar.
|
|
// It will return an error (false) if the parsed number is infinite.
|
|
// The string parsing itself always succeeds. We know that there is at least
|
|
// one digit.
|
|
static bool parse_float_fallback(const uint8_t *ptr, double *outDouble) {
|
|
*outDouble = simdjson::internal::from_chars(reinterpret_cast<const char *>(ptr));
|
|
// We do not accept infinite values.
|
|
|
|
// Detecting finite values in a portable manner is ridiculously hard, ideally
|
|
// we would want to do:
|
|
// return !std::isfinite(*outDouble);
|
|
// but that mysteriously fails under legacy/old libc++ libraries, see
|
|
// https://github.com/simdjson/simdjson/issues/1286
|
|
//
|
|
// Therefore, fall back to this solution (the extra parens are there
|
|
// to handle that max may be a macro on windows).
|
|
return !(*outDouble > (std::numeric_limits<double>::max)() || *outDouble < std::numeric_limits<double>::lowest());
|
|
}
|
|
|
|
// Bounded variant of parse_float_fallback: forwards both ptr and end_ptr to
// simdjson::internal::from_chars.
// NOTE(review): presumably parsing does not read at or past end_ptr -- confirm against from_chars.
// The parsed value is always written to *outDouble.
// Returns false if the parsed number is infinite, true otherwise.
static bool parse_float_fallback(const uint8_t *ptr, const uint8_t *end_ptr, double *outDouble) {
  *outDouble = simdjson::internal::from_chars(reinterpret_cast<const char *>(ptr), reinterpret_cast<const char *>(end_ptr));
  // We do not accept infinite values.

  // Detecting finite values in a portable manner is ridiculously hard, ideally
  // we would want to do something like:
  // return std::isfinite(*outDouble);
  // but std::isfinite mysteriously fails under legacy/old libc++ libraries, see
  // https://github.com/simdjson/simdjson/issues/1286
  //
  // Therefore, fall back to this solution (the extra parens are there
  // to handle that max may be a macro on windows).
  return !(*outDouble > (std::numeric_limits<double>::max)() || *outDouble < std::numeric_limits<double>::lowest());
}
|
|
|
|
// check quickly whether the next 8 chars are made of digits
|
|
// at a glance, it looks better than Mula's
|
|
// http://0x80.pl/articles/swar-digits-validate.html
|
|
simdjson_inline bool is_made_of_eight_digits_fast(const uint8_t *chars) {
|
|
uint64_t val;
|
|
// this can read up to 7 bytes beyond the buffer size, but we require
|
|
// SIMDJSON_PADDING of padding
|
|
static_assert(7 <= SIMDJSON_PADDING, "SIMDJSON_PADDING must be bigger than 7");
|
|
std::memcpy(&val, chars, 8);
|
|
// a branchy method might be faster:
|
|
// return (( val & 0xF0F0F0F0F0F0F0F0 ) == 0x3030303030303030)
|
|
// && (( (val + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0 ) ==
|
|
// 0x3030303030303030);
|
|
return (((val & 0xF0F0F0F0F0F0F0F0) |
|
|
(((val + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0) >> 4)) ==
|
|
0x3333333333333333);
|
|
}
|
|
|
|
template<typename I>
|
|
SIMDJSON_NO_SANITIZE_UNDEFINED // We deliberately allow overflow here and check later
|
|
simdjson_inline bool parse_digit(const uint8_t c, I &i) {
|
|
const uint8_t digit = static_cast<uint8_t>(c - '0');
|
|
if (digit > 9) {
|
|
return false;
|
|
}
|
|
// PERF NOTE: multiplication by 10 is cheaper than arbitrary integer multiplication
|
|
i = 10 * i + digit; // might overflow, we will handle the overflow later
|
|
return true;
|
|
}
|
|
|
|
// Returns true when `c` is an ASCII decimal digit ('0'..'9').
simdjson_inline bool is_digit(const uint8_t c) {
  // Characters below '0' wrap around to large values, so one comparison suffices.
  const uint8_t offset_from_zero = static_cast<uint8_t>(c - '0');
  return offset_from_zero < 10;
}
|
|
|
|
// Parses the fractional digits that follow a decimal point.
//
// On entry `p` points just past the '.'. The digits are appended to the integer
// accumulator `i` (continuing the fiction that the number is an integer: when
// the value is representable as x * 10^z with an integer x that fits in 53
// bits, the conversion back to a double is lossless). On return `p` points at
// the first non-digit and `exponent` holds minus the number of fractional
// digits consumed.
//
// Returns INVALID_NUMBER(src) when no digit follows the separator ("123." is
// illegal), SUCCESS otherwise.
simdjson_inline error_code parse_decimal_after_separator(simdjson_unused const uint8_t *const src, const uint8_t *&p, uint64_t &i, int64_t &exponent) {
  const uint8_t *const fraction_start = p;

#ifdef SIMDJSON_SWAR_NUMBER_PARSING
#if SIMDJSON_SWAR_NUMBER_PARSING
  // Consume a first group of eight digits at once: long fractions turn out
  // to be frequent enough for this to help.
  if (is_made_of_eight_digits_fast(p)) {
    i = i * 100000000 + parse_eight_digits_unrolled(p);
    p += 8;
  }
#endif // SIMDJSON_SWAR_NUMBER_PARSING
#endif // #ifdef SIMDJSON_SWAR_NUMBER_PARSING
  // Unrolling the first digit makes a small difference on some implementations (e.g. westmere)
  if (parse_digit(*p, i)) { ++p; }
  while (parse_digit(*p, i)) { ++p; }
  exponent = fraction_start - p;
  // No progress means there was no digit after the separator.
  if (p == fraction_start) {
    return INVALID_NUMBER(src);
  }
  return SUCCESS;
}
|
|
|
|
// Parses the exponent part of a number and adds it to `exponent`.
//
// On entry `p` points just past the 'e'/'E'. An optional '+' or '-' sign is
// consumed, followed by one or more digits; on return `p` points at the first
// character after the digits.
//
// Returns INVALID_NUMBER(src) when no digit is present, SUCCESS otherwise.
// Exponents with more than 18 significant digits are clamped to
// 999,999,999,999,999,999: there is no reason to fail at this point (e.g.
// 0e999999999999999999999 is a fine number) and such a magnitude is far beyond
// anything a double can represent (the maximal ieee64 value before infinity is
// ~1.8e308, the smallest subnormal is ~5e-324).
simdjson_inline error_code parse_exponent(simdjson_unused const uint8_t *const src, const uint8_t *&p, int64_t &exponent) {
  // Exp Sign: -123.456e[-]78
  const bool neg_exp = ('-' == *p);
  if (neg_exp || '+' == *p) { p++; } // Skip + as well

  // Exponent: -123.456e-[78]
  auto start_exp = p;
  // The digits are accumulated in an unsigned integer: an absurdly long
  // exponent then wraps around (well defined) instead of overflowing a signed
  // integer (undefined behaviour). A wrapped value is never used, see below.
  uint64_t exp_magnitude = 0;
  while (parse_digit(*p, exp_magnitude)) { ++p; }

  // Performance notes: it may seem like combining the two "simdjson_unlikely checks" below into
  // a single simdjson_unlikely path would be faster. The reasoning is sound, but the compiler may
  // not oblige and may, in fact, generate two distinct paths in any case. It might be
  // possible to do uint64_t(p - start_exp - 1) >= 18 but it could end up trading off
  // instructions for a simdjson_likely branch, an inconclusive gain.

  // If there were no digits, it's an error.
  if (simdjson_unlikely(p == start_exp)) {
    return INVALID_NUMBER(src);
  }

  // With more than 18 digits the accumulator may have wrapped.
  if (simdjson_unlikely(p > start_exp+18)) {
    // Skip leading zeroes: 1e000000000000000000001 is technically valid and does not overflow
    // (leading zeroes contribute nothing to the accumulated value).
    while (*start_exp == '0') { start_exp++; }
    // More than 18 significant digits: the accumulated value cannot be
    // trusted, so it is replaced by the largest supported magnitude.
    if (p > start_exp+18) { exp_magnitude = 999999999999999999; }
  }
  // Here exp_magnitude <= 999,999,999,999,999,999, so the conversion to a
  // signed integer is exact and the negation below cannot overflow.
  const int64_t exp_number = static_cast<int64_t>(exp_magnitude);
  // 'exponent' is bounded in magnitude by the size of the JSON input, so the
  // addition below should never overflow.
  exponent += (neg_exp ? -exp_number : exp_number);
  return SUCCESS;
}
|
|
|
|
// Tells whether the text in [src, src + max_length) starts with an integer
// literal: an optional '-', then either a single '0' or a run of digits,
// followed by the end of the range or by a structural/whitespace character.
// Bytes at or beyond src + max_length are never read, except that *src itself
// is always read.
simdjson_inline bool check_if_integer(const uint8_t *const src, size_t max_length) {
  const uint8_t *const limit = src + max_length;
  const bool has_minus = (*src == '-'); // we can always read at least one character after the '-'
  const uint8_t *cursor = src + uint8_t(has_minus);
  if (cursor == limit) { return false; }
  if (*cursor == '0') {
    // A leading zero stands alone: only the character right after it matters.
    ++cursor;
  } else {
    while (cursor != limit && is_digit(*cursor)) { ++cursor; }
  }
  if (cursor == limit) { return true; }
  return !jsoncharutils::is_not_structural_or_whitespace(*cursor);
}
|
|
|
|
// Returns `digit_count` minus the number of leading '0' and '.' characters
// found at `start_digits`. This handles inputs such as 0.0000somenumber, where
// the integer accumulator may not have overflowed despite a large digit count.
// When a '.' is among the skipped characters it is subtracted as well, so the
// result is over-decremented by one in that case.
simdjson_inline size_t significant_digits(const uint8_t * start_digits, size_t digit_count) {
  const uint8_t *cursor = start_digits;
  for (; (*cursor == '0') || (*cursor == '.'); ++cursor) {}
  const size_t skipped = size_t(cursor - start_digits);
  return digit_count - skipped;
}
|
|
|
|
} // unnamed namespace
|
|
|
|
/** @private */
|
|
static error_code slow_float_parsing(simdjson_unused const uint8_t * src, double* answer) {
|
|
if (parse_float_fallback(src, answer)) {
|
|
return SUCCESS;
|
|
}
|
|
return INVALID_NUMBER(src);
|
|
}
|
|
|
|
/** @private */
|
|
template<typename W>
|
|
simdjson_inline error_code write_float(const uint8_t *const src, bool negative, uint64_t i, const uint8_t * start_digits, size_t digit_count, int64_t exponent, W &writer) {
|
|
// If we frequently had to deal with long strings of digits,
|
|
// we could extend our code by using a 128-bit integer instead
|
|
// of a 64-bit integer. However, this is uncommon in practice.
|
|
//
|
|
// 9999999999999999999 < 2**64 so we can accommodate 19 digits.
|
|
// If we have a decimal separator, then digit_count - 1 is the number of digits, but we
|
|
// may not have a decimal separator!
|
|
if (simdjson_unlikely(digit_count > 19 && significant_digits(start_digits, digit_count) > 19)) {
|
|
// Ok, chances are good that we had an overflow!
|
|
// this is almost never going to get called!!!
|
|
// we start anew, going slowly!!!
|
|
// This will happen in the following examples:
|
|
// 10000000000000000000000000000000000000000000e+308
|
|
// 3.1415926535897932384626433832795028841971693993751
|
|
//
|
|
// NOTE: We do not pass a reference to the to slow_float_parsing. If we passed our writer
|
|
// reference to it, it would force it to be stored in memory, preventing the compiler from
|
|
// picking it apart and putting into registers. i.e. if we pass it as reference,
|
|
// it gets slow.
|
|
double d;
|
|
error_code error = slow_float_parsing(src, &d);
|
|
writer.append_double(d);
|
|
return error;
|
|
}
|
|
// NOTE: it's weird that the simdjson_unlikely() only wraps half the if, but it seems to get slower any other
|
|
// way we've tried: https://github.com/simdjson/simdjson/pull/990#discussion_r448497331
|
|
// To future reader: we'd love if someone found a better way, or at least could explain this result!
|
|
if (simdjson_unlikely(exponent < simdjson::internal::smallest_power) || (exponent > simdjson::internal::largest_power)) {
|
|
//
|
|
// Important: smallest_power is such that it leads to a zero value.
|
|
// Observe that 18446744073709551615e-343 == 0, i.e. (2**64 - 1) e -343 is zero
|
|
// so something x 10^-343 goes to zero, but not so with something x 10^-342.
|
|
static_assert(simdjson::internal::smallest_power <= -342, "smallest_power is not small enough");
|
|
//
|
|
if((exponent < simdjson::internal::smallest_power) || (i == 0)) {
|
|
// E.g. Parse "-0.0e-999" into the same value as "-0.0". See https://en.wikipedia.org/wiki/Signed_zero
|
|
WRITE_DOUBLE(negative ? -0.0 : 0.0, src, writer);
|
|
return SUCCESS;
|
|
} else { // (exponent > largest_power) and (i != 0)
|
|
// We have, for sure, an infinite value and simdjson refuses to parse infinite values.
|
|
return INVALID_NUMBER(src);
|
|
}
|
|
}
|
|
double d;
|
|
if (!compute_float_64(exponent, i, negative, d)) {
|
|
// we are almost never going to get here.
|
|
if (!parse_float_fallback(src, &d)) { return INVALID_NUMBER(src); }
|
|
}
|
|
WRITE_DOUBLE(d, src, writer);
|
|
return SUCCESS;
|
|
}
|
|
|
|
// parse the number at src
|
|
// define JSON_TEST_NUMBERS for unit testing
|
|
//
|
|
// It is assumed that the number is followed by a structural ({,},],[) character
|
|
// or a white space character. If that is not the case (e.g., when the JSON
|
|
// document is made of a single number), then it is necessary to copy the
|
|
// content and append a space before calling this function.
|
|
//
|
|
// Our objective is accurate parsing (ULP of 0) at high speed.
|
|
// Forward declaration of parse_number(); the definitions follow below (a stub
// that always writes zero when SIMDJSON_SKIPNUMBERPARSING is defined, the real
// parser otherwise).
template<typename W>
|
|
simdjson_inline error_code parse_number(const uint8_t *const src, W &writer);
|
|
|
|
// for performance analysis, it is sometimes useful to skip parsing
|
|
#ifdef SIMDJSON_SKIPNUMBERPARSING
|
|
|
|
template<typename W>
|
|
simdjson_inline error_code parse_number(const uint8_t *const, W &writer) {
|
|
writer.append_s64(0); // always write zero
|
|
return SUCCESS; // always succeeds
|
|
}
|
|
|
|
// Stubs used when SIMDJSON_SKIPNUMBERPARSING is defined: the input is never
// examined and a fixed value is returned.

// Always yields 0.
simdjson_unused simdjson_inline simdjson_result<uint64_t> parse_unsigned(const uint8_t * const src) noexcept {
  return 0;
}

// Always yields 0.
simdjson_unused simdjson_inline simdjson_result<int64_t> parse_integer(const uint8_t * const src) noexcept {
  return 0;
}

// Always yields 0.
simdjson_unused simdjson_inline simdjson_result<double> parse_double(const uint8_t * const src) noexcept {
  return 0;
}

// Always yields 0.
simdjson_unused simdjson_inline simdjson_result<uint64_t> parse_unsigned_in_string(const uint8_t * const src) noexcept {
  return 0;
}

// Always yields 0.
simdjson_unused simdjson_inline simdjson_result<int64_t> parse_integer_in_string(const uint8_t * const src) noexcept {
  return 0;
}

// Always yields 0.
simdjson_unused simdjson_inline simdjson_result<double> parse_double_in_string(const uint8_t * const src) noexcept {
  return 0;
}

// Always reports a non-negative number.
simdjson_unused simdjson_inline bool is_negative(const uint8_t * src) noexcept {
  return false;
}

// Always reports "not an integer".
simdjson_unused simdjson_inline simdjson_result<bool> is_integer(const uint8_t * src) noexcept {
  return false;
}

// Always reports a signed integer.
simdjson_unused simdjson_inline simdjson_result<number_type> get_number_type(const uint8_t * src) noexcept {
  return number_type::signed_integer;
}
|
|
#else
|
|
|
|
// parse the number at src
|
|
// define JSON_TEST_NUMBERS for unit testing
|
|
//
|
|
// It is assumed that the number is followed by a structural ({,},],[) character
|
|
// or a white space character. If that is not the case (e.g., when the JSON
|
|
// document is made of a single number), then it is necessary to copy the
|
|
// content and append a space before calling this function.
|
|
//
|
|
// Our objective is accurate parsing (ULP of 0) at high speed.
|
|
template<typename W>
|
|
simdjson_inline error_code parse_number(const uint8_t *const src, W &writer) {
|
|
|
|
//
|
|
// Check for minus sign
|
|
//
|
|
bool negative = (*src == '-');
|
|
const uint8_t *p = src + uint8_t(negative);
|
|
|
|
//
|
|
// Parse the integer part.
|
|
//
|
|
// PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
|
|
const uint8_t *const start_digits = p;
|
|
uint64_t i = 0;
|
|
while (parse_digit(*p, i)) { p++; }
|
|
|
|
// If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
|
|
// Optimization note: size_t is expected to be unsigned.
|
|
size_t digit_count = size_t(p - start_digits);
|
|
if (digit_count == 0 || ('0' == *start_digits && digit_count > 1)) { return INVALID_NUMBER(src); }
|
|
|
|
//
|
|
// Handle floats if there is a . or e (or both)
|
|
//
|
|
int64_t exponent = 0;
|
|
bool is_float = false;
|
|
if ('.' == *p) {
|
|
is_float = true;
|
|
++p;
|
|
SIMDJSON_TRY( parse_decimal_after_separator(src, p, i, exponent) );
|
|
digit_count = int(p - start_digits); // used later to guard against overflows
|
|
}
|
|
if (('e' == *p) || ('E' == *p)) {
|
|
is_float = true;
|
|
++p;
|
|
SIMDJSON_TRY( parse_exponent(src, p, exponent) );
|
|
}
|
|
if (is_float) {
|
|
const bool dirty_end = jsoncharutils::is_not_structural_or_whitespace(*p);
|
|
SIMDJSON_TRY( write_float(src, negative, i, start_digits, digit_count, exponent, writer) );
|
|
if (dirty_end) { return INVALID_NUMBER(src); }
|
|
return SUCCESS;
|
|
}
|
|
|
|
// The longest negative 64-bit number is 19 digits.
|
|
// The longest positive 64-bit number is 20 digits.
|
|
// We do it this way so we don't trigger this branch unless we must.
|
|
size_t longest_digit_count = negative ? 19 : 20;
|
|
if (digit_count > longest_digit_count) { return BIGINT_NUMBER(src); }
|
|
if (digit_count == longest_digit_count) {
|
|
if (negative) {
|
|
// Anything negative above INT64_MAX+1 is invalid
|
|
if (i > uint64_t(INT64_MAX)+1) { return BIGINT_NUMBER(src); }
|
|
WRITE_INTEGER(~i+1, src, writer);
|
|
if (jsoncharutils::is_not_structural_or_whitespace(*p)) { return INVALID_NUMBER(src); }
|
|
return SUCCESS;
|
|
// Positive overflow check:
|
|
// - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the
|
|
// biggest uint64_t.
|
|
// - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX.
|
|
// If we got here, it's a 20 digit number starting with the digit "1".
|
|
// - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller
|
|
// than 1,553,255,926,290,448,384.
|
|
// - That is smaller than the smallest possible 20-digit number the user could write:
|
|
// 10,000,000,000,000,000,000.
|
|
// - Therefore, if the number is positive and lower than that, it's overflow.
|
|
// - The value we are looking at is less than or equal to INT64_MAX.
|
|
//
|
|
} else if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INVALID_NUMBER(src); }
|
|
}
|
|
|
|
// Write unsigned if it does not fit in a signed integer.
|
|
if (i > uint64_t(INT64_MAX)) {
|
|
WRITE_UNSIGNED(i, src, writer);
|
|
} else {
|
|
WRITE_INTEGER(negative ? (~i+1) : i, src, writer);
|
|
}
|
|
if (jsoncharutils::is_not_structural_or_whitespace(*p)) { return INVALID_NUMBER(src); }
|
|
return SUCCESS;
|
|
}
|
|
|
|
// Inlineable functions
|
|
namespace {
|
|
|
|
// This table can be used to characterize the final character of an integer
|
|
// string. For JSON structural character and allowable white space characters,
|
|
// we return SUCCESS. For 'e', '.' and 'E', we return INCORRECT_TYPE. Otherwise
|
|
// we return NUMBER_ERROR.
|
|
// Optimization note: we could easily reduce the size of the table by half (to 128)
|
|
// at the cost of an extra branch.
|
|
// Optimization note: we want the values to use at most 8 bits (not, e.g., 32 bits):
|
|
static_assert(error_code(uint8_t(NUMBER_ERROR))== NUMBER_ERROR, "bad NUMBER_ERROR cast");
|
|
static_assert(error_code(uint8_t(SUCCESS))== SUCCESS, "bad NUMBER_ERROR cast");
|
|
static_assert(error_code(uint8_t(INCORRECT_TYPE))== INCORRECT_TYPE, "bad NUMBER_ERROR cast");
|
|
|
|
// Classification of the byte that follows the digits of an integer, indexed by
// byte value: SUCCESS for JSON whitespace and structural characters,
// INCORRECT_TYPE for '.', 'E' and 'e', NUMBER_ERROR for everything else.
// Eight entries per row; the comment gives the byte values covered by the row.
const uint8_t integer_string_finisher[256] = {
    // 0-7
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    // 8-15: '\t' (9), '\n' (10), '\r' (13)
    NUMBER_ERROR, SUCCESS, SUCCESS, NUMBER_ERROR, NUMBER_ERROR, SUCCESS, NUMBER_ERROR, NUMBER_ERROR,
    // 16-23
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    // 24-31
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    // 32-39: ' ' (32)
    SUCCESS, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    // 40-47: ',' (44), '.' (46)
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, SUCCESS, NUMBER_ERROR, INCORRECT_TYPE, NUMBER_ERROR,
    // 48-55
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    // 56-63: ':' (58)
    NUMBER_ERROR, NUMBER_ERROR, SUCCESS, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    // 64-71: 'E' (69)
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, INCORRECT_TYPE, NUMBER_ERROR, NUMBER_ERROR,
    // 72-79
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    // 80-87
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    // 88-95: '[' (91), ']' (93)
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, SUCCESS, NUMBER_ERROR, SUCCESS, NUMBER_ERROR, NUMBER_ERROR,
    // 96-103: 'e' (101)
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, INCORRECT_TYPE, NUMBER_ERROR, NUMBER_ERROR,
    // 104-111
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    // 112-119
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    // 120-127: '{' (123), '}' (125)
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, SUCCESS, NUMBER_ERROR, SUCCESS, NUMBER_ERROR, NUMBER_ERROR,
    // 128-135
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    // 136-143
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    // 144-151
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    // 152-159
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    // 160-167
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    // 168-175
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    // 176-183
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    // 184-191
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    // 192-199
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    // 200-207
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    // 208-215
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    // 216-223
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    // 224-231
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    // 232-239
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    // 240-247
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    // 248-255
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR};
|
|
|
|
// Parse any number from 0 to 18,446,744,073,709,551,615
|
|
simdjson_unused simdjson_inline simdjson_result<uint64_t> parse_unsigned(const uint8_t * const src) noexcept {
|
|
const uint8_t *p = src;
|
|
//
|
|
// Parse the integer part.
|
|
//
|
|
// PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
|
|
const uint8_t *const start_digits = p;
|
|
uint64_t i = 0;
|
|
while (parse_digit(*p, i)) { p++; }
|
|
|
|
// If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
|
|
// Optimization note: size_t is expected to be unsigned.
|
|
size_t digit_count = size_t(p - start_digits);
|
|
// The longest positive 64-bit number is 20 digits.
|
|
// We do it this way so we don't trigger this branch unless we must.
|
|
// Optimization note: the compiler can probably merge
|
|
// ((digit_count == 0) || (digit_count > 20))
|
|
// into a single branch since digit_count is unsigned.
|
|
if ((digit_count == 0) || (digit_count > 20)) { return INCORRECT_TYPE; }
|
|
// Here digit_count > 0.
|
|
if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; }
|
|
// We can do the following...
|
|
// if (!jsoncharutils::is_structural_or_whitespace(*p)) {
|
|
// return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR;
|
|
// }
|
|
// as a single table lookup:
|
|
if (integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); }
|
|
|
|
if (digit_count == 20) {
|
|
// Positive overflow check:
|
|
// - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the
|
|
// biggest uint64_t.
|
|
// - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX.
|
|
// If we got here, it's a 20 digit number starting with the digit "1".
|
|
// - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller
|
|
// than 1,553,255,926,290,448,384.
|
|
// - That is smaller than the smallest possible 20-digit number the user could write:
|
|
// 10,000,000,000,000,000,000.
|
|
// - Therefore, if the number is positive and lower than that, it's overflow.
|
|
// - The value we are looking at is less than or equal to INT64_MAX.
|
|
//
|
|
if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INCORRECT_TYPE; }
|
|
}
|
|
|
|
return i;
|
|
}
|
|
|
|
|
|
// Parse any number from 0 to 18,446,744,073,709,551,615
|
|
// Never read at src_end or beyond
|
|
simdjson_unused simdjson_inline simdjson_result<uint64_t> parse_unsigned(const uint8_t * const src, const uint8_t * const src_end) noexcept {
|
|
const uint8_t *p = src;
|
|
//
|
|
// Parse the integer part.
|
|
//
|
|
// PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
|
|
const uint8_t *const start_digits = p;
|
|
uint64_t i = 0;
|
|
while ((p != src_end) && parse_digit(*p, i)) { p++; }
|
|
|
|
// If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
|
|
// Optimization note: size_t is expected to be unsigned.
|
|
size_t digit_count = size_t(p - start_digits);
|
|
// The longest positive 64-bit number is 20 digits.
|
|
// We do it this way so we don't trigger this branch unless we must.
|
|
// Optimization note: the compiler can probably merge
|
|
// ((digit_count == 0) || (digit_count > 20))
|
|
// into a single branch since digit_count is unsigned.
|
|
if ((digit_count == 0) || (digit_count > 20)) { return INCORRECT_TYPE; }
|
|
// Here digit_count > 0.
|
|
if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; }
|
|
// We can do the following...
|
|
// if (!jsoncharutils::is_structural_or_whitespace(*p)) {
|
|
// return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR;
|
|
// }
|
|
// as a single table lookup:
|
|
if ((p != src_end) && integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); }
|
|
|
|
if (digit_count == 20) {
|
|
// Positive overflow check:
|
|
// - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the
|
|
// biggest uint64_t.
|
|
// - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX.
|
|
// If we got here, it's a 20 digit number starting with the digit "1".
|
|
// - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller
|
|
// than 1,553,255,926,290,448,384.
|
|
// - That is smaller than the smallest possible 20-digit number the user could write:
|
|
// 10,000,000,000,000,000,000.
|
|
// - Therefore, if the number is positive and lower than that, it's overflow.
|
|
// - The value we are looking at is less than or equal to INT64_MAX.
|
|
//
|
|
if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INCORRECT_TYPE; }
|
|
}
|
|
|
|
return i;
|
|
}
|
|
|
|
// Parse any number from 0 to 18,446,744,073,709,551,615
|
|
simdjson_unused simdjson_inline simdjson_result<uint64_t> parse_unsigned_in_string(const uint8_t * const src) noexcept {
|
|
const uint8_t *p = src + 1;
|
|
//
|
|
// Parse the integer part.
|
|
//
|
|
// PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
|
|
const uint8_t *const start_digits = p;
|
|
uint64_t i = 0;
|
|
while (parse_digit(*p, i)) { p++; }
|
|
|
|
// If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
|
|
// Optimization note: size_t is expected to be unsigned.
|
|
size_t digit_count = size_t(p - start_digits);
|
|
// The longest positive 64-bit number is 20 digits.
|
|
// We do it this way so we don't trigger this branch unless we must.
|
|
// Optimization note: the compiler can probably merge
|
|
// ((digit_count == 0) || (digit_count > 20))
|
|
// into a single branch since digit_count is unsigned.
|
|
if ((digit_count == 0) || (digit_count > 20)) { return INCORRECT_TYPE; }
|
|
// Here digit_count > 0.
|
|
if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; }
|
|
// We can do the following...
|
|
// if (!jsoncharutils::is_structural_or_whitespace(*p)) {
|
|
// return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR;
|
|
// }
|
|
// as a single table lookup:
|
|
if (*p != '"') { return NUMBER_ERROR; }
|
|
|
|
if (digit_count == 20) {
|
|
// Positive overflow check:
|
|
// - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the
|
|
// biggest uint64_t.
|
|
// - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX.
|
|
// If we got here, it's a 20 digit number starting with the digit "1".
|
|
// - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller
|
|
// than 1,553,255,926,290,448,384.
|
|
// - That is smaller than the smallest possible 20-digit number the user could write:
|
|
// 10,000,000,000,000,000,000.
|
|
// - Therefore, if the number is positive and lower than that, it's overflow.
|
|
// - The value we are looking at is less than or equal to INT64_MAX.
|
|
//
|
|
// Note: we use src[1] and not src[0] because src[0] is the quote character in this
|
|
// instance.
|
|
if (src[1] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INCORRECT_TYPE; }
|
|
}
|
|
|
|
return i;
|
|
}
|
|
|
|
// Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
|
|
simdjson_unused simdjson_inline simdjson_result<int64_t> parse_integer(const uint8_t *src) noexcept {
|
|
//
|
|
// Check for minus sign
|
|
//
|
|
bool negative = (*src == '-');
|
|
const uint8_t *p = src + uint8_t(negative);
|
|
|
|
//
|
|
// Parse the integer part.
|
|
//
|
|
// PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
|
|
const uint8_t *const start_digits = p;
|
|
uint64_t i = 0;
|
|
while (parse_digit(*p, i)) { p++; }
|
|
|
|
// If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
|
|
// Optimization note: size_t is expected to be unsigned.
|
|
size_t digit_count = size_t(p - start_digits);
|
|
// We go from
|
|
// -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
|
|
// so we can never represent numbers that have more than 19 digits.
|
|
size_t longest_digit_count = 19;
|
|
// Optimization note: the compiler can probably merge
|
|
// ((digit_count == 0) || (digit_count > longest_digit_count))
|
|
// into a single branch since digit_count is unsigned.
|
|
if ((digit_count == 0) || (digit_count > longest_digit_count)) { return INCORRECT_TYPE; }
|
|
// Here digit_count > 0.
|
|
if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; }
|
|
// We can do the following...
|
|
// if (!jsoncharutils::is_structural_or_whitespace(*p)) {
|
|
// return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR;
|
|
// }
|
|
// as a single table lookup:
|
|
if(integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); }
|
|
// Negative numbers have can go down to - INT64_MAX - 1 whereas positive numbers are limited to INT64_MAX.
|
|
// Performance note: This check is only needed when digit_count == longest_digit_count but it is
|
|
// so cheap that we might as well always make it.
|
|
if(i > uint64_t(INT64_MAX) + uint64_t(negative)) { return INCORRECT_TYPE; }
|
|
return negative ? (~i+1) : i;
|
|
}
|
|
|
|
// Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
|
|
// Never read at src_end or beyond
|
|
simdjson_unused simdjson_inline simdjson_result<int64_t> parse_integer(const uint8_t * const src, const uint8_t * const src_end) noexcept {
|
|
//
|
|
// Check for minus sign
|
|
//
|
|
if(src == src_end) { return NUMBER_ERROR; }
|
|
bool negative = (*src == '-');
|
|
const uint8_t *p = src + uint8_t(negative);
|
|
|
|
//
|
|
// Parse the integer part.
|
|
//
|
|
// PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
|
|
const uint8_t *const start_digits = p;
|
|
uint64_t i = 0;
|
|
while ((p != src_end) && parse_digit(*p, i)) { p++; }
|
|
|
|
// If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
|
|
// Optimization note: size_t is expected to be unsigned.
|
|
size_t digit_count = size_t(p - start_digits);
|
|
// We go from
|
|
// -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
|
|
// so we can never represent numbers that have more than 19 digits.
|
|
size_t longest_digit_count = 19;
|
|
// Optimization note: the compiler can probably merge
|
|
// ((digit_count == 0) || (digit_count > longest_digit_count))
|
|
// into a single branch since digit_count is unsigned.
|
|
if ((digit_count == 0) || (digit_count > longest_digit_count)) { return INCORRECT_TYPE; }
|
|
// Here digit_count > 0.
|
|
if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; }
|
|
// We can do the following...
|
|
// if (!jsoncharutils::is_structural_or_whitespace(*p)) {
|
|
// return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR;
|
|
// }
|
|
// as a single table lookup:
|
|
if((p != src_end) && integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); }
|
|
// Negative numbers have can go down to - INT64_MAX - 1 whereas positive numbers are limited to INT64_MAX.
|
|
// Performance note: This check is only needed when digit_count == longest_digit_count but it is
|
|
// so cheap that we might as well always make it.
|
|
if(i > uint64_t(INT64_MAX) + uint64_t(negative)) { return INCORRECT_TYPE; }
|
|
return negative ? (~i+1) : i;
|
|
}
|
|
|
|
// Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
|
|
simdjson_unused simdjson_inline simdjson_result<int64_t> parse_integer_in_string(const uint8_t *src) noexcept {
|
|
//
|
|
// Check for minus sign
|
|
//
|
|
bool negative = (*(src + 1) == '-');
|
|
src += uint8_t(negative) + 1;
|
|
|
|
//
|
|
// Parse the integer part.
|
|
//
|
|
// PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
|
|
const uint8_t *const start_digits = src;
|
|
uint64_t i = 0;
|
|
while (parse_digit(*src, i)) { src++; }
|
|
|
|
// If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
|
|
// Optimization note: size_t is expected to be unsigned.
|
|
size_t digit_count = size_t(src - start_digits);
|
|
// We go from
|
|
// -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
|
|
// so we can never represent numbers that have more than 19 digits.
|
|
size_t longest_digit_count = 19;
|
|
// Optimization note: the compiler can probably merge
|
|
// ((digit_count == 0) || (digit_count > longest_digit_count))
|
|
// into a single branch since digit_count is unsigned.
|
|
if ((digit_count == 0) || (digit_count > longest_digit_count)) { return INCORRECT_TYPE; }
|
|
// Here digit_count > 0.
|
|
if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; }
|
|
// We can do the following...
|
|
// if (!jsoncharutils::is_structural_or_whitespace(*src)) {
|
|
// return (*src == '.' || *src == 'e' || *src == 'E') ? INCORRECT_TYPE : NUMBER_ERROR;
|
|
// }
|
|
// as a single table lookup:
|
|
if(*src != '"') { return NUMBER_ERROR; }
|
|
// Negative numbers have can go down to - INT64_MAX - 1 whereas positive numbers are limited to INT64_MAX.
|
|
// Performance note: This check is only needed when digit_count == longest_digit_count but it is
|
|
// so cheap that we might as well always make it.
|
|
if(i > uint64_t(INT64_MAX) + uint64_t(negative)) { return INCORRECT_TYPE; }
|
|
return negative ? (~i+1) : i;
|
|
}
|
|
|
|
// Parses the number starting at src as a double.
// There is no end pointer: scanning stops at the first byte that cannot continue the
// number, so the caller has to guarantee that such a byte is readable.
simdjson_unused simdjson_inline simdjson_result<double> parse_double(const uint8_t * src) noexcept {
  // Remember where the text begins (sign included) for the slow-path parser.
  const uint8_t *const number_begin = src;

  // Optional minus sign.
  const bool negative = ('-' == *src);
  if (negative) { src++; }

  // Integer part; every digit is folded into `mantissa`.
  uint64_t mantissa = 0;
  const uint8_t *cursor = src;
  cursor += parse_digit(*cursor, mantissa);
  const bool leading_zero = (0 == mantissa);
  while (parse_digit(*cursor, mantissa)) { cursor++; }
  // No integer digit at all.
  if (cursor == src) { return INCORRECT_TYPE; }
  // 0123 is invalid: a leading zero must stand alone.
  if (leading_zero && (cursor != src + 1)) { return NUMBER_ERROR; }

  // Fractional part. `overflow` records that more than 19 digits were accumulated,
  // in which case `mantissa` may have wrapped around.
  int64_t exponent = 0;
  bool overflow;
  if (simdjson_likely('.' == *cursor)) {
    cursor++;
    const uint8_t *const fraction_begin = cursor;
    // At least one digit must follow the decimal point.
    if (!parse_digit(*cursor, mantissa)) { return NUMBER_ERROR; }
    cursor++;
    while (parse_digit(*cursor, mantissa)) { cursor++; }
    exponent = -(cursor - fraction_begin);

    // Digit count, not counting the decimal point itself.
    overflow = (cursor - src - 1) > 19;
    if (simdjson_unlikely(overflow && leading_zero)) {
      // For 0.000ddd the zeros contribute nothing: recount from the first non-zero byte.
      const uint8_t *significant = src + 2;
      while ('0' == *significant) { significant++; }
      overflow = (cursor - significant) > 19;
    }
  } else {
    overflow = (cursor - src) > 19;
  }

  // Optional exponent: 'e' or 'E', an optional sign, then 1 to 19 digits.
  if (('e' == *cursor) || ('E' == *cursor)) {
    cursor++;
    const bool exp_neg = ('-' == *cursor);
    if (exp_neg || ('+' == *cursor)) { cursor++; }

    uint64_t exp_magnitude = 0;
    const uint8_t *const exp_begin = cursor;
    while (parse_digit(*cursor, exp_magnitude)) { cursor++; }
    const auto exp_digit_count = cursor - exp_begin;
    if ((exp_digit_count == 0) || (exp_digit_count > 19)) { return NUMBER_ERROR; }

    exponent += exp_neg ? (0 - exp_magnitude) : exp_magnitude;
  }

  // The number has to be followed by a structural character or whitespace.
  if (jsoncharutils::is_not_structural_or_whitespace(*cursor)) { return NUMBER_ERROR; }

  overflow = overflow || exponent < simdjson::internal::smallest_power || exponent > simdjson::internal::largest_power;

  // Fast path first; fall back to the slow parser when the fast path is not
  // applicable or declines.
  double d;
  if (simdjson_likely(!overflow)) {
    if (compute_float_64(exponent, mantissa, negative, d)) { return d; }
  }
  if (!parse_float_fallback(number_begin, &d)) {
    return NUMBER_ERROR;
  }
  return d;
}
|
|
|
|
// Tells whether the text at src begins with a minus sign. Only src[0] is read.
simdjson_unused simdjson_inline bool is_negative(const uint8_t * src) noexcept {
  const bool starts_with_minus = ('-' == src[0]);
  return starts_with_minus;
}
|
|
|
|
// Reports whether the number at src is written as an integer: an optional minus sign,
// at least one digit, then a structural character or whitespace.
// Returns NUMBER_ERROR when no digit follows the optional sign.
simdjson_unused simdjson_inline simdjson_result<bool> is_integer(const uint8_t * src) noexcept {
  // Step over the optional minus sign.
  if ('-' == *src) { src++; }

  // Walk over the run of ASCII digits.
  const uint8_t *cursor = src;
  while ((*cursor >= '0') && (*cursor <= '9')) { cursor++; }
  if (cursor == src) { return NUMBER_ERROR; }

  // Anything else after the digits (for instance '.', 'e' or 'E') means "not an integer".
  return jsoncharutils::is_structural_or_whitespace(*cursor) ? true : false;
}
|
|
|
|
// Classifies the number at src without computing its value.
// Returns NUMBER_ERROR when no digit follows the optional minus sign. A digit run that
// is not followed by a structural character or whitespace is reported as
// floating_point_number (hopefully because of '.', 'e' or 'E'; this is not verified here).
simdjson_unused simdjson_inline simdjson_result<number_type> get_number_type(const uint8_t * src) noexcept {
  // Step over the optional minus sign; src then points at the first digit.
  const bool negative = ('-' == *src);
  if (negative) { src++; }

  // Measure the run of ASCII digits.
  const uint8_t *cursor = src;
  while ((*cursor >= '0') && (*cursor <= '9')) { cursor++; }
  if (cursor == src) { return NUMBER_ERROR; }
  const size_t digit_count = size_t(cursor - src);

  if (!jsoncharutils::is_structural_or_whitespace(*cursor)) {
    return number_type::floating_point_number;
  }

  // From here on the text is an integer. Same-length decimal strings compare
  // like the numbers they spell, hence the memcmp calls below.
  const char *const two_to_sixtythree = "9223372036854775808";   // 2**63
  const char *const two_to_sixtyfour = "18446744073709551616";   // 2**64

  // More than 20 digits never fits in 64 bits.
  if (simdjson_unlikely(digit_count > 20)) {
    return number_type::big_integer;
  }

  if (negative) {
    // A negative integer is signed as long as its magnitude does not exceed 2**63.
    if (simdjson_unlikely(digit_count > 19)) { return number_type::big_integer; }
    if (simdjson_unlikely(digit_count == 19 && memcmp(src, two_to_sixtythree, 19) > 0)) {
      return number_type::big_integer;
    }
    return number_type::signed_integer;
  }

  // Positive and at least 2**64: big integer.
  if (digit_count == 20 && memcmp(src, two_to_sixtyfour, 20) >= 0) {
    return number_type::big_integer;
  }

  // Positive and below 2**64: values of at least 2**63 are unsigned, the rest signed.
  if (digit_count == 20 || (digit_count == 19 && memcmp(src, two_to_sixtythree, 19) >= 0)) {
    return number_type::unsigned_integer;
  }
  return number_type::signed_integer;
}
|
|
|
|
// Never read at src_end or beyond
|
|
simdjson_unused simdjson_inline simdjson_result<double> parse_double(const uint8_t * src, const uint8_t * const src_end) noexcept {
|
|
if(src == src_end) { return NUMBER_ERROR; }
|
|
//
|
|
// Check for minus sign
|
|
//
|
|
bool negative = (*src == '-');
|
|
src += uint8_t(negative);
|
|
|
|
//
|
|
// Parse the integer part.
|
|
//
|
|
uint64_t i = 0;
|
|
const uint8_t *p = src;
|
|
if(p == src_end) { return NUMBER_ERROR; }
|
|
p += parse_digit(*p, i);
|
|
bool leading_zero = (i == 0);
|
|
while ((p != src_end) && parse_digit(*p, i)) { p++; }
|
|
// no integer digits, or 0123 (zero must be solo)
|
|
if ( p == src ) { return INCORRECT_TYPE; }
|
|
if ( (leading_zero && p != src+1)) { return NUMBER_ERROR; }
|
|
|
|
//
|
|
// Parse the decimal part.
|
|
//
|
|
int64_t exponent = 0;
|
|
bool overflow;
|
|
if (simdjson_likely((p != src_end) && (*p == '.'))) {
|
|
p++;
|
|
const uint8_t *start_decimal_digits = p;
|
|
if ((p == src_end) || !parse_digit(*p, i)) { return NUMBER_ERROR; } // no decimal digits
|
|
p++;
|
|
while ((p != src_end) && parse_digit(*p, i)) { p++; }
|
|
exponent = -(p - start_decimal_digits);
|
|
|
|
// Overflow check. More than 19 digits (minus the decimal) may be overflow.
|
|
overflow = p-src-1 > 19;
|
|
if (simdjson_unlikely(overflow && leading_zero)) {
|
|
// Skip leading 0.00000 and see if it still overflows
|
|
const uint8_t *start_digits = src + 2;
|
|
while (*start_digits == '0') { start_digits++; }
|
|
overflow = start_digits-src > 19;
|
|
}
|
|
} else {
|
|
overflow = p-src > 19;
|
|
}
|
|
|
|
//
|
|
// Parse the exponent
|
|
//
|
|
if ((p != src_end) && (*p == 'e' || *p == 'E')) {
|
|
p++;
|
|
if(p == src_end) { return NUMBER_ERROR; }
|
|
bool exp_neg = *p == '-';
|
|
p += exp_neg || *p == '+';
|
|
|
|
uint64_t exp = 0;
|
|
const uint8_t *start_exp_digits = p;
|
|
while ((p != src_end) && parse_digit(*p, exp)) { p++; }
|
|
// no exp digits, or 20+ exp digits
|
|
if (p-start_exp_digits == 0 || p-start_exp_digits > 19) { return NUMBER_ERROR; }
|
|
|
|
exponent += exp_neg ? 0-exp : exp;
|
|
}
|
|
|
|
if ((p != src_end) && jsoncharutils::is_not_structural_or_whitespace(*p)) { return NUMBER_ERROR; }
|
|
|
|
overflow = overflow || exponent < simdjson::internal::smallest_power || exponent > simdjson::internal::largest_power;
|
|
|
|
//
|
|
// Assemble (or slow-parse) the float
|
|
//
|
|
double d;
|
|
if (simdjson_likely(!overflow)) {
|
|
if (compute_float_64(exponent, i, negative, d)) { return d; }
|
|
}
|
|
if (!parse_float_fallback(src - uint8_t(negative), src_end, &d)) {
|
|
return NUMBER_ERROR;
|
|
}
|
|
return d;
|
|
}
|
|
|
|
// Parses a number written inside a string as a double.
// src[0] is skipped without being inspected (presumably the opening quote);
// the number must be terminated by a double quote.
simdjson_unused simdjson_inline simdjson_result<double> parse_double_in_string(const uint8_t * src) noexcept {
  // Where the number text begins (sign included), kept for the slow-path parser.
  const uint8_t *const number_begin = src + 1;

  // Skip src[0] and the optional minus sign right after it.
  const bool negative = ('-' == src[1]);
  src += negative ? 2 : 1;

  // Integer part; every digit is folded into `mantissa`.
  uint64_t mantissa = 0;
  const uint8_t *cursor = src;
  cursor += parse_digit(*cursor, mantissa);
  const bool leading_zero = (0 == mantissa);
  while (parse_digit(*cursor, mantissa)) { cursor++; }
  // No integer digit at all.
  if (cursor == src) { return INCORRECT_TYPE; }
  // 0123 is invalid: a leading zero must stand alone.
  if (leading_zero && (cursor != src + 1)) { return NUMBER_ERROR; }

  // Fractional part. `overflow` records that more than 19 digits were accumulated,
  // in which case `mantissa` may have wrapped around.
  int64_t exponent = 0;
  bool overflow;
  if (simdjson_likely('.' == *cursor)) {
    cursor++;
    const uint8_t *const fraction_begin = cursor;
    // At least one digit must follow the decimal point.
    if (!parse_digit(*cursor, mantissa)) { return NUMBER_ERROR; }
    cursor++;
    while (parse_digit(*cursor, mantissa)) { cursor++; }
    exponent = -(cursor - fraction_begin);

    // Digit count, not counting the decimal point itself.
    overflow = (cursor - src - 1) > 19;
    if (simdjson_unlikely(overflow && leading_zero)) {
      // For 0.000ddd the zeros contribute nothing: recount from the first non-zero byte.
      const uint8_t *significant = src + 2;
      while ('0' == *significant) { significant++; }
      overflow = (cursor - significant) > 19;
    }
  } else {
    overflow = (cursor - src) > 19;
  }

  // Optional exponent: 'e' or 'E', an optional sign, then 1 to 19 digits.
  if (('e' == *cursor) || ('E' == *cursor)) {
    cursor++;
    const bool exp_neg = ('-' == *cursor);
    if (exp_neg || ('+' == *cursor)) { cursor++; }

    uint64_t exp_magnitude = 0;
    const uint8_t *const exp_begin = cursor;
    while (parse_digit(*cursor, exp_magnitude)) { cursor++; }
    const auto exp_digit_count = cursor - exp_begin;
    if ((exp_digit_count == 0) || (exp_digit_count > 19)) { return NUMBER_ERROR; }

    exponent += exp_neg ? (0 - exp_magnitude) : exp_magnitude;
  }

  // The byte right after the number has to be the closing quote.
  if ('"' != *cursor) { return NUMBER_ERROR; }

  overflow = overflow || exponent < simdjson::internal::smallest_power || exponent > simdjson::internal::largest_power;

  // Fast path first; fall back to the slow parser when the fast path is not
  // applicable or declines.
  double d;
  if (simdjson_likely(!overflow)) {
    if (compute_float_64(exponent, mantissa, negative, d)) { return d; }
  }
  if (!parse_float_fallback(number_begin, &d)) {
    return NUMBER_ERROR;
  }
  return d;
}
|
|
|
|
} // unnamed namespace
|
|
#endif // SIMDJSON_SKIPNUMBERPARSING
|
|
|
|
} // namespace numberparsing
|
|
|
|
inline std::ostream& operator<<(std::ostream& out, number_type type) noexcept {
|
|
switch (type) {
|
|
case number_type::signed_integer: out << "integer in [-9223372036854775808,9223372036854775808)"; break;
|
|
case number_type::unsigned_integer: out << "unsigned integer in [9223372036854775808,18446744073709551616)"; break;
|
|
case number_type::floating_point_number: out << "floating-point number (binary64)"; break;
|
|
case number_type::big_integer: out << "big integer"; break;
|
|
default: SIMDJSON_UNREACHABLE();
|
|
}
|
|
return out;
|
|
}
|
|
|
|
} // namespace westmere
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_GENERIC_NUMBERPARSING_H
|
|
/* end file simdjson/generic/numberparsing.h for westmere */
|
|
|
|
/* including simdjson/generic/implementation_simdjson_result_base-inl.h for westmere: #include "simdjson/generic/implementation_simdjson_result_base-inl.h" */
|
|
/* begin file simdjson/generic/implementation_simdjson_result_base-inl.h for westmere */
|
|
#ifndef SIMDJSON_GENERIC_IMPLEMENTATION_SIMDJSON_RESULT_BASE_INL_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_GENERIC_IMPLEMENTATION_SIMDJSON_RESULT_BASE_INL_H */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/generic/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/generic/implementation_simdjson_result_base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace westmere {
|
|
|
|
//
|
|
// internal::implementation_simdjson_result_base<T> inline implementation
|
|
//
|
|
|
|
template<typename T>
|
|
simdjson_inline void implementation_simdjson_result_base<T>::tie(T &value, error_code &error) && noexcept {
|
|
error = this->second;
|
|
if (!error) {
|
|
value = std::forward<implementation_simdjson_result_base<T>>(*this).first;
|
|
}
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_warn_unused simdjson_inline error_code implementation_simdjson_result_base<T>::get(T &value) && noexcept {
|
|
error_code error;
|
|
std::forward<implementation_simdjson_result_base<T>>(*this).tie(value, error);
|
|
return error;
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_inline error_code implementation_simdjson_result_base<T>::error() const noexcept {
|
|
return this->second;
|
|
}
|
|
|
|
#if SIMDJSON_EXCEPTIONS
|
|
|
|
template<typename T>
|
|
simdjson_inline T& implementation_simdjson_result_base<T>::value() & noexcept(false) {
|
|
if (error()) { throw simdjson_error(error()); }
|
|
return this->first;
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_inline T&& implementation_simdjson_result_base<T>::value() && noexcept(false) {
|
|
return std::forward<implementation_simdjson_result_base<T>>(*this).take_value();
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_inline T&& implementation_simdjson_result_base<T>::take_value() && noexcept(false) {
|
|
if (error()) { throw simdjson_error(error()); }
|
|
return std::forward<T>(this->first);
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_inline implementation_simdjson_result_base<T>::operator T&&() && noexcept(false) {
|
|
return std::forward<implementation_simdjson_result_base<T>>(*this).take_value();
|
|
}
|
|
|
|
#endif // SIMDJSON_EXCEPTIONS
|
|
|
|
template<typename T>
|
|
simdjson_inline const T& implementation_simdjson_result_base<T>::value_unsafe() const& noexcept {
|
|
return this->first;
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_inline T& implementation_simdjson_result_base<T>::value_unsafe() & noexcept {
|
|
return this->first;
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_inline T&& implementation_simdjson_result_base<T>::value_unsafe() && noexcept {
|
|
return std::forward<T>(this->first);
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_inline implementation_simdjson_result_base<T>::implementation_simdjson_result_base(T &&value, error_code error) noexcept
|
|
: first{std::forward<T>(value)}, second{error} {}
|
|
template<typename T>
|
|
simdjson_inline implementation_simdjson_result_base<T>::implementation_simdjson_result_base(error_code error) noexcept
|
|
: implementation_simdjson_result_base(T{}, error) {}
|
|
template<typename T>
|
|
simdjson_inline implementation_simdjson_result_base<T>::implementation_simdjson_result_base(T &&value) noexcept
|
|
: implementation_simdjson_result_base(std::forward<T>(value), SUCCESS) {}
|
|
|
|
} // namespace westmere
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_GENERIC_IMPLEMENTATION_SIMDJSON_RESULT_BASE_INL_H
|
|
/* end file simdjson/generic/implementation_simdjson_result_base-inl.h for westmere */
|
|
/* end file simdjson/generic/amalgamated.h for westmere */
|
|
/* including simdjson/westmere/end.h: #include "simdjson/westmere/end.h" */
|
|
/* begin file simdjson/westmere/end.h */
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/westmere/base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
#if !SIMDJSON_CAN_ALWAYS_RUN_WESTMERE
|
|
SIMDJSON_UNTARGET_REGION
|
|
#endif
|
|
|
|
/* undefining SIMDJSON_IMPLEMENTATION from "westmere" */
|
|
#undef SIMDJSON_IMPLEMENTATION
|
|
/* end file simdjson/westmere/end.h */
|
|
|
|
#endif // SIMDJSON_WESTMERE_H
|
|
/* end file simdjson/westmere.h */
|
|
/* including simdjson/westmere/implementation.h: #include <simdjson/westmere/implementation.h> */
|
|
/* begin file simdjson/westmere/implementation.h */
|
|
#ifndef SIMDJSON_WESTMERE_IMPLEMENTATION_H
|
|
#define SIMDJSON_WESTMERE_IMPLEMENTATION_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/westmere/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/implementation.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/instruction_set.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
// The constructor may be executed on any host, so we take care not to use SIMDJSON_TARGET_WESTMERE
|
|
namespace simdjson {
|
|
namespace westmere {
|
|
|
|
/**
|
|
* @private
|
|
*/
|
|
class implementation final : public simdjson::implementation {
|
|
public:
|
|
simdjson_inline implementation() : simdjson::implementation("westmere", "Intel/AMD SSE4.2", internal::instruction_set::SSE42 | internal::instruction_set::PCLMULQDQ) {}
|
|
simdjson_warn_unused error_code create_dom_parser_implementation(
|
|
size_t capacity,
|
|
size_t max_length,
|
|
std::unique_ptr<internal::dom_parser_implementation>& dst
|
|
) const noexcept final;
|
|
simdjson_warn_unused error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final;
|
|
simdjson_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final;
|
|
};
|
|
|
|
} // namespace westmere
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_WESTMERE_IMPLEMENTATION_H
|
|
/* end file simdjson/westmere/implementation.h */
|
|
|
|
/* including simdjson/westmere/begin.h: #include <simdjson/westmere/begin.h> */
|
|
/* begin file simdjson/westmere/begin.h */
|
|
/* defining SIMDJSON_IMPLEMENTATION to "westmere" */
|
|
#define SIMDJSON_IMPLEMENTATION westmere
|
|
/* including simdjson/westmere/base.h: #include "simdjson/westmere/base.h" */
|
|
/* begin file simdjson/westmere/base.h */
|
|
#ifndef SIMDJSON_WESTMERE_BASE_H
|
|
#define SIMDJSON_WESTMERE_BASE_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
// The constructor may be executed on any host, so we take care not to use SIMDJSON_TARGET_WESTMERE
|
|
namespace simdjson {
|
|
/**
|
|
* Implementation for Westmere (Intel SSE4.2).
|
|
*/
|
|
namespace westmere {
|
|
|
|
class implementation;
|
|
|
|
namespace {
|
|
namespace simd {
|
|
|
|
template <typename T> struct simd8;
|
|
template <typename T> struct simd8x64;
|
|
|
|
} // namespace simd
|
|
} // unnamed namespace
|
|
|
|
} // namespace westmere
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_WESTMERE_BASE_H
|
|
/* end file simdjson/westmere/base.h */
|
|
/* including simdjson/westmere/intrinsics.h: #include "simdjson/westmere/intrinsics.h" */
|
|
/* begin file simdjson/westmere/intrinsics.h */
|
|
#ifndef SIMDJSON_WESTMERE_INTRINSICS_H
|
|
#define SIMDJSON_WESTMERE_INTRINSICS_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/westmere/base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
#if SIMDJSON_VISUAL_STUDIO
|
|
// under clang within visual studio, this will include <x86intrin.h>
|
|
#include <intrin.h> // visual studio or clang
|
|
#else
|
|
#include <x86intrin.h> // elsewhere
|
|
#endif // SIMDJSON_VISUAL_STUDIO
|
|
|
|
|
|
#if SIMDJSON_CLANG_VISUAL_STUDIO
|
|
/**
|
|
* You are not supposed, normally, to include these
|
|
* headers directly. Instead you should either include intrin.h
|
|
* or x86intrin.h. However, when compiling with clang
|
|
* under Windows (i.e., when _MSC_VER is set), these headers
|
|
* only get included *if* the corresponding features are detected
|
|
* from macros:
|
|
*/
|
|
#include <smmintrin.h> // for _mm_alignr_epi8
|
|
#include <wmmintrin.h> // for _mm_clmulepi64_si128
|
|
#endif
|
|
|
|
static_assert(sizeof(__m128i) <= simdjson::SIMDJSON_PADDING, "insufficient padding for westmere");
|
|
|
|
#endif // SIMDJSON_WESTMERE_INTRINSICS_H
|
|
/* end file simdjson/westmere/intrinsics.h */
|
|
|
|
#if !SIMDJSON_CAN_ALWAYS_RUN_WESTMERE
|
|
SIMDJSON_TARGET_REGION("sse4.2,pclmul,popcnt")
|
|
#endif
|
|
|
|
/* including simdjson/westmere/bitmanipulation.h: #include "simdjson/westmere/bitmanipulation.h" */
|
|
/* begin file simdjson/westmere/bitmanipulation.h */
|
|
#ifndef SIMDJSON_WESTMERE_BITMANIPULATION_H
|
|
#define SIMDJSON_WESTMERE_BITMANIPULATION_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/westmere/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/westmere/intrinsics.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace westmere {
|
|
namespace {
|
|
|
|
// We sometimes call trailing_zero on inputs that are zero,
|
|
// but the algorithms do not end up using the returned value.
|
|
// Sadly, sanitizers are not smart enough to figure it out.
|
|
SIMDJSON_NO_SANITIZE_UNDEFINED
|
|
// This function can be used safely even if not all bytes have been
|
|
// initialized.
|
|
// See issue https://github.com/simdjson/simdjson/issues/1965
|
|
SIMDJSON_NO_SANITIZE_MEMORY
|
|
simdjson_inline int trailing_zeroes(uint64_t input_num) {
|
|
#if SIMDJSON_REGULAR_VISUAL_STUDIO
|
|
unsigned long ret;
|
|
// Search the mask data from least significant bit (LSB)
|
|
// to the most significant bit (MSB) for a set bit (1).
|
|
_BitScanForward64(&ret, input_num);
|
|
return (int)ret;
|
|
#else // SIMDJSON_REGULAR_VISUAL_STUDIO
|
|
return __builtin_ctzll(input_num);
|
|
#endif // SIMDJSON_REGULAR_VISUAL_STUDIO
|
|
}
|
|
|
|
/* result might be undefined when input_num is zero */
|
|
simdjson_inline uint64_t clear_lowest_bit(uint64_t input_num) {
|
|
return input_num & (input_num-1);
|
|
}
|
|
|
|
/* result might be undefined when input_num is zero */
|
|
simdjson_inline int leading_zeroes(uint64_t input_num) {
|
|
#if SIMDJSON_REGULAR_VISUAL_STUDIO
|
|
unsigned long leading_zero = 0;
|
|
// Search the mask data from most significant bit (MSB)
|
|
// to least significant bit (LSB) for a set bit (1).
|
|
if (_BitScanReverse64(&leading_zero, input_num))
|
|
return (int)(63 - leading_zero);
|
|
else
|
|
return 64;
|
|
#else
|
|
return __builtin_clzll(input_num);
|
|
#endif// SIMDJSON_REGULAR_VISUAL_STUDIO
|
|
}
|
|
|
|
#if SIMDJSON_REGULAR_VISUAL_STUDIO
|
|
simdjson_inline unsigned __int64 count_ones(uint64_t input_num) {
|
|
// note: we do not support legacy 32-bit Windows in this kernel
|
|
return __popcnt64(input_num);// Visual Studio wants two underscores
|
|
}
|
|
#else
|
|
simdjson_inline long long int count_ones(uint64_t input_num) {
|
|
return _popcnt64(input_num);
|
|
}
|
|
#endif
|
|
|
|
// Stores value1 + value2 in *result and returns whether the 64-bit addition
// produced a carry (i.e. overflowed).
simdjson_inline bool add_overflow(uint64_t value1, uint64_t value2,
                                  uint64_t *result) {
#if SIMDJSON_REGULAR_VISUAL_STUDIO
  unsigned __int64 *const sum = reinterpret_cast<unsigned __int64 *>(result);
  // Carry-in of 0; the returned carry-out is the overflow flag.
  return _addcarry_u64(0, value1, value2, sum);
#else
  unsigned long long *const sum = reinterpret_cast<unsigned long long *>(result);
  return __builtin_uaddll_overflow(value1, value2, sum);
#endif
}
|
|
|
|
} // unnamed namespace
|
|
} // namespace westmere
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_WESTMERE_BITMANIPULATION_H
|
|
/* end file simdjson/westmere/bitmanipulation.h */
|
|
/* including simdjson/westmere/bitmask.h: #include "simdjson/westmere/bitmask.h" */
|
|
/* begin file simdjson/westmere/bitmask.h */
|
|
#ifndef SIMDJSON_WESTMERE_BITMASK_H
|
|
#define SIMDJSON_WESTMERE_BITMASK_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/westmere/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/westmere/intrinsics.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace westmere {
|
|
namespace {
|
|
|
|
//
|
|
// Perform a "cumulative bitwise xor," flipping bits each time a 1 is encountered.
|
|
//
|
|
// For example, prefix_xor(00100100) == 00011100
|
|
//
|
|
simdjson_inline uint64_t prefix_xor(const uint64_t bitmask) {
|
|
// There should be no such thing with a processing supporting avx2
|
|
// but not clmul.
|
|
__m128i all_ones = _mm_set1_epi8('\xFF');
|
|
__m128i result = _mm_clmulepi64_si128(_mm_set_epi64x(0ULL, bitmask), all_ones, 0);
|
|
return _mm_cvtsi128_si64(result);
|
|
}
|
|
|
|
} // unnamed namespace
|
|
} // namespace westmere
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_WESTMERE_BITMASK_H
|
|
/* end file simdjson/westmere/bitmask.h */
|
|
/* including simdjson/westmere/numberparsing_defs.h: #include "simdjson/westmere/numberparsing_defs.h" */
|
|
/* begin file simdjson/westmere/numberparsing_defs.h */
|
|
#ifndef SIMDJSON_WESTMERE_NUMBERPARSING_DEFS_H
|
|
#define SIMDJSON_WESTMERE_NUMBERPARSING_DEFS_H
|
|
|
|
/* including simdjson/westmere/base.h: #include "simdjson/westmere/base.h" */
|
|
/* begin file simdjson/westmere/base.h */
|
|
#ifndef SIMDJSON_WESTMERE_BASE_H
|
|
#define SIMDJSON_WESTMERE_BASE_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
// The constructor may be executed on any host, so we take care not to use SIMDJSON_TARGET_WESTMERE
|
|
namespace simdjson {
|
|
/**
|
|
* Implementation for Westmere (Intel SSE4.2).
|
|
*/
|
|
namespace westmere {
|
|
|
|
class implementation;
|
|
|
|
namespace {
|
|
namespace simd {
|
|
|
|
template <typename T> struct simd8;
|
|
template <typename T> struct simd8x64;
|
|
|
|
} // namespace simd
|
|
} // unnamed namespace
|
|
|
|
} // namespace westmere
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_WESTMERE_BASE_H
|
|
/* end file simdjson/westmere/base.h */
|
|
/* including simdjson/westmere/intrinsics.h: #include "simdjson/westmere/intrinsics.h" */
|
|
/* begin file simdjson/westmere/intrinsics.h */
|
|
#ifndef SIMDJSON_WESTMERE_INTRINSICS_H
|
|
#define SIMDJSON_WESTMERE_INTRINSICS_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/westmere/base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
#if SIMDJSON_VISUAL_STUDIO
|
|
// under clang within visual studio, this will include <x86intrin.h>
|
|
#include <intrin.h> // visual studio or clang
|
|
#else
|
|
#include <x86intrin.h> // elsewhere
|
|
#endif // SIMDJSON_VISUAL_STUDIO
|
|
|
|
|
|
#if SIMDJSON_CLANG_VISUAL_STUDIO
|
|
/**
|
|
* You are not supposed, normally, to include these
|
|
* headers directly. Instead you should either include intrin.h
|
|
* or x86intrin.h. However, when compiling with clang
|
|
* under Windows (i.e., when _MSC_VER is set), these headers
|
|
* only get included *if* the corresponding features are detected
|
|
* from macros:
|
|
*/
|
|
#include <smmintrin.h> // for _mm_alignr_epi8
|
|
#include <wmmintrin.h> // for _mm_clmulepi64_si128
|
|
#endif
|
|
|
|
static_assert(sizeof(__m128i) <= simdjson::SIMDJSON_PADDING, "insufficient padding for westmere");
|
|
|
|
#endif // SIMDJSON_WESTMERE_INTRINSICS_H
|
|
/* end file simdjson/westmere/intrinsics.h */
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/numberparsing_tables.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace westmere {
|
|
namespace numberparsing {
|
|
|
|
/** @private */
|
|
static simdjson_inline uint32_t parse_eight_digits_unrolled(const uint8_t *chars) {
|
|
// this actually computes *16* values so we are being wasteful.
|
|
const __m128i ascii0 = _mm_set1_epi8('0');
|
|
const __m128i mul_1_10 =
|
|
_mm_setr_epi8(10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1);
|
|
const __m128i mul_1_100 = _mm_setr_epi16(100, 1, 100, 1, 100, 1, 100, 1);
|
|
const __m128i mul_1_10000 =
|
|
_mm_setr_epi16(10000, 1, 10000, 1, 10000, 1, 10000, 1);
|
|
const __m128i input = _mm_sub_epi8(
|
|
_mm_loadu_si128(reinterpret_cast<const __m128i *>(chars)), ascii0);
|
|
const __m128i t1 = _mm_maddubs_epi16(input, mul_1_10);
|
|
const __m128i t2 = _mm_madd_epi16(t1, mul_1_100);
|
|
const __m128i t3 = _mm_packus_epi32(t2, t2);
|
|
const __m128i t4 = _mm_madd_epi16(t3, mul_1_10000);
|
|
return _mm_cvtsi128_si32(
|
|
t4); // only captures the sum of the first 8 digits, drop the rest
|
|
}
|
|
|
|
/** @private */
|
|
simdjson_inline internal::value128 full_multiplication(uint64_t value1, uint64_t value2) {
|
|
internal::value128 answer;
|
|
#if SIMDJSON_REGULAR_VISUAL_STUDIO || SIMDJSON_IS_32BITS
|
|
#if SIMDJSON_IS_ARM64
|
|
// ARM64 has native support for 64-bit multiplications, no need to emultate
|
|
answer.high = __umulh(value1, value2);
|
|
answer.low = value1 * value2;
|
|
#else
|
|
answer.low = _umul128(value1, value2, &answer.high); // _umul128 not available on ARM64
|
|
#endif // SIMDJSON_IS_ARM64
|
|
#else // SIMDJSON_REGULAR_VISUAL_STUDIO || SIMDJSON_IS_32BITS
|
|
__uint128_t r = (static_cast<__uint128_t>(value1)) * value2;
|
|
answer.low = uint64_t(r);
|
|
answer.high = uint64_t(r >> 64);
|
|
#endif
|
|
return answer;
|
|
}
|
|
|
|
} // namespace numberparsing
|
|
} // namespace westmere
|
|
} // namespace simdjson
|
|
|
|
#define SIMDJSON_SWAR_NUMBER_PARSING 1
|
|
|
|
#endif // SIMDJSON_WESTMERE_NUMBERPARSING_DEFS_H
|
|
/* end file simdjson/westmere/numberparsing_defs.h */
|
|
/* including simdjson/westmere/simd.h: #include "simdjson/westmere/simd.h" */
|
|
/* begin file simdjson/westmere/simd.h */
|
|
#ifndef SIMDJSON_WESTMERE_SIMD_H
|
|
#define SIMDJSON_WESTMERE_SIMD_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/westmere/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/westmere/bitmanipulation.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/simdprune_tables.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace westmere {
|
|
namespace {
|
|
namespace simd {
|
|
|
|
template<typename Child>
|
|
struct base {
|
|
__m128i value;
|
|
|
|
// Zero constructor
|
|
simdjson_inline base() : value{__m128i()} {}
|
|
|
|
// Conversion from SIMD register
|
|
simdjson_inline base(const __m128i _value) : value(_value) {}
|
|
|
|
// Conversion to SIMD register
|
|
simdjson_inline operator const __m128i&() const { return this->value; }
|
|
simdjson_inline operator __m128i&() { return this->value; }
|
|
|
|
// Bit operations
|
|
simdjson_inline Child operator|(const Child other) const { return _mm_or_si128(*this, other); }
|
|
simdjson_inline Child operator&(const Child other) const { return _mm_and_si128(*this, other); }
|
|
simdjson_inline Child operator^(const Child other) const { return _mm_xor_si128(*this, other); }
|
|
simdjson_inline Child bit_andnot(const Child other) const { return _mm_andnot_si128(other, *this); }
|
|
simdjson_inline Child& operator|=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast | other; return *this_cast; }
|
|
simdjson_inline Child& operator&=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast & other; return *this_cast; }
|
|
simdjson_inline Child& operator^=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast ^ other; return *this_cast; }
|
|
};
|
|
|
|
template<typename T, typename Mask=simd8<bool>>
|
|
struct base8: base<simd8<T>> {
|
|
typedef uint16_t bitmask_t;
|
|
typedef uint32_t bitmask2_t;
|
|
|
|
simdjson_inline base8() : base<simd8<T>>() {}
|
|
simdjson_inline base8(const __m128i _value) : base<simd8<T>>(_value) {}
|
|
|
|
friend simdjson_inline Mask operator==(const simd8<T> lhs, const simd8<T> rhs) { return _mm_cmpeq_epi8(lhs, rhs); }
|
|
|
|
static const int SIZE = sizeof(base<simd8<T>>::value);
|
|
|
|
template<int N=1>
|
|
simdjson_inline simd8<T> prev(const simd8<T> prev_chunk) const {
|
|
return _mm_alignr_epi8(*this, prev_chunk, 16 - N);
|
|
}
|
|
};
|
|
|
|
// SIMD byte mask type (returned by things like eq and gt)
|
|
template<>
|
|
struct simd8<bool>: base8<bool> {
|
|
static simdjson_inline simd8<bool> splat(bool _value) { return _mm_set1_epi8(uint8_t(-(!!_value))); }
|
|
|
|
simdjson_inline simd8() : base8() {}
|
|
simdjson_inline simd8(const __m128i _value) : base8<bool>(_value) {}
|
|
// Splat constructor
|
|
simdjson_inline simd8(bool _value) : base8<bool>(splat(_value)) {}
|
|
|
|
simdjson_inline int to_bitmask() const { return _mm_movemask_epi8(*this); }
|
|
simdjson_inline bool any() const { return !_mm_testz_si128(*this, *this); }
|
|
simdjson_inline simd8<bool> operator~() const { return *this ^ true; }
|
|
};
|
|
|
|
template<typename T>
|
|
struct base8_numeric: base8<T> {
|
|
static simdjson_inline simd8<T> splat(T _value) { return _mm_set1_epi8(_value); }
|
|
static simdjson_inline simd8<T> zero() { return _mm_setzero_si128(); }
|
|
static simdjson_inline simd8<T> load(const T values[16]) {
|
|
return _mm_loadu_si128(reinterpret_cast<const __m128i *>(values));
|
|
}
|
|
// Repeat 16 values as many times as necessary (usually for lookup tables)
|
|
static simdjson_inline simd8<T> repeat_16(
|
|
T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7,
|
|
T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15
|
|
) {
|
|
return simd8<T>(
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15
|
|
);
|
|
}
|
|
|
|
simdjson_inline base8_numeric() : base8<T>() {}
|
|
simdjson_inline base8_numeric(const __m128i _value) : base8<T>(_value) {}
|
|
|
|
// Store to array
|
|
simdjson_inline void store(T dst[16]) const { return _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), *this); }
|
|
|
|
// Override to distinguish from bool version
|
|
simdjson_inline simd8<T> operator~() const { return *this ^ 0xFFu; }
|
|
|
|
// Addition/subtraction are the same for signed and unsigned
|
|
simdjson_inline simd8<T> operator+(const simd8<T> other) const { return _mm_add_epi8(*this, other); }
|
|
simdjson_inline simd8<T> operator-(const simd8<T> other) const { return _mm_sub_epi8(*this, other); }
|
|
simdjson_inline simd8<T>& operator+=(const simd8<T> other) { *this = *this + other; return *static_cast<simd8<T>*>(this); }
|
|
simdjson_inline simd8<T>& operator-=(const simd8<T> other) { *this = *this - other; return *static_cast<simd8<T>*>(this); }
|
|
|
|
// Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values)
|
|
template<typename L>
|
|
simdjson_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
|
|
return _mm_shuffle_epi8(lookup_table, *this);
|
|
}
|
|
|
|
// Copies to 'output" all bytes corresponding to a 0 in the mask (interpreted as a bitset).
|
|
// Passing a 0 value for mask would be equivalent to writing out every byte to output.
|
|
// Only the first 16 - count_ones(mask) bytes of the result are significant but 16 bytes
|
|
// get written.
|
|
// Design consideration: it seems like a function with the
|
|
// signature simd8<L> compress(uint32_t mask) would be
|
|
// sensible, but the AVX ISA makes this kind of approach difficult.
|
|
template<typename L>
|
|
simdjson_inline void compress(uint16_t mask, L * output) const {
|
|
using internal::thintable_epi8;
|
|
using internal::BitsSetTable256mul2;
|
|
using internal::pshufb_combine_table;
|
|
// this particular implementation was inspired by work done by @animetosho
|
|
// we do it in two steps, first 8 bytes and then second 8 bytes
|
|
uint8_t mask1 = uint8_t(mask); // least significant 8 bits
|
|
uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits
|
|
// next line just loads the 64-bit values thintable_epi8[mask1] and
|
|
// thintable_epi8[mask2] into a 128-bit register, using only
|
|
// two instructions on most compilers.
|
|
__m128i shufmask = _mm_set_epi64x(thintable_epi8[mask2], thintable_epi8[mask1]);
|
|
// we increment by 0x08 the second half of the mask
|
|
shufmask =
|
|
_mm_add_epi8(shufmask, _mm_set_epi32(0x08080808, 0x08080808, 0, 0));
|
|
// this is the version "nearly pruned"
|
|
__m128i pruned = _mm_shuffle_epi8(*this, shufmask);
|
|
// we still need to put the two halves together.
|
|
// we compute the popcount of the first half:
|
|
int pop1 = BitsSetTable256mul2[mask1];
|
|
// then load the corresponding mask, what it does is to write
|
|
// only the first pop1 bytes from the first 8 bytes, and then
|
|
// it fills in with the bytes from the second 8 bytes + some filling
|
|
// at the end.
|
|
__m128i compactmask =
|
|
_mm_loadu_si128(reinterpret_cast<const __m128i *>(pshufb_combine_table + pop1 * 8));
|
|
__m128i answer = _mm_shuffle_epi8(pruned, compactmask);
|
|
_mm_storeu_si128(reinterpret_cast<__m128i *>(output), answer);
|
|
}
|
|
|
|
template<typename L>
|
|
simdjson_inline simd8<L> lookup_16(
|
|
L replace0, L replace1, L replace2, L replace3,
|
|
L replace4, L replace5, L replace6, L replace7,
|
|
L replace8, L replace9, L replace10, L replace11,
|
|
L replace12, L replace13, L replace14, L replace15) const {
|
|
return lookup_16(simd8<L>::repeat_16(
|
|
replace0, replace1, replace2, replace3,
|
|
replace4, replace5, replace6, replace7,
|
|
replace8, replace9, replace10, replace11,
|
|
replace12, replace13, replace14, replace15
|
|
));
|
|
}
|
|
};
|
|
|
|
// Signed bytes
|
|
template<>
|
|
struct simd8<int8_t> : base8_numeric<int8_t> {
|
|
simdjson_inline simd8() : base8_numeric<int8_t>() {}
|
|
simdjson_inline simd8(const __m128i _value) : base8_numeric<int8_t>(_value) {}
|
|
// Splat constructor
|
|
simdjson_inline simd8(int8_t _value) : simd8(splat(_value)) {}
|
|
// Array constructor
|
|
simdjson_inline simd8(const int8_t* values) : simd8(load(values)) {}
|
|
// Member-by-member initialization
|
|
simdjson_inline simd8(
|
|
int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
|
|
int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
|
|
) : simd8(_mm_setr_epi8(
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15
|
|
)) {}
|
|
// Repeat 16 values as many times as necessary (usually for lookup tables)
|
|
simdjson_inline static simd8<int8_t> repeat_16(
|
|
int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
|
|
int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
|
|
) {
|
|
return simd8<int8_t>(
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15
|
|
);
|
|
}
|
|
|
|
// Order-sensitive comparisons
|
|
simdjson_inline simd8<int8_t> max_val(const simd8<int8_t> other) const { return _mm_max_epi8(*this, other); }
|
|
simdjson_inline simd8<int8_t> min_val(const simd8<int8_t> other) const { return _mm_min_epi8(*this, other); }
|
|
simdjson_inline simd8<bool> operator>(const simd8<int8_t> other) const { return _mm_cmpgt_epi8(*this, other); }
|
|
simdjson_inline simd8<bool> operator<(const simd8<int8_t> other) const { return _mm_cmpgt_epi8(other, *this); }
|
|
};
|
|
|
|
// Unsigned bytes
|
|
template<>
|
|
struct simd8<uint8_t>: base8_numeric<uint8_t> {
|
|
simdjson_inline simd8() : base8_numeric<uint8_t>() {}
|
|
simdjson_inline simd8(const __m128i _value) : base8_numeric<uint8_t>(_value) {}
|
|
// Splat constructor
|
|
simdjson_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
|
|
// Array constructor
|
|
simdjson_inline simd8(const uint8_t* values) : simd8(load(values)) {}
|
|
// Member-by-member initialization
|
|
simdjson_inline simd8(
|
|
uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
|
|
uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
|
|
) : simd8(_mm_setr_epi8(
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15
|
|
)) {}
|
|
// Repeat 16 values as many times as necessary (usually for lookup tables)
|
|
simdjson_inline static simd8<uint8_t> repeat_16(
|
|
uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
|
|
uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
|
|
) {
|
|
return simd8<uint8_t>(
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15
|
|
);
|
|
}
|
|
|
|
// Saturated math
|
|
simdjson_inline simd8<uint8_t> saturating_add(const simd8<uint8_t> other) const { return _mm_adds_epu8(*this, other); }
|
|
simdjson_inline simd8<uint8_t> saturating_sub(const simd8<uint8_t> other) const { return _mm_subs_epu8(*this, other); }
|
|
|
|
// Order-specific operations
|
|
simdjson_inline simd8<uint8_t> max_val(const simd8<uint8_t> other) const { return _mm_max_epu8(*this, other); }
|
|
simdjson_inline simd8<uint8_t> min_val(const simd8<uint8_t> other) const { return _mm_min_epu8(*this, other); }
|
|
// Same as >, but only guarantees true is nonzero (< guarantees true = -1)
|
|
simdjson_inline simd8<uint8_t> gt_bits(const simd8<uint8_t> other) const { return this->saturating_sub(other); }
|
|
// Same as <, but only guarantees true is nonzero (< guarantees true = -1)
|
|
simdjson_inline simd8<uint8_t> lt_bits(const simd8<uint8_t> other) const { return other.saturating_sub(*this); }
|
|
simdjson_inline simd8<bool> operator<=(const simd8<uint8_t> other) const { return other.max_val(*this) == other; }
|
|
simdjson_inline simd8<bool> operator>=(const simd8<uint8_t> other) const { return other.min_val(*this) == other; }
|
|
simdjson_inline simd8<bool> operator>(const simd8<uint8_t> other) const { return this->gt_bits(other).any_bits_set(); }
|
|
simdjson_inline simd8<bool> operator<(const simd8<uint8_t> other) const { return this->gt_bits(other).any_bits_set(); }
|
|
|
|
// Bit-specific operations
|
|
simdjson_inline simd8<bool> bits_not_set() const { return *this == uint8_t(0); }
|
|
simdjson_inline simd8<bool> bits_not_set(simd8<uint8_t> bits) const { return (*this & bits).bits_not_set(); }
|
|
simdjson_inline simd8<bool> any_bits_set() const { return ~this->bits_not_set(); }
|
|
simdjson_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const { return ~this->bits_not_set(bits); }
|
|
simdjson_inline bool is_ascii() const { return _mm_movemask_epi8(*this) == 0; }
|
|
simdjson_inline bool bits_not_set_anywhere() const { return _mm_testz_si128(*this, *this); }
|
|
simdjson_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); }
|
|
simdjson_inline bool bits_not_set_anywhere(simd8<uint8_t> bits) const { return _mm_testz_si128(*this, bits); }
|
|
simdjson_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const { return !bits_not_set_anywhere(bits); }
|
|
template<int N>
|
|
simdjson_inline simd8<uint8_t> shr() const { return simd8<uint8_t>(_mm_srli_epi16(*this, N)) & uint8_t(0xFFu >> N); }
|
|
template<int N>
|
|
simdjson_inline simd8<uint8_t> shl() const { return simd8<uint8_t>(_mm_slli_epi16(*this, N)) & uint8_t(0xFFu << N); }
|
|
// Get one of the bits and make a bitmask out of it.
|
|
// e.g. value.get_bit<7>() gets the high bit
|
|
template<int N>
|
|
simdjson_inline int get_bit() const { return _mm_movemask_epi8(_mm_slli_epi16(*this, 7-N)); }
|
|
};
|
|
|
|
template<typename T>
|
|
struct simd8x64 {
|
|
static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
|
|
static_assert(NUM_CHUNKS == 4, "Westmere kernel should use four registers per 64-byte block.");
|
|
const simd8<T> chunks[NUM_CHUNKS];
|
|
|
|
simd8x64(const simd8x64<T>& o) = delete; // no copy allowed
|
|
simd8x64<T>& operator=(const simd8<T>& other) = delete; // no assignment allowed
|
|
simd8x64() = delete; // no default constructor allowed
|
|
|
|
simdjson_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1, const simd8<T> chunk2, const simd8<T> chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {}
|
|
simdjson_inline simd8x64(const T ptr[64]) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+16), simd8<T>::load(ptr+32), simd8<T>::load(ptr+48)} {}
|
|
|
|
simdjson_inline void store(T ptr[64]) const {
|
|
this->chunks[0].store(ptr+sizeof(simd8<T>)*0);
|
|
this->chunks[1].store(ptr+sizeof(simd8<T>)*1);
|
|
this->chunks[2].store(ptr+sizeof(simd8<T>)*2);
|
|
this->chunks[3].store(ptr+sizeof(simd8<T>)*3);
|
|
}
|
|
|
|
simdjson_inline simd8<T> reduce_or() const {
|
|
return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]);
|
|
}
|
|
|
|
simdjson_inline uint64_t compress(uint64_t mask, T * output) const {
|
|
this->chunks[0].compress(uint16_t(mask), output);
|
|
this->chunks[1].compress(uint16_t(mask >> 16), output + 16 - count_ones(mask & 0xFFFF));
|
|
this->chunks[2].compress(uint16_t(mask >> 32), output + 32 - count_ones(mask & 0xFFFFFFFF));
|
|
this->chunks[3].compress(uint16_t(mask >> 48), output + 48 - count_ones(mask & 0xFFFFFFFFFFFF));
|
|
return 64 - count_ones(mask);
|
|
}
|
|
|
|
simdjson_inline uint64_t to_bitmask() const {
|
|
uint64_t r0 = uint32_t(this->chunks[0].to_bitmask() );
|
|
uint64_t r1 = this->chunks[1].to_bitmask() ;
|
|
uint64_t r2 = this->chunks[2].to_bitmask() ;
|
|
uint64_t r3 = this->chunks[3].to_bitmask() ;
|
|
return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48);
|
|
}
|
|
|
|
simdjson_inline uint64_t eq(const T m) const {
|
|
const simd8<T> mask = simd8<T>::splat(m);
|
|
return simd8x64<bool>(
|
|
this->chunks[0] == mask,
|
|
this->chunks[1] == mask,
|
|
this->chunks[2] == mask,
|
|
this->chunks[3] == mask
|
|
).to_bitmask();
|
|
}
|
|
|
|
simdjson_inline uint64_t eq(const simd8x64<uint8_t> &other) const {
|
|
return simd8x64<bool>(
|
|
this->chunks[0] == other.chunks[0],
|
|
this->chunks[1] == other.chunks[1],
|
|
this->chunks[2] == other.chunks[2],
|
|
this->chunks[3] == other.chunks[3]
|
|
).to_bitmask();
|
|
}
|
|
|
|
simdjson_inline uint64_t lteq(const T m) const {
|
|
const simd8<T> mask = simd8<T>::splat(m);
|
|
return simd8x64<bool>(
|
|
this->chunks[0] <= mask,
|
|
this->chunks[1] <= mask,
|
|
this->chunks[2] <= mask,
|
|
this->chunks[3] <= mask
|
|
).to_bitmask();
|
|
}
|
|
}; // struct simd8x64<T>
|
|
|
|
} // namespace simd
|
|
} // unnamed namespace
|
|
} // namespace westmere
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_WESTMERE_SIMD_H
|
|
/* end file simdjson/westmere/simd.h */
|
|
/* including simdjson/westmere/stringparsing_defs.h: #include "simdjson/westmere/stringparsing_defs.h" */
|
|
/* begin file simdjson/westmere/stringparsing_defs.h */
|
|
#ifndef SIMDJSON_WESTMERE_STRINGPARSING_DEFS_H
|
|
#define SIMDJSON_WESTMERE_STRINGPARSING_DEFS_H
|
|
|
|
/* including simdjson/westmere/bitmanipulation.h: #include "simdjson/westmere/bitmanipulation.h" */
|
|
/* begin file simdjson/westmere/bitmanipulation.h */
|
|
#ifndef SIMDJSON_WESTMERE_BITMANIPULATION_H
|
|
#define SIMDJSON_WESTMERE_BITMANIPULATION_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/westmere/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/westmere/intrinsics.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace westmere {
|
|
namespace {
|
|
|
|
// We sometimes call trailing_zero on inputs that are zero,
|
|
// but the algorithms do not end up using the returned value.
|
|
// Sadly, sanitizers are not smart enough to figure it out.
|
|
SIMDJSON_NO_SANITIZE_UNDEFINED
|
|
// This function can be used safely even if not all bytes have been
|
|
// initialized.
|
|
// See issue https://github.com/simdjson/simdjson/issues/1965
|
|
SIMDJSON_NO_SANITIZE_MEMORY
|
|
simdjson_inline int trailing_zeroes(uint64_t input_num) {
|
|
#if SIMDJSON_REGULAR_VISUAL_STUDIO
|
|
unsigned long ret;
|
|
// Search the mask data from least significant bit (LSB)
|
|
// to the most significant bit (MSB) for a set bit (1).
|
|
_BitScanForward64(&ret, input_num);
|
|
return (int)ret;
|
|
#else // SIMDJSON_REGULAR_VISUAL_STUDIO
|
|
return __builtin_ctzll(input_num);
|
|
#endif // SIMDJSON_REGULAR_VISUAL_STUDIO
|
|
}
|
|
|
|
/* result might be undefined when input_num is zero */
|
|
simdjson_inline uint64_t clear_lowest_bit(uint64_t input_num) {
|
|
return input_num & (input_num-1);
|
|
}
|
|
|
|
/* result might be undefined when input_num is zero */
|
|
simdjson_inline int leading_zeroes(uint64_t input_num) {
|
|
#if SIMDJSON_REGULAR_VISUAL_STUDIO
|
|
unsigned long leading_zero = 0;
|
|
// Search the mask data from most significant bit (MSB)
|
|
// to least significant bit (LSB) for a set bit (1).
|
|
if (_BitScanReverse64(&leading_zero, input_num))
|
|
return (int)(63 - leading_zero);
|
|
else
|
|
return 64;
|
|
#else
|
|
return __builtin_clzll(input_num);
|
|
#endif// SIMDJSON_REGULAR_VISUAL_STUDIO
|
|
}
|
|
|
|
#if SIMDJSON_REGULAR_VISUAL_STUDIO
|
|
simdjson_inline unsigned __int64 count_ones(uint64_t input_num) {
|
|
// note: we do not support legacy 32-bit Windows in this kernel
|
|
return __popcnt64(input_num);// Visual Studio wants two underscores
|
|
}
|
|
#else
|
|
simdjson_inline long long int count_ones(uint64_t input_num) {
|
|
return _popcnt64(input_num);
|
|
}
|
|
#endif
|
|
|
|
// Adds value1 and value2, storing the 64-bit sum in *result.
// Returns the carry/overflow indication reported by the underlying intrinsic.
simdjson_inline bool add_overflow(uint64_t value1, uint64_t value2,
                                  uint64_t *result) {
#if SIMDJSON_REGULAR_VISUAL_STUDIO
  unsigned __int64 *const sum = reinterpret_cast<unsigned __int64 *>(result);
  return _addcarry_u64(0, value1, value2, sum);
#else
  unsigned long long *const sum = reinterpret_cast<unsigned long long *>(result);
  return __builtin_uaddll_overflow(value1, value2, sum);
#endif
}
|
|
|
|
} // unnamed namespace
|
|
} // namespace westmere
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_WESTMERE_BITMANIPULATION_H
|
|
/* end file simdjson/westmere/bitmanipulation.h */
|
|
/* including simdjson/westmere/simd.h: #include "simdjson/westmere/simd.h" */
|
|
/* begin file simdjson/westmere/simd.h */
|
|
#ifndef SIMDJSON_WESTMERE_SIMD_H
|
|
#define SIMDJSON_WESTMERE_SIMD_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/westmere/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/westmere/bitmanipulation.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/simdprune_tables.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace westmere {
|
|
namespace {
|
|
namespace simd {
|
|
|
|
template<typename Child>
|
|
struct base {
|
|
__m128i value;
|
|
|
|
// Zero constructor
|
|
simdjson_inline base() : value{__m128i()} {}
|
|
|
|
// Conversion from SIMD register
|
|
simdjson_inline base(const __m128i _value) : value(_value) {}
|
|
|
|
// Conversion to SIMD register
|
|
simdjson_inline operator const __m128i&() const { return this->value; }
|
|
simdjson_inline operator __m128i&() { return this->value; }
|
|
|
|
// Bit operations
|
|
simdjson_inline Child operator|(const Child other) const { return _mm_or_si128(*this, other); }
|
|
simdjson_inline Child operator&(const Child other) const { return _mm_and_si128(*this, other); }
|
|
simdjson_inline Child operator^(const Child other) const { return _mm_xor_si128(*this, other); }
|
|
simdjson_inline Child bit_andnot(const Child other) const { return _mm_andnot_si128(other, *this); }
|
|
simdjson_inline Child& operator|=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast | other; return *this_cast; }
|
|
simdjson_inline Child& operator&=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast & other; return *this_cast; }
|
|
simdjson_inline Child& operator^=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast ^ other; return *this_cast; }
|
|
};
|
|
|
|
template<typename T, typename Mask=simd8<bool>>
|
|
struct base8: base<simd8<T>> {
|
|
typedef uint16_t bitmask_t;
|
|
typedef uint32_t bitmask2_t;
|
|
|
|
simdjson_inline base8() : base<simd8<T>>() {}
|
|
simdjson_inline base8(const __m128i _value) : base<simd8<T>>(_value) {}
|
|
|
|
friend simdjson_inline Mask operator==(const simd8<T> lhs, const simd8<T> rhs) { return _mm_cmpeq_epi8(lhs, rhs); }
|
|
|
|
static const int SIZE = sizeof(base<simd8<T>>::value);
|
|
|
|
template<int N=1>
|
|
simdjson_inline simd8<T> prev(const simd8<T> prev_chunk) const {
|
|
return _mm_alignr_epi8(*this, prev_chunk, 16 - N);
|
|
}
|
|
};
|
|
|
|
// SIMD byte mask type (returned by things like eq and gt)
|
|
template<>
|
|
struct simd8<bool>: base8<bool> {
|
|
static simdjson_inline simd8<bool> splat(bool _value) { return _mm_set1_epi8(uint8_t(-(!!_value))); }
|
|
|
|
simdjson_inline simd8() : base8() {}
|
|
simdjson_inline simd8(const __m128i _value) : base8<bool>(_value) {}
|
|
// Splat constructor
|
|
simdjson_inline simd8(bool _value) : base8<bool>(splat(_value)) {}
|
|
|
|
simdjson_inline int to_bitmask() const { return _mm_movemask_epi8(*this); }
|
|
simdjson_inline bool any() const { return !_mm_testz_si128(*this, *this); }
|
|
simdjson_inline simd8<bool> operator~() const { return *this ^ true; }
|
|
};
|
|
|
|
template<typename T>
|
|
struct base8_numeric: base8<T> {
|
|
static simdjson_inline simd8<T> splat(T _value) { return _mm_set1_epi8(_value); }
|
|
static simdjson_inline simd8<T> zero() { return _mm_setzero_si128(); }
|
|
static simdjson_inline simd8<T> load(const T values[16]) {
|
|
return _mm_loadu_si128(reinterpret_cast<const __m128i *>(values));
|
|
}
|
|
// Repeat 16 values as many times as necessary (usually for lookup tables)
|
|
static simdjson_inline simd8<T> repeat_16(
|
|
T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7,
|
|
T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15
|
|
) {
|
|
return simd8<T>(
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15
|
|
);
|
|
}
|
|
|
|
simdjson_inline base8_numeric() : base8<T>() {}
|
|
simdjson_inline base8_numeric(const __m128i _value) : base8<T>(_value) {}
|
|
|
|
// Store to array
|
|
simdjson_inline void store(T dst[16]) const { return _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), *this); }
|
|
|
|
// Override to distinguish from bool version
|
|
simdjson_inline simd8<T> operator~() const { return *this ^ 0xFFu; }
|
|
|
|
// Addition/subtraction are the same for signed and unsigned
|
|
simdjson_inline simd8<T> operator+(const simd8<T> other) const { return _mm_add_epi8(*this, other); }
|
|
simdjson_inline simd8<T> operator-(const simd8<T> other) const { return _mm_sub_epi8(*this, other); }
|
|
simdjson_inline simd8<T>& operator+=(const simd8<T> other) { *this = *this + other; return *static_cast<simd8<T>*>(this); }
|
|
simdjson_inline simd8<T>& operator-=(const simd8<T> other) { *this = *this - other; return *static_cast<simd8<T>*>(this); }
|
|
|
|
// Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values)
|
|
template<typename L>
|
|
simdjson_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
|
|
return _mm_shuffle_epi8(lookup_table, *this);
|
|
}
|
|
|
|
// Copies to 'output" all bytes corresponding to a 0 in the mask (interpreted as a bitset).
|
|
// Passing a 0 value for mask would be equivalent to writing out every byte to output.
|
|
// Only the first 16 - count_ones(mask) bytes of the result are significant but 16 bytes
|
|
// get written.
|
|
// Design consideration: it seems like a function with the
|
|
// signature simd8<L> compress(uint32_t mask) would be
|
|
// sensible, but the AVX ISA makes this kind of approach difficult.
|
|
template<typename L>
|
|
simdjson_inline void compress(uint16_t mask, L * output) const {
|
|
using internal::thintable_epi8;
|
|
using internal::BitsSetTable256mul2;
|
|
using internal::pshufb_combine_table;
|
|
// this particular implementation was inspired by work done by @animetosho
|
|
// we do it in two steps, first 8 bytes and then second 8 bytes
|
|
uint8_t mask1 = uint8_t(mask); // least significant 8 bits
|
|
uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits
|
|
// next line just loads the 64-bit values thintable_epi8[mask1] and
|
|
// thintable_epi8[mask2] into a 128-bit register, using only
|
|
// two instructions on most compilers.
|
|
__m128i shufmask = _mm_set_epi64x(thintable_epi8[mask2], thintable_epi8[mask1]);
|
|
// we increment by 0x08 the second half of the mask
|
|
shufmask =
|
|
_mm_add_epi8(shufmask, _mm_set_epi32(0x08080808, 0x08080808, 0, 0));
|
|
// this is the version "nearly pruned"
|
|
__m128i pruned = _mm_shuffle_epi8(*this, shufmask);
|
|
// we still need to put the two halves together.
|
|
// we compute the popcount of the first half:
|
|
int pop1 = BitsSetTable256mul2[mask1];
|
|
// then load the corresponding mask, what it does is to write
|
|
// only the first pop1 bytes from the first 8 bytes, and then
|
|
// it fills in with the bytes from the second 8 bytes + some filling
|
|
// at the end.
|
|
__m128i compactmask =
|
|
_mm_loadu_si128(reinterpret_cast<const __m128i *>(pshufb_combine_table + pop1 * 8));
|
|
__m128i answer = _mm_shuffle_epi8(pruned, compactmask);
|
|
_mm_storeu_si128(reinterpret_cast<__m128i *>(output), answer);
|
|
}
|
|
|
|
template<typename L>
|
|
simdjson_inline simd8<L> lookup_16(
|
|
L replace0, L replace1, L replace2, L replace3,
|
|
L replace4, L replace5, L replace6, L replace7,
|
|
L replace8, L replace9, L replace10, L replace11,
|
|
L replace12, L replace13, L replace14, L replace15) const {
|
|
return lookup_16(simd8<L>::repeat_16(
|
|
replace0, replace1, replace2, replace3,
|
|
replace4, replace5, replace6, replace7,
|
|
replace8, replace9, replace10, replace11,
|
|
replace12, replace13, replace14, replace15
|
|
));
|
|
}
|
|
};
|
|
|
|
// Signed bytes
|
|
template<>
|
|
struct simd8<int8_t> : base8_numeric<int8_t> {
|
|
simdjson_inline simd8() : base8_numeric<int8_t>() {}
|
|
simdjson_inline simd8(const __m128i _value) : base8_numeric<int8_t>(_value) {}
|
|
// Splat constructor
|
|
simdjson_inline simd8(int8_t _value) : simd8(splat(_value)) {}
|
|
// Array constructor
|
|
simdjson_inline simd8(const int8_t* values) : simd8(load(values)) {}
|
|
// Member-by-member initialization
|
|
simdjson_inline simd8(
|
|
int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
|
|
int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
|
|
) : simd8(_mm_setr_epi8(
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15
|
|
)) {}
|
|
// Repeat 16 values as many times as necessary (usually for lookup tables)
|
|
simdjson_inline static simd8<int8_t> repeat_16(
|
|
int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
|
|
int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
|
|
) {
|
|
return simd8<int8_t>(
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15
|
|
);
|
|
}
|
|
|
|
// Order-sensitive comparisons
|
|
simdjson_inline simd8<int8_t> max_val(const simd8<int8_t> other) const { return _mm_max_epi8(*this, other); }
|
|
simdjson_inline simd8<int8_t> min_val(const simd8<int8_t> other) const { return _mm_min_epi8(*this, other); }
|
|
simdjson_inline simd8<bool> operator>(const simd8<int8_t> other) const { return _mm_cmpgt_epi8(*this, other); }
|
|
simdjson_inline simd8<bool> operator<(const simd8<int8_t> other) const { return _mm_cmpgt_epi8(other, *this); }
|
|
};
|
|
|
|
// Unsigned bytes
|
|
template<>
|
|
struct simd8<uint8_t>: base8_numeric<uint8_t> {
|
|
simdjson_inline simd8() : base8_numeric<uint8_t>() {}
|
|
simdjson_inline simd8(const __m128i _value) : base8_numeric<uint8_t>(_value) {}
|
|
// Splat constructor
|
|
simdjson_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
|
|
// Array constructor
|
|
simdjson_inline simd8(const uint8_t* values) : simd8(load(values)) {}
|
|
// Member-by-member initialization
|
|
simdjson_inline simd8(
|
|
uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
|
|
uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
|
|
) : simd8(_mm_setr_epi8(
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15
|
|
)) {}
|
|
// Repeat 16 values as many times as necessary (usually for lookup tables)
|
|
simdjson_inline static simd8<uint8_t> repeat_16(
|
|
uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
|
|
uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
|
|
) {
|
|
return simd8<uint8_t>(
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15
|
|
);
|
|
}
|
|
|
|
// Saturated math
|
|
simdjson_inline simd8<uint8_t> saturating_add(const simd8<uint8_t> other) const { return _mm_adds_epu8(*this, other); }
|
|
simdjson_inline simd8<uint8_t> saturating_sub(const simd8<uint8_t> other) const { return _mm_subs_epu8(*this, other); }
|
|
|
|
// Order-specific operations
|
|
simdjson_inline simd8<uint8_t> max_val(const simd8<uint8_t> other) const { return _mm_max_epu8(*this, other); }
|
|
simdjson_inline simd8<uint8_t> min_val(const simd8<uint8_t> other) const { return _mm_min_epu8(*this, other); }
|
|
// Same as >, but only guarantees true is nonzero (< guarantees true = -1)
|
|
simdjson_inline simd8<uint8_t> gt_bits(const simd8<uint8_t> other) const { return this->saturating_sub(other); }
|
|
// Same as <, but only guarantees true is nonzero (< guarantees true = -1)
|
|
simdjson_inline simd8<uint8_t> lt_bits(const simd8<uint8_t> other) const { return other.saturating_sub(*this); }
|
|
simdjson_inline simd8<bool> operator<=(const simd8<uint8_t> other) const { return other.max_val(*this) == other; }
|
|
simdjson_inline simd8<bool> operator>=(const simd8<uint8_t> other) const { return other.min_val(*this) == other; }
|
|
simdjson_inline simd8<bool> operator>(const simd8<uint8_t> other) const { return this->gt_bits(other).any_bits_set(); }
|
|
simdjson_inline simd8<bool> operator<(const simd8<uint8_t> other) const { return this->gt_bits(other).any_bits_set(); }
|
|
|
|
// Bit-specific operations
|
|
// Per-byte mask: true where the byte is exactly zero.
simdjson_inline simd8<bool> bits_not_set() const {
  return (*this == uint8_t(0));
}
|
|
// Per-byte mask: true where none of the bits selected by `bits` are set.
simdjson_inline simd8<bool> bits_not_set(simd8<uint8_t> bits) const {
  const simd8<uint8_t> selected = *this & bits;
  return selected.bits_not_set();
}
|
|
// Per-byte mask: true where the byte is nonzero (complement of bits_not_set()).
simdjson_inline simd8<bool> any_bits_set() const {
  return ~bits_not_set();
}
|
|
// Per-byte mask: true where at least one bit selected by `bits` is set.
simdjson_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const {
  return ~bits_not_set(bits);
}
|
|
// True when no byte in the register has its high bit set.
simdjson_inline bool is_ascii() const {
  const int high_bits = _mm_movemask_epi8(*this);
  return high_bits == 0;
}
|
|
// True when the whole register is zero.
simdjson_inline bool bits_not_set_anywhere() const {
  return _mm_testz_si128(*this, *this) != 0;
}
|
|
// True when at least one bit of the register is set.
simdjson_inline bool any_bits_set_anywhere() const {
  const bool all_clear = bits_not_set_anywhere();
  return !all_clear;
}
|
|
// True when (*this & bits) is zero across the whole register.
simdjson_inline bool bits_not_set_anywhere(simd8<uint8_t> bits) const {
  return _mm_testz_si128(*this, bits) != 0;
}
|
|
// True when (*this & bits) has at least one bit set somewhere in the register.
simdjson_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const {
  const bool all_clear = bits_not_set_anywhere(bits);
  return !all_clear;
}
|
|
template<int N>
|
|
simdjson_inline simd8<uint8_t> shr() const { return simd8<uint8_t>(_mm_srli_epi16(*this, N)) & uint8_t(0xFFu >> N); }
|
|
template<int N>
|
|
simdjson_inline simd8<uint8_t> shl() const { return simd8<uint8_t>(_mm_slli_epi16(*this, N)) & uint8_t(0xFFu << N); }
|
|
// Get one of the bits and make a bitmask out of it.
|
|
// e.g. value.get_bit<7>() gets the high bit
|
|
template<int N>
|
|
simdjson_inline int get_bit() const { return _mm_movemask_epi8(_mm_slli_epi16(*this, 7-N)); }
|
|
};
|
|
|
|
template<typename T>
|
|
struct simd8x64 {
|
|
static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
|
|
static_assert(NUM_CHUNKS == 4, "Westmere kernel should use four registers per 64-byte block.");
|
|
const simd8<T> chunks[NUM_CHUNKS];
|
|
|
|
simd8x64(const simd8x64<T>& o) = delete; // no copy allowed
|
|
simd8x64<T>& operator=(const simd8<T>& other) = delete; // no assignment allowed
|
|
simd8x64() = delete; // no default constructor allowed
|
|
|
|
simdjson_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1, const simd8<T> chunk2, const simd8<T> chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {}
|
|
simdjson_inline simd8x64(const T ptr[64]) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+16), simd8<T>::load(ptr+32), simd8<T>::load(ptr+48)} {}
|
|
|
|
simdjson_inline void store(T ptr[64]) const {
|
|
this->chunks[0].store(ptr+sizeof(simd8<T>)*0);
|
|
this->chunks[1].store(ptr+sizeof(simd8<T>)*1);
|
|
this->chunks[2].store(ptr+sizeof(simd8<T>)*2);
|
|
this->chunks[3].store(ptr+sizeof(simd8<T>)*3);
|
|
}
|
|
|
|
simdjson_inline simd8<T> reduce_or() const {
|
|
return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]);
|
|
}
|
|
|
|
simdjson_inline uint64_t compress(uint64_t mask, T * output) const {
|
|
this->chunks[0].compress(uint16_t(mask), output);
|
|
this->chunks[1].compress(uint16_t(mask >> 16), output + 16 - count_ones(mask & 0xFFFF));
|
|
this->chunks[2].compress(uint16_t(mask >> 32), output + 32 - count_ones(mask & 0xFFFFFFFF));
|
|
this->chunks[3].compress(uint16_t(mask >> 48), output + 48 - count_ones(mask & 0xFFFFFFFFFFFF));
|
|
return 64 - count_ones(mask);
|
|
}
|
|
|
|
simdjson_inline uint64_t to_bitmask() const {
|
|
uint64_t r0 = uint32_t(this->chunks[0].to_bitmask() );
|
|
uint64_t r1 = this->chunks[1].to_bitmask() ;
|
|
uint64_t r2 = this->chunks[2].to_bitmask() ;
|
|
uint64_t r3 = this->chunks[3].to_bitmask() ;
|
|
return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48);
|
|
}
|
|
|
|
simdjson_inline uint64_t eq(const T m) const {
|
|
const simd8<T> mask = simd8<T>::splat(m);
|
|
return simd8x64<bool>(
|
|
this->chunks[0] == mask,
|
|
this->chunks[1] == mask,
|
|
this->chunks[2] == mask,
|
|
this->chunks[3] == mask
|
|
).to_bitmask();
|
|
}
|
|
|
|
simdjson_inline uint64_t eq(const simd8x64<uint8_t> &other) const {
|
|
return simd8x64<bool>(
|
|
this->chunks[0] == other.chunks[0],
|
|
this->chunks[1] == other.chunks[1],
|
|
this->chunks[2] == other.chunks[2],
|
|
this->chunks[3] == other.chunks[3]
|
|
).to_bitmask();
|
|
}
|
|
|
|
simdjson_inline uint64_t lteq(const T m) const {
|
|
const simd8<T> mask = simd8<T>::splat(m);
|
|
return simd8x64<bool>(
|
|
this->chunks[0] <= mask,
|
|
this->chunks[1] <= mask,
|
|
this->chunks[2] <= mask,
|
|
this->chunks[3] <= mask
|
|
).to_bitmask();
|
|
}
|
|
}; // struct simd8x64<T>
|
|
|
|
} // namespace simd
|
|
} // unnamed namespace
|
|
} // namespace westmere
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_WESTMERE_SIMD_INPUT_H
|
|
/* end file simdjson/westmere/simd.h */
|
|
|
|
namespace simdjson {
|
|
namespace westmere {
|
|
namespace {
|
|
|
|
using namespace simd;
|
|
|
|
// Holds backslashes and quotes locations.
|
|
struct backslash_and_quote {
|
|
public:
|
|
static constexpr uint32_t BYTES_PROCESSED = 32;
|
|
simdjson_inline static backslash_and_quote copy_and_find(const uint8_t *src, uint8_t *dst);
|
|
|
|
simdjson_inline bool has_quote_first() { return ((bs_bits - 1) & quote_bits) != 0; }
|
|
simdjson_inline bool has_backslash() { return bs_bits != 0; }
|
|
simdjson_inline int quote_index() { return trailing_zeroes(quote_bits); }
|
|
simdjson_inline int backslash_index() { return trailing_zeroes(bs_bits); }
|
|
|
|
uint32_t bs_bits;
|
|
uint32_t quote_bits;
|
|
}; // struct backslash_and_quote
|
|
|
|
// Copies 32 bytes from src to dst and reports where backslashes and quotes occur.
// Always reads and writes the full 32 bytes, so it may touch up to 31 bytes past
// the logical end of the data; SIMDJSON_PADDING is required to cover that.
simdjson_inline backslash_and_quote backslash_and_quote::copy_and_find(const uint8_t *src, uint8_t *dst) {
  static_assert(SIMDJSON_PADDING >= (BYTES_PROCESSED - 1), "backslash and quote finder must process fewer than SIMDJSON_PADDING bytes");

  simd8<uint8_t> low_half(src);
  simd8<uint8_t> high_half(src + 16);
  low_half.store(dst);
  high_half.store(dst + 16);

  // One 64-bit mask: backslash hits in the low 32 bits, quote hits in the high 32.
  const uint64_t combined = simd8x64<bool>(
    low_half == '\\', high_half == '\\',
    low_half == '"',  high_half == '"'
  ).to_bitmask();

  const uint32_t backslash_mask = uint32_t(combined);
  const uint32_t quote_mask = uint32_t(combined >> 32);
  return { backslash_mask, quote_mask };
}
|
|
|
|
} // unnamed namespace
|
|
} // namespace westmere
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_WESTMERE_STRINGPARSING_DEFS_H
|
|
/* end file simdjson/westmere/stringparsing_defs.h */
|
|
/* end file simdjson/westmere/begin.h */
|
|
/* including generic/amalgamated.h for westmere: #include <generic/amalgamated.h> */
|
|
/* begin file generic/amalgamated.h for westmere */
|
|
#if defined(SIMDJSON_CONDITIONAL_INCLUDE) && !defined(SIMDJSON_SRC_GENERIC_DEPENDENCIES_H)
|
|
#error generic/dependencies.h must be included before generic/amalgamated.h!
|
|
#endif
|
|
|
|
/* including generic/base.h for westmere: #include <generic/base.h> */
|
|
/* begin file generic/base.h for westmere */
|
|
#ifndef SIMDJSON_SRC_GENERIC_BASE_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_BASE_H */
|
|
/* amalgamation skipped (editor-only): #include <base.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/generic/base.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace westmere {
|
|
namespace {
|
|
|
|
// Forward declaration only; the struct is defined in generic/json_character_block.h.
struct json_character_block;
|
|
|
|
} // unnamed namespace
|
|
} // namespace westmere
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_BASE_H
|
|
/* end file generic/base.h for westmere */
|
|
/* including generic/dom_parser_implementation.h for westmere: #include <generic/dom_parser_implementation.h> */
|
|
/* begin file generic/dom_parser_implementation.h for westmere */
|
|
#ifndef SIMDJSON_SRC_GENERIC_DOM_PARSER_IMPLEMENTATION_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_DOM_PARSER_IMPLEMENTATION_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/generic/dom_parser_implementation.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
// Interface a dom parser implementation must fulfill
|
|
namespace simdjson {
|
|
namespace westmere {
|
|
namespace {
|
|
|
|
// Declaration only; the definition is not in this header (presumably supplied by the
// architecture-specific kernel -- verify). Takes the input shifted back by 2 and by 3
// bytes; check_multibyte_lengths() keeps only bit 0x80 of each returned byte.
simdjson_inline simd8<uint8_t> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3);
|
|
// Declaration only; the definition is not in this header (presumably supplied by the
// architecture-specific kernel -- verify). Judging by the name, reports whether the
// whole 64-byte block is ASCII.
simdjson_inline bool is_ascii(const simd8x64<uint8_t>& input);
|
|
|
|
} // unnamed namespace
|
|
} // namespace westmere
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_DOM_PARSER_IMPLEMENTATION_H
|
|
/* end file generic/dom_parser_implementation.h for westmere */
|
|
/* including generic/json_character_block.h for westmere: #include <generic/json_character_block.h> */
|
|
/* begin file generic/json_character_block.h for westmere */
|
|
#ifndef SIMDJSON_SRC_GENERIC_JSON_CHARACTER_BLOCK_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_JSON_CHARACTER_BLOCK_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/base.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace westmere {
|
|
namespace {
|
|
|
|
struct json_character_block {
|
|
static simdjson_inline json_character_block classify(const simd::simd8x64<uint8_t>& in);
|
|
|
|
simdjson_inline uint64_t whitespace() const noexcept { return _whitespace; }
|
|
simdjson_inline uint64_t op() const noexcept { return _op; }
|
|
simdjson_inline uint64_t scalar() const noexcept { return ~(op() | whitespace()); }
|
|
|
|
uint64_t _whitespace;
|
|
uint64_t _op;
|
|
};
|
|
|
|
} // unnamed namespace
|
|
} // namespace westmere
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_JSON_CHARACTER_BLOCK_H
|
|
/* end file generic/json_character_block.h for westmere */
|
|
/* end file generic/amalgamated.h for westmere */
|
|
/* including generic/stage1/amalgamated.h for westmere: #include <generic/stage1/amalgamated.h> */
|
|
/* begin file generic/stage1/amalgamated.h for westmere */
|
|
// Stuff other things depend on
|
|
/* including generic/stage1/base.h for westmere: #include <generic/stage1/base.h> */
|
|
/* begin file generic/stage1/base.h for westmere */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_BASE_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_BASE_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/base.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace westmere {
|
|
namespace {
|
|
namespace stage1 {
|
|
|
|
// Forward declarations of the stage 1 types, grouped by class-key.
class bit_indexer;
class json_minifier;
class json_scanner;
class json_string_scanner;
class json_structural_indexer;

struct json_block;
struct json_string_block;
template<size_t STEP_SIZE>
struct buf_block_reader;
|
|
|
|
} // namespace stage1
|
|
|
|
// Forward-declare the UTF-8 checker in its own namespace ...
namespace utf8_validation { struct utf8_checker; } // namespace utf8_validation

// ... and make it nameable without qualification from the enclosing namespace.
using utf8_validation::utf8_checker;
|
|
|
|
} // unnamed namespace
|
|
} // namespace westmere
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE1_BASE_H
|
|
/* end file generic/stage1/base.h for westmere */
|
|
/* including generic/stage1/buf_block_reader.h for westmere: #include <generic/stage1/buf_block_reader.h> */
|
|
/* begin file generic/stage1/buf_block_reader.h for westmere */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_BUF_BLOCK_READER_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_BUF_BLOCK_READER_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/base.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
#include <cstring>
|
|
|
|
namespace simdjson {
|
|
namespace westmere {
|
|
namespace {
|
|
namespace stage1 {
|
|
|
|
// Walks through a buffer in block-sized increments, loading the last part with spaces
|
|
template<size_t STEP_SIZE>
|
|
struct buf_block_reader {
|
|
public:
|
|
simdjson_inline buf_block_reader(const uint8_t *_buf, size_t _len);
|
|
simdjson_inline size_t block_index();
|
|
simdjson_inline bool has_full_block() const;
|
|
simdjson_inline const uint8_t *full_block() const;
|
|
/**
|
|
* Get the last block, padded with spaces.
|
|
*
|
|
* There will always be a last block, with at least 1 byte, unless len == 0 (in which case this
|
|
* function fills the buffer with spaces and returns 0. In particular, if len == STEP_SIZE there
|
|
* will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding.
|
|
*
|
|
* @return the number of effective characters in the last block.
|
|
*/
|
|
simdjson_inline size_t get_remainder(uint8_t *dst) const;
|
|
simdjson_inline void advance();
|
|
private:
|
|
const uint8_t *buf;
|
|
const size_t len;
|
|
const size_t lenminusstep;
|
|
size_t idx;
|
|
};
|
|
|
|
// Routines to print masks and text for debugging bitmask operations
|
|
// Debug helper: renders 64 raw bytes as printable text. Bytes that compare below
// ' ' when viewed as int8_t (control characters and bytes >= 0x80) become '_'.
// Returns a pointer to a function-local static buffer, overwritten on every call.
simdjson_unused static char * format_input_text_64(const uint8_t *text) {
  constexpr size_t block_bytes = sizeof(simd8x64<uint8_t>);
  static char buf[block_bytes + 1];
  for (size_t pos = 0; pos != block_bytes; ++pos) {
    const int8_t ch = int8_t(text[pos]);
    if (ch < ' ') {
      buf[pos] = '_';
    } else {
      buf[pos] = char(ch);
    }
  }
  buf[block_bytes] = '\0';
  return buf;
}
|
|
|
|
// Routines to print masks and text for debugging bitmask operations
|
|
// Debug helper: renders a 64-byte block as text, replacing every character that
// compares below ' ' with '_'. Returns a pointer to a function-local static
// buffer, overwritten on every call.
simdjson_unused static char * format_input_text(const simd8x64<uint8_t>& in) {
  constexpr size_t block_bytes = sizeof(simd8x64<uint8_t>);
  static char buf[block_bytes + 1];
  in.store(reinterpret_cast<uint8_t*>(buf));
  char * const end = buf + block_bytes;
  for (char *cursor = buf; cursor != end; ++cursor) {
    if (*cursor < ' ') { *cursor = '_'; }
  }
  *end = '\0';
  return buf;
}
|
|
|
|
// Debug helper: renders only the bytes of a 64-byte block selected by mask.
// Selected bytes at or below ' ' are shown as '_'; unselected bytes are shown as ' '.
// Returns a pointer to a function-local static buffer, overwritten on every call.
simdjson_unused static char * format_input_text(const simd8x64<uint8_t>& in, uint64_t mask) {
  static char buf[sizeof(simd8x64<uint8_t>) + 1];
  in.store(reinterpret_cast<uint8_t*>(buf));
  for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
    if (buf[i] <= ' ') { buf[i] = '_'; }
    // The probe bit must be 64 bits wide: i reaches 63, and shifting a 32-bit
    // size_t by 32 or more is undefined behaviour.
    if (!(mask & (uint64_t(1) << i))) { buf[i] = ' '; }
  }
  buf[sizeof(simd8x64<uint8_t>)] = '\0';
  return buf;
}
|
|
|
|
// Debug helper: renders a 64-bit mask as 64 characters, 'X' for a set bit and ' '
// for a clear bit, with bit 0 first. Returns a pointer to a function-local static
// buffer, overwritten on every call.
simdjson_unused static char * format_mask(uint64_t mask) {
  static char buf[sizeof(simd8x64<uint8_t>) + 1];
  for (size_t i=0; i<64; i++) {
    // The probe bit must be 64 bits wide: i reaches 63, and shifting a 32-bit
    // size_t by 32 or more is undefined behaviour.
    buf[i] = (mask & (uint64_t(1) << i)) ? 'X' : ' ';
  }
  buf[64] = '\0';
  return buf;
}
|
|
|
|
template<size_t STEP_SIZE>
|
|
simdjson_inline buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {}
|
|
|
|
template<size_t STEP_SIZE>
|
|
simdjson_inline size_t buf_block_reader<STEP_SIZE>::block_index() { return idx; }
|
|
|
|
template<size_t STEP_SIZE>
|
|
simdjson_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const {
|
|
return idx < lenminusstep;
|
|
}
|
|
|
|
template<size_t STEP_SIZE>
|
|
simdjson_inline const uint8_t *buf_block_reader<STEP_SIZE>::full_block() const {
|
|
return &buf[idx];
|
|
}
|
|
|
|
template<size_t STEP_SIZE>
|
|
simdjson_inline size_t buf_block_reader<STEP_SIZE>::get_remainder(uint8_t *dst) const {
|
|
if(len == idx) { return 0; } // memcpy(dst, null, 0) will trigger an error with some sanitizers
|
|
std::memset(dst, 0x20, STEP_SIZE); // std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once.
|
|
std::memcpy(dst, buf + idx, len - idx);
|
|
return len - idx;
|
|
}
|
|
|
|
template<size_t STEP_SIZE>
|
|
simdjson_inline void buf_block_reader<STEP_SIZE>::advance() {
|
|
idx += STEP_SIZE;
|
|
}
|
|
|
|
} // namespace stage1
|
|
} // unnamed namespace
|
|
} // namespace westmere
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE1_BUF_BLOCK_READER_H
|
|
/* end file generic/stage1/buf_block_reader.h for westmere */
|
|
/* including generic/stage1/json_escape_scanner.h for westmere: #include <generic/stage1/json_escape_scanner.h> */
|
|
/* begin file generic/stage1/json_escape_scanner.h for westmere */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_JSON_ESCAPE_SCANNER_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_JSON_ESCAPE_SCANNER_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/buf_block_reader.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace westmere {
|
|
namespace {
|
|
namespace stage1 {
|
|
|
|
/**
|
|
* Scans for escape characters in JSON, taking care with multiple backslashes (\\n vs. \n).
|
|
*/
|
|
struct json_escape_scanner {
|
|
/** The actual escape characters (the backslashes themselves). */
|
|
uint64_t next_is_escaped = 0ULL;
|
|
|
|
struct escaped_and_escape {
|
|
/**
|
|
* Mask of escaped characters.
|
|
*
|
|
* ```
|
|
* \n \\n \\\n \\\\n \
|
|
* 0100100010100101000
|
|
* n \ \ n \ \
|
|
* ```
|
|
*/
|
|
uint64_t escaped;
|
|
/**
|
|
* Mask of escape characters.
|
|
*
|
|
* ```
|
|
* \n \\n \\\n \\\\n \
|
|
* 1001000101001010001
|
|
* \ \ \ \ \ \ \
|
|
* ```
|
|
*/
|
|
uint64_t escape;
|
|
};
|
|
|
|
/**
|
|
* Get a mask of both escape and escaped characters (the characters following a backslash).
|
|
*
|
|
* @param potential_escape A mask of the character that can escape others (but could be
|
|
* escaped itself). e.g. block.eq('\\')
|
|
*/
|
|
simdjson_really_inline escaped_and_escape next(uint64_t backslash) noexcept {
|
|
|
|
#if !SIMDJSON_SKIP_BACKSLASH_SHORT_CIRCUIT
|
|
if (!backslash) { return {next_escaped_without_backslashes(), 0}; }
|
|
#endif
|
|
|
|
// | | Mask (shows characters instead of 1's) | Depth | Instructions |
|
|
// |--------------------------------|----------------------------------------|-------|---------------------|
|
|
// | string | `\\n_\\\n___\\\n___\\\\___\\\\__\\\` | | |
|
|
// | | ` even odd even odd odd` | | |
|
|
// | potential_escape | ` \ \\\ \\\ \\\\ \\\\ \\\` | 1 | 1 (backslash & ~first_is_escaped)
|
|
// | escape_and_terminal_code | ` \n \ \n \ \n \ \ \ \ \ \` | 5 | 5 (next_escape_and_terminal_code())
|
|
// | escaped | `\ \ n \ n \ \ \ \ \ ` X | 6 | 7 (escape_and_terminal_code ^ (potential_escape | first_is_escaped))
|
|
// | escape | ` \ \ \ \ \ \ \ \ \ \` | 6 | 8 (escape_and_terminal_code & backslash)
|
|
// | first_is_escaped | `\ ` | 7 (*) | 9 (escape >> 63) ()
|
|
// (*) this is not needed until the next iteration
|
|
uint64_t escape_and_terminal_code = next_escape_and_terminal_code(backslash & ~this->next_is_escaped);
|
|
uint64_t escaped = escape_and_terminal_code ^ (backslash | this->next_is_escaped);
|
|
uint64_t escape = escape_and_terminal_code & backslash;
|
|
this->next_is_escaped = escape >> 63;
|
|
return {escaped, escape};
|
|
}
|
|
|
|
private:
|
|
static constexpr const uint64_t ODD_BITS = 0xAAAAAAAAAAAAAAAAULL;
|
|
|
|
simdjson_really_inline uint64_t next_escaped_without_backslashes() noexcept {
|
|
uint64_t escaped = this->next_is_escaped;
|
|
this->next_is_escaped = 0;
|
|
return escaped;
|
|
}
|
|
|
|
/**
|
|
* Returns a mask of the next escape characters (masking out escaped backslashes), along with
|
|
* any non-backslash escape codes.
|
|
*
|
|
* \n \\n \\\n \\\\n returns:
|
|
* \n \ \ \n \ \
|
|
* 11 100 1011 10100
|
|
*
|
|
* You are expected to mask out the first bit yourself if the previous block had a trailing
|
|
* escape.
|
|
*
|
|
* & the result with potential_escape to get just the escape characters.
|
|
* ^ the result with (potential_escape | first_is_escaped) to get escaped characters.
|
|
*/
|
|
static simdjson_really_inline uint64_t next_escape_and_terminal_code(uint64_t potential_escape) noexcept {
|
|
// If we were to just shift and mask out any odd bits, we'd actually get a *half* right answer:
|
|
// any even-aligned backslash runs would be correct! Odd-aligned backslash runs would be
|
|
// inverted (\\\ would be 010 instead of 101).
|
|
//
|
|
// ```
|
|
// string: | ____\\\\_\\\\_____ |
|
|
// maybe_escaped | ODD | \ \ \ \ |
|
|
// even-aligned ^^^ ^^^^ odd-aligned
|
|
// ```
|
|
//
|
|
// Taking that into account, our basic strategy is:
|
|
//
|
|
// 1. Use subtraction to produce a mask with 1's for even-aligned runs and 0's for
|
|
// odd-aligned runs.
|
|
// 2. XOR all odd bits, which masks out the odd bits in even-aligned runs, and brings IN the
|
|
// odd bits in odd-aligned runs.
|
|
// 3. & with backslash to clean up any stray bits.
|
|
// runs are set to 0, and then XORing with "odd":
|
|
//
|
|
// | | Mask (shows characters instead of 1's) | Instructions |
|
|
// |--------------------------------|----------------------------------------|---------------------|
|
|
// | string | `\\n_\\\n___\\\n___\\\\___\\\\__\\\` |
|
|
// | | ` even odd even odd odd` |
|
|
// | maybe_escaped | ` n \\n \\n \\\_ \\\_ \\` X | 1 (potential_escape << 1)
|
|
// | maybe_escaped_and_odd | ` \n_ \\n _ \\\n_ _ \\\__ _\\\_ \\\` | 1 (maybe_escaped | odd)
|
|
// | even_series_codes_and_odd | ` n_\\\ _ n_ _\\\\ _ _ ` | 1 (maybe_escaped_and_odd - potential_escape)
|
|
// | escape_and_terminal_code | ` \n \ \n \ \n \ \ \ \ \ \` | 1 (^ odd)
|
|
//
|
|
|
|
// Escaped characters are characters following an escape.
|
|
uint64_t maybe_escaped = potential_escape << 1;
|
|
|
|
// To distinguish odd from even escape sequences, therefore, we turn on any *starting*
|
|
// escapes that are on an odd byte. (We actually bring in all odd bits, for speed.)
|
|
// - Odd runs of backslashes are 0000, and the code at the end ("n" in \n or \\n) is 1.
|
|
// - Odd runs of backslashes are 1111, and the code at the end ("n" in \n or \\n) is 0.
|
|
// - All other odd bytes are 1, and even bytes are 0.
|
|
uint64_t maybe_escaped_and_odd_bits = maybe_escaped | ODD_BITS;
|
|
uint64_t even_series_codes_and_odd_bits = maybe_escaped_and_odd_bits - potential_escape;
|
|
|
|
// Now we flip all odd bytes back with xor. This:
|
|
// - Makes odd runs of backslashes go from 0000 to 1010
|
|
// - Makes even runs of backslashes go from 1111 to 1010
|
|
// - Sets actually-escaped codes to 1 (the n in \n and \\n: \n = 11, \\n = 100)
|
|
// - Resets all other bytes to 0
|
|
return even_series_codes_and_odd_bits ^ ODD_BITS;
|
|
}
|
|
};
|
|
|
|
} // namespace stage1
|
|
} // unnamed namespace
|
|
} // namespace westmere
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE1_JSON_ESCAPE_SCANNER_H
|
|
/* end file generic/stage1/json_escape_scanner.h for westmere */
|
|
/* including generic/stage1/json_string_scanner.h for westmere: #include <generic/stage1/json_string_scanner.h> */
|
|
/* begin file generic/stage1/json_string_scanner.h for westmere */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_JSON_STRING_SCANNER_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_JSON_STRING_SCANNER_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/json_escape_scanner.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace westmere {
|
|
namespace {
|
|
namespace stage1 {
|
|
|
|
struct json_string_block {
|
|
// We spell out the constructors in the hope of resolving inlining issues with Visual Studio 2017
|
|
simdjson_really_inline json_string_block(uint64_t escaped, uint64_t quote, uint64_t in_string) :
|
|
_escaped(escaped), _quote(quote), _in_string(in_string) {}
|
|
|
|
// Escaped characters (characters following an escape() character)
|
|
simdjson_really_inline uint64_t escaped() const { return _escaped; }
|
|
// Real (non-backslashed) quotes
|
|
simdjson_really_inline uint64_t quote() const { return _quote; }
|
|
// Only characters inside the string (not including the quotes)
|
|
simdjson_really_inline uint64_t string_content() const { return _in_string & ~_quote; }
|
|
// Return a mask of whether the given characters are inside a string (only works on non-quotes)
|
|
simdjson_really_inline uint64_t non_quote_inside_string(uint64_t mask) const { return mask & _in_string; }
|
|
// Return a mask of whether the given characters are inside a string (only works on non-quotes)
|
|
simdjson_really_inline uint64_t non_quote_outside_string(uint64_t mask) const { return mask & ~_in_string; }
|
|
// Tail of string (everything except the start quote)
|
|
simdjson_really_inline uint64_t string_tail() const { return _in_string ^ _quote; }
|
|
|
|
// escaped characters (backslashed--does not include the hex characters after \u)
|
|
uint64_t _escaped;
|
|
// real quotes (non-escaped ones)
|
|
uint64_t _quote;
|
|
// string characters (includes start quote but not end quote)
|
|
uint64_t _in_string;
|
|
};
|
|
|
|
// Scans blocks for string characters, storing the state necessary to do so
|
|
class json_string_scanner {
|
|
public:
|
|
simdjson_really_inline json_string_block next(const simd::simd8x64<uint8_t>& in);
|
|
// Returns either UNCLOSED_STRING or SUCCESS
|
|
simdjson_really_inline error_code finish();
|
|
|
|
private:
|
|
// Scans for escape characters
|
|
json_escape_scanner escape_scanner{};
|
|
// Whether the last iteration was still inside a string (all 1's = true, all 0's = false).
|
|
uint64_t prev_in_string = 0ULL;
|
|
};
|
|
|
|
//
|
|
// Return a mask of all string characters plus end quotes.
|
|
//
|
|
// prev_escaped is overflow saying whether the next character is escaped.
|
|
// prev_in_string is overflow saying whether we're still in a string.
|
|
//
|
|
// Backslash sequences outside of quotes will be detected in stage 2.
|
|
//
|
|
simdjson_really_inline json_string_block json_string_scanner::next(const simd::simd8x64<uint8_t>& in) {
|
|
const uint64_t backslash = in.eq('\\');
|
|
const uint64_t escaped = escape_scanner.next(backslash).escaped;
|
|
const uint64_t quote = in.eq('"') & ~escaped;
|
|
|
|
//
|
|
// prefix_xor flips on bits inside the string (and flips off the end quote).
|
|
//
|
|
// Then we xor with prev_in_string: if we were in a string already, its effect is flipped
|
|
// (characters inside strings are outside, and characters outside strings are inside).
|
|
//
|
|
const uint64_t in_string = prefix_xor(quote) ^ prev_in_string;
|
|
|
|
//
|
|
// Check if we're still in a string at the end of the box so the next block will know
|
|
//
|
|
prev_in_string = uint64_t(static_cast<int64_t>(in_string) >> 63);
|
|
|
|
// Use ^ to turn the beginning quote off, and the end quote on.
|
|
|
|
// We are returning a function-local object so either we get a move constructor
|
|
// or we get copy elision.
|
|
return json_string_block(escaped, quote, in_string);
|
|
}
|
|
|
|
// Reports whether the input ended in the middle of a string.
simdjson_really_inline error_code json_string_scanner::finish() {
  return prev_in_string ? UNCLOSED_STRING : SUCCESS;
}
|
|
|
|
} // namespace stage1
|
|
} // unnamed namespace
|
|
} // namespace westmere
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE1_JSON_STRING_SCANNER_H
|
|
/* end file generic/stage1/json_string_scanner.h for westmere */
|
|
/* including generic/stage1/utf8_lookup4_algorithm.h for westmere: #include <generic/stage1/utf8_lookup4_algorithm.h> */
|
|
/* begin file generic/stage1/utf8_lookup4_algorithm.h for westmere */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_UTF8_LOOKUP4_ALGORITHM_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_UTF8_LOOKUP4_ALGORITHM_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/dom_parser_implementation.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace westmere {
|
|
namespace {
|
|
namespace utf8_validation {
|
|
|
|
using namespace simd;
|
|
|
|
// Classifies every (previous byte, current byte) pair with three 16-entry nibble
// lookups -- high nibble of the previous byte, low nibble of the previous byte and
// high nibble of the current byte -- and ANDs the three results. A bit survives in
// a byte only when all three tables agree that its error class applies there.
simdjson_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
  // One bit per error class, listed by bit index. Bit patterns show byte 1 then byte 2.
  constexpr const uint8_t TOO_SHORT      = 1<<0; // 11______ 0_______
                                                 // 11______ 11______
                                                 // (lead byte followed by lead byte/ASCII)
  constexpr const uint8_t TOO_LONG       = 1<<1; // 0_______ 10______  (ASCII followed by continuation)
  constexpr const uint8_t OVERLONG_3     = 1<<2; // 11100000 100_____
  constexpr const uint8_t TOO_LARGE      = 1<<3; // 11110100 1001____
                                                 // 11110100 101_____
                                                 // 11110101 1001____
                                                 // 11110101 101_____
                                                 // 1111011_ 1001____
                                                 // 1111011_ 101_____
                                                 // 11111___ 1001____
                                                 // 11111___ 101_____
  constexpr const uint8_t SURROGATE      = 1<<4; // 11101101 101_____
  constexpr const uint8_t OVERLONG_2     = 1<<5; // 1100000_ 10______
  constexpr const uint8_t TOO_LARGE_1000 = 1<<6;
                                                 // 11110101 1000____
                                                 // 1111011_ 1000____
                                                 // 11111___ 1000____
  constexpr const uint8_t OVERLONG_4     = 1<<6; // 11110000 1000____  (shares bit 6 with TOO_LARGE_1000)
  constexpr const uint8_t TWO_CONTS      = 1<<7; // 10______ 10______

  // These classes do not depend on the low nibble of byte 1 (____ in byte 1).
  constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS;
  // Shorthand for combinations that repeat in the tables below.
  constexpr const uint8_t LOW_LARGE       = CARRY | TOO_LARGE | TOO_LARGE_1000;
  constexpr const uint8_t CONT_AFTER_LEAD = TOO_LONG | OVERLONG_2 | TWO_CONTS;

  // Table 1: indexed by the high nibble of byte 1.
  const simd8<uint8_t> prev_high_nibble = prev1.shr<4>().lookup_16<uint8_t>(
    // 0_______ ________ <ASCII in byte 1>
    TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
    TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
    // 10______ ________ <continuation in byte 1>
    TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
    // 1100____ ________ <two byte lead in byte 1>
    TOO_SHORT | OVERLONG_2,
    // 1101____ ________ <two byte lead in byte 1>
    TOO_SHORT,
    // 1110____ ________ <three byte lead in byte 1>
    TOO_SHORT | OVERLONG_3 | SURROGATE,
    // 1111____ ________ <four+ byte lead in byte 1>
    TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
  );

  // Table 2: indexed by the low nibble of byte 1.
  const simd8<uint8_t> prev_low_nibble = (prev1 & 0x0F).lookup_16<uint8_t>(
    // ____0000 ________
    CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
    // ____0001 ________
    CARRY | OVERLONG_2,
    // ____001_ ________
    CARRY,
    CARRY,

    // ____0100 ________
    CARRY | TOO_LARGE,
    // ____0101 ________
    LOW_LARGE,
    // ____011_ ________
    LOW_LARGE,
    LOW_LARGE,

    // ____1___ ________  (1000 through 1100)
    LOW_LARGE,
    LOW_LARGE,
    LOW_LARGE,
    LOW_LARGE,
    LOW_LARGE,
    // ____1101 ________
    LOW_LARGE | SURROGATE,
    // ____111_ ________
    LOW_LARGE,
    LOW_LARGE
  );

  // Table 3: indexed by the high nibble of byte 2.
  const simd8<uint8_t> cur_high_nibble = input.shr<4>().lookup_16<uint8_t>(
    // ________ 0_______ <ASCII in byte 2>
    TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
    TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,

    // ________ 1000____
    CONT_AFTER_LEAD | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
    // ________ 1001____
    CONT_AFTER_LEAD | OVERLONG_3 | TOO_LARGE,
    // ________ 101_____
    CONT_AFTER_LEAD | SURROGATE | TOO_LARGE,
    CONT_AFTER_LEAD | SURROGATE | TOO_LARGE,

    // ________ 11______
    TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
  );

  return (prev_high_nibble & prev_low_nibble & cur_high_nibble);
}
|
|
// Combines the special-case result `sc` with the positions that are required to
// be the 2nd or 3rd continuation of a 3/4-byte sequence. Only bit 0x80 of
// must_be_2_3_continuation() is used, and it is XORed into sc.
simdjson_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
    const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
  // The input shifted back by two and three bytes, pulling the missing leading
  // bytes from the previous block.
  const simd8<uint8_t> two_back = input.prev<2>(prev_input);
  const simd8<uint8_t> three_back = input.prev<3>(prev_input);
  const simd8<uint8_t> required_cont = must_be_2_3_continuation(two_back, three_back) & uint8_t(0x80);
  return required_cont ^ sc;
}
|
|
|
|
//
|
|
// Return nonzero if there are incomplete multibyte characters at the end of the block:
|
|
// e.g. if there is a 4-byte character, but it's 3 bytes from the end.
|
|
//
|
|
simdjson_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input) {
  // Per-byte upper bounds. Every position accepts any value (0xff) except the
  // last three, which must stay below the lead-byte thresholds:
  // ... 1111____ 111_____ 11______
  // A byte above its bound means a multibyte character starts too close to the
  // end of the block to be completed inside it.
#if SIMDJSON_IMPLEMENTATION_ICELAKE
  static const uint8_t max_array[64] = {
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xf0u-1, 0xe0u-1, 0xc0u-1
  };
#else
  static const uint8_t max_array[32] = {
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xf0u-1, 0xe0u-1, 0xc0u-1
  };
#endif
  // Load the final sizeof(simd8<uint8_t>) bytes of the table so that the three
  // restrictive bounds line up with the last three lanes of the register.
  const uint8_t * const table_tail = max_array + (sizeof(max_array) - sizeof(simd8<uint8_t>));
  const simd8<uint8_t> max_value(table_tail);
  return input.gt_bits(max_value);
}
|
|
|
|
struct utf8_checker {
|
|
// If this is nonzero, there has been a UTF-8 error.
|
|
simd8<uint8_t> error;
|
|
// The last input we received
|
|
simd8<uint8_t> prev_input_block;
|
|
// Whether the last input we received was incomplete (used for ASCII fast path)
|
|
simd8<uint8_t> prev_incomplete;
|
|
|
|
//
|
|
// Check whether the current bytes are valid UTF-8.
|
|
//
|
|
simdjson_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
|
|
// Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
|
|
// (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
|
|
simd8<uint8_t> prev1 = input.prev<1>(prev_input);
|
|
simd8<uint8_t> sc = check_special_cases(input, prev1);
|
|
this->error |= check_multibyte_lengths(input, prev_input, sc);
|
|
}
|
|
|
|
// The only problem that can happen at EOF is that a multibyte character is too short
|
|
// or a byte value too large in the last bytes: check_special_cases only checks for bytes
|
|
// too large in the first of two bytes.
|
|
simdjson_inline void check_eof() {
|
|
// If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
|
|
// possibly finish them.
|
|
this->error |= this->prev_incomplete;
|
|
}
|
|
|
|
simdjson_inline void check_next_input(const simd8x64<uint8_t>& input) {
|
|
if(simdjson_likely(is_ascii(input))) {
|
|
this->error |= this->prev_incomplete;
|
|
} else {
|
|
// you might think that a for-loop would work, but under Visual Studio, it is not good enough.
|
|
static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 1)
|
|
||(simd8x64<uint8_t>::NUM_CHUNKS == 2)
|
|
|| (simd8x64<uint8_t>::NUM_CHUNKS == 4),
|
|
"We support one, two or four chunks per 64-byte block.");
|
|
SIMDJSON_IF_CONSTEXPR (simd8x64<uint8_t>::NUM_CHUNKS == 1) {
|
|
this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
|
|
} else SIMDJSON_IF_CONSTEXPR (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
|
|
this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
|
|
this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
|
|
} else SIMDJSON_IF_CONSTEXPR (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
|
|
this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
|
|
this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
|
|
this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
|
|
this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
|
|
}
|
|
this->prev_incomplete = is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1]);
|
|
this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1];
|
|
}
|
|
}
|
|
// do not forget to call check_eof!
|
|
simdjson_inline error_code errors() {
|
|
return this->error.any_bits_set_anywhere() ? error_code::UTF8_ERROR : error_code::SUCCESS;
|
|
}
|
|
|
|
}; // struct utf8_checker
|
|
} // namespace utf8_validation
|
|
|
|
} // unnamed namespace
|
|
} // namespace westmere
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE1_UTF8_LOOKUP4_ALGORITHM_H
|
|
/* end file generic/stage1/utf8_lookup4_algorithm.h for westmere */
|
|
/* including generic/stage1/json_scanner.h for westmere: #include <generic/stage1/json_scanner.h> */
|
|
/* begin file generic/stage1/json_scanner.h for westmere */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_JSON_SCANNER_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_JSON_SCANNER_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/json_character_block.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/json_string_scanner.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace westmere {
|
|
namespace {
|
|
namespace stage1 {
|
|
|
|
/**
|
|
* A block of scanned json, with information on operators and scalars.
|
|
*
|
|
* We seek to identify pseudo-structural characters. Anything that is inside
|
|
* a string must be omitted (hence & ~_string.string_tail()).
|
|
* Otherwise, pseudo-structural characters come in two forms.
|
|
* 1. We have the structural characters ([,],{,},:, comma). The
|
|
* term 'structural character' is from the JSON RFC.
|
|
* 2. We have the 'scalar pseudo-structural characters'.
|
|
* Scalars are quotes, and any character except structural characters and white space.
|
|
*
|
|
* To identify the scalar pseudo-structural characters, we must look at what comes
|
|
* before them: it must be a space, a quote or a structural characters.
|
|
* Starting with simdjson v0.3, we identify them by
|
|
* negation: we identify everything that is followed by a non-quote scalar,
|
|
* and we negate that. Whatever remains must be a 'scalar pseudo-structural character'.
|
|
*/
|
|
struct json_block {
|
|
public:
|
|
// We spell out the constructors in the hope of resolving inlining issues with Visual Studio 2017
|
|
simdjson_inline json_block(json_string_block&& string, json_character_block characters, uint64_t follows_potential_nonquote_scalar) :
|
|
_string(std::move(string)), _characters(characters), _follows_potential_nonquote_scalar(follows_potential_nonquote_scalar) {}
|
|
simdjson_inline json_block(json_string_block string, json_character_block characters, uint64_t follows_potential_nonquote_scalar) :
|
|
_string(string), _characters(characters), _follows_potential_nonquote_scalar(follows_potential_nonquote_scalar) {}
|
|
|
|
/**
|
|
* The start of structurals.
|
|
* In simdjson prior to v0.3, these were called the pseudo-structural characters.
|
|
**/
|
|
simdjson_inline uint64_t structural_start() const noexcept { return potential_structural_start() & ~_string.string_tail(); }
|
|
/** All JSON whitespace (i.e. not in a string) */
|
|
simdjson_inline uint64_t whitespace() const noexcept { return non_quote_outside_string(_characters.whitespace()); }
|
|
|
|
// Helpers
|
|
|
|
/** Whether the given characters are inside a string (only works on non-quotes) */
|
|
simdjson_inline uint64_t non_quote_inside_string(uint64_t mask) const noexcept { return _string.non_quote_inside_string(mask); }
|
|
/** Whether the given characters are outside a string (only works on non-quotes) */
|
|
simdjson_inline uint64_t non_quote_outside_string(uint64_t mask) const noexcept { return _string.non_quote_outside_string(mask); }
|
|
|
|
// string and escape characters
|
|
json_string_block _string;
|
|
// whitespace, structural characters ('operators'), scalars
|
|
json_character_block _characters;
|
|
// whether the previous character was a scalar
|
|
uint64_t _follows_potential_nonquote_scalar;
|
|
private:
|
|
// Potential structurals (i.e. disregarding strings)
|
|
|
|
/**
|
|
* structural elements ([,],{,},:, comma) plus scalar starts like 123, true and "abc".
|
|
* They may reside inside a string.
|
|
**/
|
|
simdjson_inline uint64_t potential_structural_start() const noexcept { return _characters.op() | potential_scalar_start(); }
|
|
/**
|
|
* The start of non-operator runs, like 123, true and "abc".
|
|
* It main reside inside a string.
|
|
**/
|
|
simdjson_inline uint64_t potential_scalar_start() const noexcept {
|
|
// The term "scalar" refers to anything except structural characters and white space
|
|
// (so letters, numbers, quotes).
|
|
// Whenever it is preceded by something that is not a structural element ({,},[,],:, ") nor a white-space
|
|
// then we know that it is irrelevant structurally.
|
|
return _characters.scalar() & ~follows_potential_scalar();
|
|
}
|
|
/**
|
|
* Whether the given character is immediately after a non-operator like 123, true.
|
|
* The characters following a quote are not included.
|
|
*/
|
|
simdjson_inline uint64_t follows_potential_scalar() const noexcept {
|
|
// _follows_potential_nonquote_scalar: is defined as marking any character that follows a character
|
|
// that is not a structural element ({,},[,],:, comma) nor a quote (") and that is not a
|
|
// white space.
|
|
// It is understood that within quoted region, anything at all could be marked (irrelevant).
|
|
return _follows_potential_nonquote_scalar;
|
|
}
|
|
};
|
|
|
|
/**
|
|
* Scans JSON for important bits: structural characters or 'operators', strings, and scalars.
|
|
*
|
|
* The scanner starts by calculating two distinct things:
|
|
* - string characters (taking \" into account)
|
|
* - structural characters or 'operators' ([]{},:, comma)
|
|
* and scalars (runs of non-operators like 123, true and "abc")
|
|
*
|
|
* To minimize data dependency (a key component of the scanner's speed), it finds these in parallel:
|
|
* in particular, the operator/scalar bit will find plenty of things that are actually part of
|
|
* strings. When we're done, json_block will fuse the two together by masking out tokens that are
|
|
* part of a string.
|
|
*/
|
|
class json_scanner {
|
|
public:
|
|
json_scanner() = default;
|
|
simdjson_inline json_block next(const simd::simd8x64<uint8_t>& in);
|
|
// Returns either UNCLOSED_STRING or SUCCESS
|
|
simdjson_inline error_code finish();
|
|
|
|
private:
|
|
// Whether the last character of the previous iteration is part of a scalar token
|
|
// (anything except whitespace or a structural character/'operator').
|
|
uint64_t prev_scalar = 0ULL;
|
|
json_string_scanner string_scanner{};
|
|
};
|
|
|
|
|
|
//
|
|
// Check if the current character immediately follows a matching character.
|
|
//
|
|
// For example, this checks for quotes with backslashes in front of them:
|
|
//
|
|
// const uint64_t backslashed_quote = in.eq('"') & immediately_follows(in.eq('\'), prev_backslash);
|
|
//
|
|
// Shift `match` up by one position so each set bit marks the character right
// after a match. `overflow` supplies the bit shifted in at position 0 (the
// carry from the previous block) and receives the top bit of `match` as the
// carry for the next block.
simdjson_inline uint64_t follows(const uint64_t match, uint64_t &overflow) {
  const uint64_t carry_in = overflow;
  overflow = match >> 63;
  return (match << 1) | carry_in;
}
|
|
|
|
simdjson_inline json_block json_scanner::next(const simd::simd8x64<uint8_t>& in) {
  // String information first, then the whitespace/operator classification.
  json_string_block strings = string_scanner.next(in);
  json_character_block characters = json_character_block::classify(in);

  // "Scalar" means anything except structural characters and white space
  // (letters, numbers, quotes). Quotes are excluded here so that only
  // characters following a non-quote scalar get marked.
  //
  // A closing quote should be followed by a structural character or by
  // nothing, yet in ' "a string"true ' the 't' must still be flagged as a
  // potential pseudo-structural character, exactly as in ' "a string" true ';
  // otherwise string parsing might need an extra check.
  //
  // Performance: there are many ways to skin this cat.
  const uint64_t scalar_not_quote = characters.scalar() & ~strings.quote();
  uint64_t after_scalar_not_quote = follows(scalar_not_quote, prev_scalar);

  return json_block(strings, characters, after_scalar_not_quote);
}
|
|
|
|
// End-of-input check: the outcome is whatever the string scanner reports.
simdjson_inline error_code json_scanner::finish() {
  const error_code string_status = string_scanner.finish();
  return string_status;
}
|
|
|
|
} // namespace stage1
|
|
} // unnamed namespace
|
|
} // namespace westmere
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE1_JSON_SCANNER_H
|
|
/* end file generic/stage1/json_scanner.h for westmere */
|
|
|
|
// All other declarations
|
|
/* including generic/stage1/find_next_document_index.h for westmere: #include <generic/stage1/find_next_document_index.h> */
|
|
/* begin file generic/stage1/find_next_document_index.h for westmere */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_FIND_NEXT_DOCUMENT_INDEX_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_FIND_NEXT_DOCUMENT_INDEX_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/generic/dom_parser_implementation.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace westmere {
|
|
namespace {
|
|
namespace stage1 {
|
|
|
|
/**
|
|
* This algorithm is used to quickly identify the last structural position that
|
|
* makes up a complete document.
|
|
*
|
|
* It does this by going backwards and finding the last *document boundary* (a
|
|
* place where one value follows another without a comma between them). If the
|
|
* last document (the characters after the boundary) has an equal number of
|
|
* start and end brackets, it is considered complete.
|
|
*
|
|
* Simply put, we iterate over the structural characters, starting from
|
|
* the end. We consider that we found the end of a JSON document when the
|
|
* first element of the pair is NOT one of these characters: '{' '[' ':' ','
|
|
* and when the second element is NOT one of these characters: '}' ']' ':' ','.
|
|
*
|
|
* This simple comparison works most of the time, but it does not cover cases
|
|
* where the batch's structural indexes contain a perfect amount of documents.
|
|
* In such a case, we do not have access to the structural index which follows
|
|
* the last document, therefore, we do not have access to the second element in
|
|
* the pair, and that means we cannot identify the last document. To fix this
|
|
* issue, we keep a count of the open and closed curly/square braces we found
|
|
* while searching for the pair. When we find a pair AND the count of open and
|
|
* closed curly/square braces is the same, we know that we just passed a
|
|
* complete document, therefore the last json buffer location is the end of the
|
|
* batch.
|
|
*/
|
|
simdjson_inline uint32_t find_next_document_index(dom_parser_implementation &parser) {
  // Variant: do not count separately, just figure out depth
  if (parser.n_structural_indexes == 0) { return 0; }

  // Net bracket/brace balance of everything visited so far, walking backwards.
  int arr_cnt = 0;
  int obj_cnt = 0;

  for (auto i = parser.n_structural_indexes - 1; i > 0; i--) {
    // Second element of the (previous, current) pair.
    const auto current = parser.buf[parser.structural_indexes[i]];
    if (current == ':' || current == ',') { continue; }
    if (current == '}') { obj_cnt--; continue; }
    if (current == ']') { arr_cnt--; continue; }
    if (current == '{') {
      obj_cnt++;
    } else if (current == '[') {
      arr_cnt++;
    }

    // First element of the pair: these characters mean `current` still
    // belongs to the same document, so keep walking.
    const auto previous = parser.buf[parser.structural_indexes[i - 1]];
    if (previous == '{' || previous == '[' || previous == ':' || previous == ',') {
      continue;
    }

    // A document boundary lies between `previous` and `current`.
    if (arr_cnt == 0 && obj_cnt == 0) {
      // Last document is complete, so the next document will appear after!
      return parser.n_structural_indexes;
    }
    // Last document is incomplete; it starts at structural index i.
    return i;
  }

  // No boundary found: account for the very first structural character and
  // see whether the whole batch forms one balanced document.
  const auto first = parser.buf[parser.structural_indexes[0]];
  if (first == '}') {
    obj_cnt--;
  } else if (first == ']') {
    arr_cnt--;
  } else if (first == '{') {
    obj_cnt++;
  } else if (first == '[') {
    arr_cnt++;
  }

  if (arr_cnt == 0 && obj_cnt == 0) {
    // We have a complete document.
    return parser.n_structural_indexes;
  }
  return 0;
}
|
|
|
|
} // namespace stage1
|
|
} // unnamed namespace
|
|
} // namespace westmere
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE1_FIND_NEXT_DOCUMENT_INDEX_H
|
|
/* end file generic/stage1/find_next_document_index.h for westmere */
|
|
/* including generic/stage1/json_minifier.h for westmere: #include <generic/stage1/json_minifier.h> */
|
|
/* begin file generic/stage1/json_minifier.h for westmere */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_JSON_MINIFIER_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_JSON_MINIFIER_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/json_scanner.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/buf_block_reader.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
// This file contains the common code every implementation uses in stage1
|
|
// It is intended to be included multiple times and compiled multiple times
|
|
// We assume the file in which it is included already includes
|
|
// "simdjson/stage1.h" (this simplifies amalgamation)
|
|
|
|
namespace simdjson {
|
|
namespace westmere {
|
|
namespace {
|
|
namespace stage1 {
|
|
|
|
class json_minifier {
|
|
public:
|
|
template<size_t STEP_SIZE>
|
|
static error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) noexcept;
|
|
|
|
private:
|
|
simdjson_inline json_minifier(uint8_t *_dst)
|
|
: dst{_dst}
|
|
{}
|
|
template<size_t STEP_SIZE>
|
|
simdjson_inline void step(const uint8_t *block_buf, buf_block_reader<STEP_SIZE> &reader) noexcept;
|
|
simdjson_inline void next(const simd::simd8x64<uint8_t>& in, const json_block& block);
|
|
simdjson_inline error_code finish(uint8_t *dst_start, size_t &dst_len);
|
|
json_scanner scanner{};
|
|
uint8_t *dst;
|
|
};
|
|
|
|
// Hand the block's whitespace mask to compress(), which writes at `dst`, and
// advance `dst` by the count it returns.
simdjson_inline void json_minifier::next(const simd::simd8x64<uint8_t>& in, const json_block& block) {
  const uint64_t whitespace_mask = block.whitespace();
  const auto advance_by = in.compress(whitespace_mask, dst);
  dst += advance_by;
}
|
|
|
|
// Finalize minification. On a scanner error the output length is reported as
// zero and the error is returned; otherwise the length is the distance between
// the current write position and `dst_start`.
simdjson_inline error_code json_minifier::finish(uint8_t *dst_start, size_t &dst_len) {
  const error_code status = scanner.finish();
  if (status) {
    dst_len = 0;
    return status;
  }
  dst_len = static_cast<size_t>(dst - dst_start);
  return SUCCESS;
}
|
|
|
|
template<>
|
|
simdjson_inline void json_minifier::step<128>(const uint8_t *block_buf, buf_block_reader<128> &reader) noexcept {
|
|
simd::simd8x64<uint8_t> in_1(block_buf);
|
|
simd::simd8x64<uint8_t> in_2(block_buf+64);
|
|
json_block block_1 = scanner.next(in_1);
|
|
json_block block_2 = scanner.next(in_2);
|
|
this->next(in_1, block_1);
|
|
this->next(in_2, block_2);
|
|
reader.advance();
|
|
}
|
|
|
|
template<>
|
|
simdjson_inline void json_minifier::step<64>(const uint8_t *block_buf, buf_block_reader<64> &reader) noexcept {
|
|
simd::simd8x64<uint8_t> in_1(block_buf);
|
|
json_block block_1 = scanner.next(in_1);
|
|
this->next(block_buf, block_1);
|
|
reader.advance();
|
|
}
|
|
|
|
template<size_t STEP_SIZE>
|
|
error_code json_minifier::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) noexcept {
|
|
buf_block_reader<STEP_SIZE> reader(buf, len);
|
|
json_minifier minifier(dst);
|
|
|
|
// Index the first n-1 blocks
|
|
while (reader.has_full_block()) {
|
|
minifier.step<STEP_SIZE>(reader.full_block(), reader);
|
|
}
|
|
|
|
// Index the last (remainder) block, padded with spaces
|
|
uint8_t block[STEP_SIZE];
|
|
size_t remaining_bytes = reader.get_remainder(block);
|
|
if (remaining_bytes > 0) {
|
|
// We do not want to write directly to the output stream. Rather, we write
|
|
// to a local buffer (for safety).
|
|
uint8_t out_block[STEP_SIZE];
|
|
uint8_t * const guarded_dst{minifier.dst};
|
|
minifier.dst = out_block;
|
|
minifier.step<STEP_SIZE>(block, reader);
|
|
size_t to_write = minifier.dst - out_block;
|
|
// In some cases, we could be enticed to consider the padded spaces
|
|
// as part of the string. This is fine as long as we do not write more
|
|
// than we consumed.
|
|
if(to_write > remaining_bytes) { to_write = remaining_bytes; }
|
|
memcpy(guarded_dst, out_block, to_write);
|
|
minifier.dst = guarded_dst + to_write;
|
|
}
|
|
return minifier.finish(dst, dst_len);
|
|
}
|
|
|
|
} // namespace stage1
|
|
} // unnamed namespace
|
|
} // namespace westmere
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE1_JSON_MINIFIER_H
|
|
/* end file generic/stage1/json_minifier.h for westmere */
|
|
/* including generic/stage1/json_structural_indexer.h for westmere: #include <generic/stage1/json_structural_indexer.h> */
|
|
/* begin file generic/stage1/json_structural_indexer.h for westmere */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_JSON_STRUCTURAL_INDEXER_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_JSON_STRUCTURAL_INDEXER_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/utf8_lookup4_algorithm.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/buf_block_reader.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/json_string_scanner.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/json_scanner.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/json_minifier.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/find_next_document_index.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
// This file contains the common code every implementation uses in stage1
|
|
// It is intended to be included multiple times and compiled multiple times
|
|
// We assume the file in which it is included already includes
|
|
// "simdjson/stage1.h" (this simplifies amalgamation)
|
|
|
|
namespace simdjson {
|
|
namespace westmere {
|
|
namespace {
|
|
namespace stage1 {
|
|
|
|
class bit_indexer {
|
|
public:
|
|
uint32_t *tail;
|
|
|
|
simdjson_inline bit_indexer(uint32_t *index_buf) : tail(index_buf) {}
|
|
|
|
#if SIMDJSON_PREFER_REVERSE_BITS
|
|
/**
|
|
* ARM lacks a fast trailing zero instruction, but it has a fast
|
|
* bit reversal instruction and a fast leading zero instruction.
|
|
* Thus it may be profitable to reverse the bits (once) and then
|
|
* to rely on a sequence of instructions that call the leading
|
|
* zero instruction.
|
|
*
|
|
* Performance notes:
|
|
* The chosen routine is not optimal in terms of data dependency
|
|
* since zero_leading_bit might require two instructions. However,
|
|
* it tends to minimize the total number of instructions which is
|
|
* beneficial.
|
|
*/
|
|
simdjson_inline void write_index(uint32_t idx, uint64_t& rev_bits, int i) {
|
|
int lz = leading_zeroes(rev_bits);
|
|
this->tail[i] = static_cast<uint32_t>(idx) + lz;
|
|
rev_bits = zero_leading_bit(rev_bits, lz);
|
|
}
|
|
#else
|
|
/**
|
|
* Under recent x64 systems, we often have both a fast trailing zero
|
|
* instruction and a fast 'clear-lower-bit' instruction so the following
|
|
* algorithm can be competitive.
|
|
*/
|
|
|
|
simdjson_inline void write_index(uint32_t idx, uint64_t& bits, int i) {
|
|
this->tail[i] = idx + trailing_zeroes(bits);
|
|
bits = clear_lowest_bit(bits);
|
|
}
|
|
#endif // SIMDJSON_PREFER_REVERSE_BITS
|
|
|
|
template <int START, int N>
|
|
simdjson_inline int write_indexes(uint32_t idx, uint64_t& bits) {
|
|
write_index(idx, bits, START);
|
|
SIMDJSON_IF_CONSTEXPR (N > 1) {
|
|
write_indexes<(N-1>0?START+1:START), (N-1>=0?N-1:1)>(idx, bits);
|
|
}
|
|
return START+N;
|
|
}
|
|
|
|
template <int START, int END, int STEP>
|
|
simdjson_inline int write_indexes_stepped(uint32_t idx, uint64_t& bits, int cnt) {
|
|
write_indexes<START, STEP>(idx, bits);
|
|
SIMDJSON_IF_CONSTEXPR ((START+STEP) < END) {
|
|
if (simdjson_unlikely((START+STEP) < cnt)) {
|
|
write_indexes_stepped<(START+STEP<END?START+STEP:END), END, STEP>(idx, bits, cnt);
|
|
}
|
|
}
|
|
return ((END-START) % STEP) == 0 ? END : (END-START) - ((END-START) % STEP) + STEP;
|
|
}
|
|
|
|
// flatten out values in 'bits' assuming that they are are to have values of idx
|
|
// plus their position in the bitvector, and store these indexes at
|
|
// base_ptr[base] incrementing base as we go
|
|
// will potentially store extra values beyond end of valid bits, so base_ptr
|
|
// needs to be large enough to handle this
|
|
//
|
|
// If the kernel sets SIMDJSON_GENERIC_JSON_STRUCTURAL_INDEXER_CUSTOM_BIT_INDEXER, then it
|
|
// will provide its own version of the code.
|
|
#ifdef SIMDJSON_GENERIC_JSON_STRUCTURAL_INDEXER_CUSTOM_BIT_INDEXER
|
|
simdjson_inline void write(uint32_t idx, uint64_t bits);
|
|
#else
|
|
simdjson_inline void write(uint32_t idx, uint64_t bits) {
|
|
// In some instances, the next branch is expensive because it is mispredicted.
|
|
// Unfortunately, in other cases,
|
|
// it helps tremendously.
|
|
if (bits == 0)
|
|
return;
|
|
|
|
int cnt = static_cast<int>(count_ones(bits));
|
|
|
|
#if SIMDJSON_PREFER_REVERSE_BITS
|
|
bits = reverse_bits(bits);
|
|
#endif
|
|
#ifdef SIMDJSON_STRUCTURAL_INDEXER_STEP
|
|
static constexpr const int STEP = SIMDJSON_STRUCTURAL_INDEXER_STEP;
|
|
#else
|
|
static constexpr const int STEP = 4;
|
|
#endif
|
|
static constexpr const int STEP_UNTIL = 24;
|
|
|
|
write_indexes_stepped<0, STEP_UNTIL, STEP>(idx, bits, cnt);
|
|
SIMDJSON_IF_CONSTEXPR (STEP_UNTIL < 64) {
|
|
if (simdjson_unlikely(STEP_UNTIL < cnt)) {
|
|
for (int i=STEP_UNTIL; i<cnt; i++) {
|
|
write_index(idx, bits, i);
|
|
}
|
|
}
|
|
}
|
|
|
|
this->tail += cnt;
|
|
}
|
|
#endif // SIMDJSON_GENERIC_JSON_STRUCTURAL_INDEXER_CUSTOM_BIT_INDEXER
|
|
|
|
};
|
|
|
|
class json_structural_indexer {
|
|
public:
|
|
/**
|
|
* Find the important bits of JSON in a 128-byte chunk, and add them to structural_indexes.
|
|
*
|
|
* @param partial Setting the partial parameter to true allows the find_structural_bits to
|
|
* tolerate unclosed strings. The caller should still ensure that the input is valid UTF-8. If
|
|
* you are processing substrings, you may want to call on a function like trimmed_length_safe_utf8.
|
|
*/
|
|
template<size_t STEP_SIZE>
|
|
static error_code index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, stage1_mode partial) noexcept;
|
|
|
|
private:
|
|
simdjson_inline json_structural_indexer(uint32_t *structural_indexes);
|
|
template<size_t STEP_SIZE>
|
|
simdjson_inline void step(const uint8_t *block, buf_block_reader<STEP_SIZE> &reader) noexcept;
|
|
simdjson_inline void next(const simd::simd8x64<uint8_t>& in, const json_block& block, size_t idx);
|
|
simdjson_inline error_code finish(dom_parser_implementation &parser, size_t idx, size_t len, stage1_mode partial);
|
|
|
|
json_scanner scanner{};
|
|
utf8_checker checker{};
|
|
bit_indexer indexer;
|
|
uint64_t prev_structurals = 0;
|
|
uint64_t unescaped_chars_error = 0;
|
|
};
|
|
|
|
// Point the bit indexer at the start of the caller's structural index buffer.
simdjson_inline json_structural_indexer::json_structural_indexer(uint32_t *structural_indexes)
  : indexer{structural_indexes} {}
|
|
|
|
// Skip the last character if it is partial
|
|
// Return `len` shortened so the buffer does not end with a lead byte that has
// too few bytes after it. Each check only runs when the buffer is long enough
// to contain the byte it inspects.
simdjson_inline size_t trim_partial_utf8(const uint8_t *buf, size_t len) {
  // Last byte is a lead byte (2-, 3- or 4-byte character): only 1 byte present.
  if (len >= 1 && buf[len-1] >= 0xc0) { return len-1; }
  // Second-to-last byte leads a 3- or 4-byte character: only 2 bytes present.
  if (len >= 2 && buf[len-2] >= 0xe0) { return len-2; }
  // Third-to-last byte leads a 4-byte character: only 3 bytes present.
  if (len >= 3 && buf[len-3] >= 0xf0) { return len-3; }
  return len;
}
|
|
|
|
//
|
|
// PERF NOTES:
|
|
// We pipe 2 inputs through these stages:
|
|
// 1. Load JSON into registers. This takes a long time and is highly parallelizable, so we load
|
|
// 2 inputs' worth at once so that by the time step 2 is looking for them input, it's available.
|
|
// 2. Scan the JSON for critical data: strings, scalars and operators. This is the critical path.
|
|
// The output of step 1 depends entirely on this information. These functions don't quite use
|
|
// up enough CPU: the second half of the functions is highly serial, only using 1 execution core
|
|
// at a time. The second input's scans has some dependency on the first ones finishing it, but
|
|
// they can make a lot of progress before they need that information.
|
|
// 3. Step 1 does not use enough capacity, so we run some extra stuff while we're waiting for that
|
|
// to finish: utf-8 checks and generating the output from the last iteration.
|
|
//
|
|
// The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all
|
|
// available capacity with just one input. Running 2 at a time seems to give the CPU a good enough
|
|
// workout.
|
|
//
|
|
template<size_t STEP_SIZE>
|
|
error_code json_structural_indexer::index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, stage1_mode partial) noexcept {
|
|
if (simdjson_unlikely(len > parser.capacity())) { return CAPACITY; }
|
|
// We guard the rest of the code so that we can assume that len > 0 throughout.
|
|
if (len == 0) { return EMPTY; }
|
|
if (is_streaming(partial)) {
|
|
len = trim_partial_utf8(buf, len);
|
|
// If you end up with an empty window after trimming
|
|
// the partial UTF-8 bytes, then chances are good that you
|
|
// have an UTF-8 formatting error.
|
|
if(len == 0) { return UTF8_ERROR; }
|
|
}
|
|
buf_block_reader<STEP_SIZE> reader(buf, len);
|
|
json_structural_indexer indexer(parser.structural_indexes.get());
|
|
|
|
// Read all but the last block
|
|
while (reader.has_full_block()) {
|
|
indexer.step<STEP_SIZE>(reader.full_block(), reader);
|
|
}
|
|
// Take care of the last block (will always be there unless file is empty which is
|
|
// not supposed to happen.)
|
|
uint8_t block[STEP_SIZE];
|
|
if (simdjson_unlikely(reader.get_remainder(block) == 0)) { return UNEXPECTED_ERROR; }
|
|
indexer.step<STEP_SIZE>(block, reader);
|
|
return indexer.finish(parser, reader.block_index(), len, partial);
|
|
}
|
|
|
|
template<>
|
|
simdjson_inline void json_structural_indexer::step<128>(const uint8_t *block, buf_block_reader<128> &reader) noexcept {
|
|
simd::simd8x64<uint8_t> in_1(block);
|
|
simd::simd8x64<uint8_t> in_2(block+64);
|
|
json_block block_1 = scanner.next(in_1);
|
|
json_block block_2 = scanner.next(in_2);
|
|
this->next(in_1, block_1, reader.block_index());
|
|
this->next(in_2, block_2, reader.block_index()+64);
|
|
reader.advance();
|
|
}
|
|
|
|
template<>
|
|
simdjson_inline void json_structural_indexer::step<64>(const uint8_t *block, buf_block_reader<64> &reader) noexcept {
|
|
simd::simd8x64<uint8_t> in_1(block);
|
|
json_block block_1 = scanner.next(in_1);
|
|
this->next(in_1, block_1, reader.block_index());
|
|
reader.advance();
|
|
}
|
|
|
|
/**
 * Consume one scanned 64-byte chunk.
 *
 * Writes out the structurals remembered from the *previous* call (hence the
 * idx-64 base), remembers this chunk's structurals for the next call, and
 * accumulates the "unescaped character inside a string" error mask.
 *
 * @param in    the raw 64 input bytes
 * @param block scanner result for `in`
 * @param idx   block index passed by the caller for `in`
 */
simdjson_inline void json_structural_indexer::next(const simd::simd8x64<uint8_t>& in, const json_block& block, size_t idx) {
  // Bytes <= 0x1F; they only count as an error when inside a string (see below).
  uint64_t control_chars = in.lteq(0x1F);
#if SIMDJSON_UTF8VALIDATION
  checker.check_next_input(in);
#endif
  // Emit the *last* iteration's structurals to the parser.
  indexer.write(static_cast<uint32_t>(idx-64), prev_structurals);
  prev_structurals = block.structural_start();
  unescaped_chars_error |= block.non_quote_inside_string(control_chars);
}
|
|
|
|
/**
 * Finalize stage 1: flush the pending structurals, validate the scanner
 * state, pad the structural index array and, in the streaming modes, trim
 * the index list to the last complete document.
 *
 * @param parser  parser whose structural_indexes / n_structural_indexes /
 *                next_structural_index are updated
 * @param idx     block index after the last step
 * @param len     input length (written as the padding index)
 * @param partial stage 1 mode
 * @return the scanner error, UNESCAPED_CHARS, EMPTY, UNEXPECTED_ERROR,
 *         CAPACITY, or finally checker.errors()
 */
simdjson_inline error_code json_structural_indexer::finish(dom_parser_implementation &parser, size_t idx, size_t len, stage1_mode partial) {
  // next() always writes the previous block's structurals, so the final
  // block's structurals are still pending at this point.
  indexer.write(uint32_t(idx-64), prev_structurals);

  const error_code scan_error = scanner.finish();
  const bool unclosed_string = (scan_error == UNCLOSED_STRING);

  // Any scanner error is fatal, except that the streaming modes tolerate
  // UNCLOSED_STRING.
  bool fatal = (scan_error != SUCCESS);
  if (is_streaming(partial) && unclosed_string) { fatal = false; }
  if (simdjson_unlikely(fatal)) { return scan_error; }

  if (unescaped_chars_error) { return UNESCAPED_CHARS; }

  parser.n_structural_indexes = uint32_t(indexer.tail - parser.structural_indexes.get());

  /***
   * The On Demand API requires special padding.
   * See https://github.com/simdjson/simdjson/issues/906
   *
   * The goal is that parsing which runs past the last valid structural
   * character stops quickly. The padding repeats the character at 'len'.
   * Only [, ] and } can be repeated without triggering a JSON error, so if
   * parsing continued, the repeated character would have to be one of them.
   * If it is ] or }, look at the first character of the document: it cannot
   * be ] or } (a document cannot start that way), nor a comma, a colon or a
   * simple value. So the only way to continue is a repeated [, which means
   * the document starts with [ and therefore must end with ]. Enforcing that
   * rule yields ][[ which is invalid.
   *
   * Illustrated by the test array_iterate_unclosed_error() on the input
   *   R"({ "a": [,,)"
   **/
  const uint32_t end_offset = uint32_t(len);
  parser.structural_indexes[parser.n_structural_indexes] = end_offset; // used later when partial == stage1_mode::streaming_final
  parser.structural_indexes[parser.n_structural_indexes + 1] = end_offset;
  parser.structural_indexes[parser.n_structural_indexes + 2] = 0;
  parser.next_structural_index = 0;

  // A valid JSON file has at least one structural index.
  if (simdjson_unlikely(parser.n_structural_indexes == 0u)) {
    return EMPTY;
  }
  if (simdjson_unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) {
    return UNEXPECTED_ERROR;
  }

  if (partial == stage1_mode::streaming_partial) {
    // With an unclosed string the last structural is its opening quote;
    // leave it out.
    if (unclosed_string) {
      parser.n_structural_indexes--;
      // Nothing but the unclosed string was found.
      if (simdjson_unlikely(parser.n_structural_indexes == 0u)) { return CAPACITY; }
    }
    // Truncate to the end of the last complete document (or zero).
    auto complete_count = find_next_document_index(parser);
    if (complete_count == 0 && parser.n_structural_indexes > 0) {
      if (parser.structural_indexes[0] == 0) {
        // The buffer is partial, the document starts at index 0 and is still
        // incomplete: it is too big to parse.
        return CAPACITY;
      }
      // The document might be parseable; there was just a lot of white space.
      parser.n_structural_indexes = 0;
      return EMPTY;
    }
    parser.n_structural_indexes = complete_count;
  } else if (partial == stage1_mode::streaming_final) {
    if (unclosed_string) { parser.n_structural_indexes--; }
    // Truncate to the end of the last complete document (or zero).
    // In streaming_final mode trailing garbage may be silently ignored. This
    // is deliberate: many users truncate streams of JSON documents for
    // processing (e.g. while decompressing data or receiving it in chunks
    // from the network) and do not know where the last document ends. The
    // document_stream instances let them find out which documents were
    // parsed (see the iterator.source() method).
    parser.n_structural_indexes = find_next_document_index(parser);
    // Keep the value previously stored at the new end so the client can tell
    // whether truncation happened: if nothing was truncated this copies the
    // padding value (len), otherwise it copies some prior index.
    parser.structural_indexes[parser.n_structural_indexes + 1] = parser.structural_indexes[parser.n_structural_indexes];
    // This next line is critical, do not change it unless you understand
    // what you are doing.
    parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len);
    if (simdjson_unlikely(parser.n_structural_indexes == 0u)) {
      // An unclosed string at the very end of the stream is tolerated: users
      // often bulk-load data and want trailing garbage to be ignored.
      return EMPTY;
    }
  }

  checker.check_eof();
  return checker.errors();
}
|
|
|
|
} // namespace stage1
|
|
} // unnamed namespace
|
|
} // namespace westmere
|
|
} // namespace simdjson
|
|
|
|
// Clear CUSTOM_BIT_INDEXER so other implementations can set it if they need to.
|
|
#undef SIMDJSON_GENERIC_JSON_STRUCTURAL_INDEXER_CUSTOM_BIT_INDEXER
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE1_JSON_STRUCTURAL_INDEXER_H
|
|
/* end file generic/stage1/json_structural_indexer.h for westmere */
|
|
/* including generic/stage1/utf8_validator.h for westmere: #include <generic/stage1/utf8_validator.h> */
|
|
/* begin file generic/stage1/utf8_validator.h for westmere */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_UTF8_VALIDATOR_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_UTF8_VALIDATOR_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/buf_block_reader.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/utf8_lookup4_algorithm.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace westmere {
|
|
namespace {
|
|
namespace stage1 {
|
|
|
|
/**
|
|
* Validates that the string is actual UTF-8.
|
|
*/
|
|
template<class checker>
|
|
bool generic_validate_utf8(const uint8_t * input, size_t length) {
|
|
checker c{};
|
|
buf_block_reader<64> reader(input, length);
|
|
while (reader.has_full_block()) {
|
|
simd::simd8x64<uint8_t> in(reader.full_block());
|
|
c.check_next_input(in);
|
|
reader.advance();
|
|
}
|
|
uint8_t block[64]{};
|
|
reader.get_remainder(block);
|
|
simd::simd8x64<uint8_t> in(block);
|
|
c.check_next_input(in);
|
|
reader.advance();
|
|
c.check_eof();
|
|
return c.errors() == error_code::SUCCESS;
|
|
}
|
|
|
|
bool generic_validate_utf8(const char * input, size_t length) {
|
|
return generic_validate_utf8<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
|
|
}
|
|
|
|
} // namespace stage1
|
|
} // unnamed namespace
|
|
} // namespace westmere
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE1_UTF8_VALIDATOR_H
|
|
/* end file generic/stage1/utf8_validator.h for westmere */
|
|
/* end file generic/stage1/amalgamated.h for westmere */
|
|
/* including generic/stage2/amalgamated.h for westmere: #include <generic/stage2/amalgamated.h> */
|
|
/* begin file generic/stage2/amalgamated.h for westmere */
|
|
// Stuff other things depend on
|
|
/* including generic/stage2/base.h for westmere: #include <generic/stage2/base.h> */
|
|
/* begin file generic/stage2/base.h for westmere */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE2_BASE_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE2_BASE_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/base.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace westmere {
|
|
namespace {
|
|
namespace stage2 {
|
|
|
|
// Forward declarations of the stage 2 types.
struct tape_writer;
struct tape_builder;
class structural_iterator;
class json_iterator;
|
|
|
|
} // namespace stage2
|
|
} // unnamed namespace
|
|
} // namespace westmere
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE2_BASE_H
|
|
/* end file generic/stage2/base.h for westmere */
|
|
/* including generic/stage2/tape_writer.h for westmere: #include <generic/stage2/tape_writer.h> */
|
|
/* begin file generic/stage2/tape_writer.h for westmere */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE2_TAPE_WRITER_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE2_TAPE_WRITER_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/internal/tape_type.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
#include <cstring>
|
|
|
|
namespace simdjson {
|
|
namespace westmere {
|
|
namespace {
|
|
namespace stage2 {
|
|
|
|
struct tape_writer {
|
|
/** The next place to write to tape */
|
|
uint64_t *next_tape_loc;
|
|
|
|
/** Write a signed 64-bit value to tape. */
|
|
simdjson_inline void append_s64(int64_t value) noexcept;
|
|
|
|
/** Write an unsigned 64-bit value to tape. */
|
|
simdjson_inline void append_u64(uint64_t value) noexcept;
|
|
|
|
/** Write a double value to tape. */
|
|
simdjson_inline void append_double(double value) noexcept;
|
|
|
|
/**
|
|
* Append a tape entry (an 8-bit type,and 56 bits worth of value).
|
|
*/
|
|
simdjson_inline void append(uint64_t val, internal::tape_type t) noexcept;
|
|
|
|
/**
|
|
* Skip the current tape entry without writing.
|
|
*
|
|
* Used to skip the start of the container, since we'll come back later to fill it in when the
|
|
* container ends.
|
|
*/
|
|
simdjson_inline void skip() noexcept;
|
|
|
|
/**
|
|
* Skip the number of tape entries necessary to write a large u64 or i64.
|
|
*/
|
|
simdjson_inline void skip_large_integer() noexcept;
|
|
|
|
/**
|
|
* Skip the number of tape entries necessary to write a double.
|
|
*/
|
|
simdjson_inline void skip_double() noexcept;
|
|
|
|
/**
|
|
* Write a value to a known location on tape.
|
|
*
|
|
* Used to go back and write out the start of a container after the container ends.
|
|
*/
|
|
simdjson_inline static void write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept;
|
|
|
|
private:
|
|
/**
|
|
* Append both the tape entry, and a supplementary value following it. Used for types that need
|
|
* all 64 bits, such as double and uint64_t.
|
|
*/
|
|
template<typename T>
|
|
simdjson_inline void append2(uint64_t val, T val2, internal::tape_type t) noexcept;
|
|
}; // struct tape_writer
|
|
|
|
// Emits an INT64 entry with a zero payload, followed by the raw 64-bit value.
simdjson_inline void tape_writer::append_s64(int64_t value) noexcept {
  const uint64_t no_payload = 0;
  append2(no_payload, value, internal::tape_type::INT64);
}
|
|
|
|
// Emits a UINT64 entry with a zero payload, followed by the raw 64-bit value.
// Goes through append2() like the signed and double variants; the result on
// tape is the same two slots.
simdjson_inline void tape_writer::append_u64(uint64_t value) noexcept {
  const uint64_t no_payload = 0;
  append2(no_payload, value, internal::tape_type::UINT64);
}
|
|
|
|
/** Write a double value to tape. */
|
|
simdjson_inline void tape_writer::append_double(double value) noexcept {
|
|
append2(0, value, internal::tape_type::DOUBLE);
|
|
}
|
|
|
|
// Moves the cursor past one tape slot without writing to it.
simdjson_inline void tape_writer::skip() noexcept {
  next_tape_loc += 1;
}
|
|
|
|
// Moves the cursor past the two slots a 64-bit integer occupies
// (entry + supplementary value, see append2()).
simdjson_inline void tape_writer::skip_large_integer() noexcept {
  next_tape_loc = next_tape_loc + 2;
}
|
|
|
|
// Moves the cursor past the two slots a double occupies
// (entry + supplementary value, see append2()).
simdjson_inline void tape_writer::skip_double() noexcept {
  next_tape_loc = next_tape_loc + 2;
}
|
|
|
|
// Writes one tape slot: the type tag is placed in the top byte (bits 56..63)
// and OR-ed with `val`, then the cursor advances by one slot.
simdjson_inline void tape_writer::append(uint64_t val, internal::tape_type t) noexcept {
  const uint64_t tag = uint64_t(char(t)) << 56;
  *next_tape_loc = val | tag;
  ++next_tape_loc;
}
|
|
|
|
template<typename T>
|
|
simdjson_inline void tape_writer::append2(uint64_t val, T val2, internal::tape_type t) noexcept {
|
|
append(val, t);
|
|
static_assert(sizeof(val2) == sizeof(*next_tape_loc), "Type is not 64 bits!");
|
|
memcpy(next_tape_loc, &val2, sizeof(val2));
|
|
next_tape_loc++;
|
|
}
|
|
|
|
// Overwrites an existing slot with a tagged entry (type tag in the top byte,
// OR-ed with `val`). The cursor is not involved.
simdjson_inline void tape_writer::write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept {
  const uint64_t tag = uint64_t(char(t)) << 56;
  tape_loc = val | tag;
}
|
|
|
|
} // namespace stage2
|
|
} // unnamed namespace
|
|
} // namespace westmere
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE2_TAPE_WRITER_H
|
|
/* end file generic/stage2/tape_writer.h for westmere */
|
|
/* including generic/stage2/logger.h for westmere: #include <generic/stage2/logger.h> */
|
|
/* begin file generic/stage2/logger.h for westmere */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE2_LOGGER_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE2_LOGGER_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/base.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
#include <cstring>
|
|
|
|
|
|
// This is for an internal-only stage 2 specific logger.
|
|
// Set LOG_ENABLED = true to log what stage 2 is doing!
|
|
namespace simdjson {
|
|
namespace westmere {
|
|
namespace {
|
|
namespace logger {
|
|
|
|
static constexpr const char * DASHES = "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------";
|
|
|
|
// Logging is switched on only when SIMDJSON_VERBOSE_LOGGING evaluates to non-zero.
#if SIMDJSON_VERBOSE_LOGGING
static constexpr bool LOG_ENABLED = true;
#else
static constexpr bool LOG_ENABLED = false;
#endif

// Column widths (in characters) of the log table.
static constexpr int LOG_EVENT_LEN = 20;
static constexpr int LOG_BUFFER_LEN = 30;
static constexpr int LOG_SMALL_BUFFER_LEN = 10;
static constexpr int LOG_INDEX_LEN = 5;

// Current nesting depth used to indent log rows.
static int log_depth; // Not threadsafe. Log only.
|
|
|
|
// Helper to turn unprintable or newline characters into spaces
|
|
static simdjson_inline char printable_char(char c) {
|
|
if (c >= 0x20) {
|
|
return c;
|
|
} else {
|
|
return ' ';
|
|
}
|
|
}
|
|
|
|
// Print the header and set up log_start
|
|
static simdjson_inline void log_start() {
|
|
if (LOG_ENABLED) {
|
|
log_depth = 0;
|
|
printf("\n");
|
|
printf("| %-*s | %-*s | %-*s | %-*s | Detail |\n", LOG_EVENT_LEN, "Event", LOG_BUFFER_LEN, "Buffer", LOG_SMALL_BUFFER_LEN, "Next", 5, "Next#");
|
|
printf("|%.*s|%.*s|%.*s|%.*s|--------|\n", LOG_EVENT_LEN+2, DASHES, LOG_BUFFER_LEN+2, DASHES, LOG_SMALL_BUFFER_LEN+2, DASHES, 5+2, DASHES);
|
|
}
|
|
}
|
|
|
|
// Prints `message` followed by a newline. Does nothing unless LOG_ENABLED.
simdjson_unused static simdjson_inline void log_string(const char *message) {
  if (LOG_ENABLED) { printf("%s\n", message); }
}
|
|
|
|
// Logs a single line from the stage 2 DOM parser
|
|
template<typename S>
|
|
static simdjson_inline void log_line(S &structurals, const char *title_prefix, const char *title, const char *detail) {
|
|
if (LOG_ENABLED) {
|
|
printf("| %*s%s%-*s ", log_depth*2, "", title_prefix, LOG_EVENT_LEN - log_depth*2 - int(strlen(title_prefix)), title);
|
|
auto current_index = structurals.at_beginning() ? nullptr : structurals.next_structural-1;
|
|
auto next_index = structurals.next_structural;
|
|
auto current = current_index ? &structurals.buf[*current_index] : reinterpret_cast<const uint8_t*>(" ");
|
|
auto next = &structurals.buf[*next_index];
|
|
{
|
|
// Print the next N characters in the buffer.
|
|
printf("| ");
|
|
// Otherwise, print the characters starting from the buffer position.
|
|
// Print spaces for unprintable or newline characters.
|
|
for (int i=0;i<LOG_BUFFER_LEN;i++) {
|
|
printf("%c", printable_char(current[i]));
|
|
}
|
|
printf(" ");
|
|
// Print the next N characters in the buffer.
|
|
printf("| ");
|
|
// Otherwise, print the characters starting from the buffer position.
|
|
// Print spaces for unprintable or newline characters.
|
|
for (int i=0;i<LOG_SMALL_BUFFER_LEN;i++) {
|
|
printf("%c", printable_char(next[i]));
|
|
}
|
|
printf(" ");
|
|
}
|
|
if (current_index) {
|
|
printf("| %*u ", LOG_INDEX_LEN, *current_index);
|
|
} else {
|
|
printf("| %-*s ", LOG_INDEX_LEN, "");
|
|
}
|
|
// printf("| %*u ", LOG_INDEX_LEN, structurals.next_tape_index());
|
|
printf("| %-s ", detail);
|
|
printf("|\n");
|
|
}
|
|
}
|
|
|
|
} // namespace logger
|
|
} // unnamed namespace
|
|
} // namespace westmere
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE2_LOGGER_H
|
|
/* end file generic/stage2/logger.h for westmere */
|
|
|
|
// All other declarations
|
|
/* including generic/stage2/json_iterator.h for westmere: #include <generic/stage2/json_iterator.h> */
|
|
/* begin file generic/stage2/json_iterator.h for westmere */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE2_JSON_ITERATOR_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE2_JSON_ITERATOR_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/logger.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/generic/dom_parser_implementation.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace westmere {
|
|
namespace {
|
|
namespace stage2 {
|
|
|
|
class json_iterator {
|
|
public:
|
|
const uint8_t* const buf;
|
|
uint32_t *next_structural;
|
|
dom_parser_implementation &dom_parser;
|
|
uint32_t depth{0};
|
|
|
|
/**
|
|
* Walk the JSON document.
|
|
*
|
|
* The visitor receives callbacks when values are encountered. All callbacks pass the iterator as
|
|
* the first parameter; some callbacks have other parameters as well:
|
|
*
|
|
* - visit_document_start() - at the beginning.
|
|
* - visit_document_end() - at the end (if things were successful).
|
|
*
|
|
* - visit_array_start() - at the start `[` of a non-empty array.
|
|
* - visit_array_end() - at the end `]` of a non-empty array.
|
|
* - visit_empty_array() - when an empty array is encountered.
|
|
*
|
|
* - visit_object_end() - at the start `]` of a non-empty object.
|
|
* - visit_object_start() - at the end `]` of a non-empty object.
|
|
* - visit_empty_object() - when an empty object is encountered.
|
|
* - visit_key(const uint8_t *key) - when a key in an object field is encountered. key is
|
|
* guaranteed to point at the first quote of the string (`"key"`).
|
|
* - visit_primitive(const uint8_t *value) - when a value is a string, number, boolean or null.
|
|
* - visit_root_primitive(iter, uint8_t *value) - when the top-level value is a string, number, boolean or null.
|
|
*
|
|
* - increment_count(iter) - each time a value is found in an array or object.
|
|
*/
|
|
template<bool STREAMING, typename V>
|
|
simdjson_warn_unused simdjson_inline error_code walk_document(V &visitor) noexcept;
|
|
|
|
/**
|
|
* Create an iterator capable of walking a JSON document.
|
|
*
|
|
* The document must have already passed through stage 1.
|
|
*/
|
|
simdjson_inline json_iterator(dom_parser_implementation &_dom_parser, size_t start_structural_index);
|
|
|
|
/**
|
|
* Look at the next token.
|
|
*
|
|
* Tokens can be strings, numbers, booleans, null, or operators (`[{]},:`)).
|
|
*
|
|
* They may include invalid JSON as well (such as `1.2.3` or `ture`).
|
|
*/
|
|
simdjson_inline const uint8_t *peek() const noexcept;
|
|
/**
|
|
* Advance to the next token.
|
|
*
|
|
* Tokens can be strings, numbers, booleans, null, or operators (`[{]},:`)).
|
|
*
|
|
* They may include invalid JSON as well (such as `1.2.3` or `ture`).
|
|
*/
|
|
simdjson_inline const uint8_t *advance() noexcept;
|
|
/**
|
|
* Get the remaining length of the document, from the start of the current token.
|
|
*/
|
|
simdjson_inline size_t remaining_len() const noexcept;
|
|
/**
|
|
* Check if we are at the end of the document.
|
|
*
|
|
* If this is true, there are no more tokens.
|
|
*/
|
|
simdjson_inline bool at_eof() const noexcept;
|
|
/**
|
|
* Check if we are at the beginning of the document.
|
|
*/
|
|
simdjson_inline bool at_beginning() const noexcept;
|
|
simdjson_inline uint8_t last_structural() const noexcept;
|
|
|
|
/**
|
|
* Log that a value has been found.
|
|
*
|
|
* Set LOG_ENABLED=true in logger.h to see logging.
|
|
*/
|
|
simdjson_inline void log_value(const char *type) const noexcept;
|
|
/**
|
|
* Log the start of a multipart value.
|
|
*
|
|
* Set LOG_ENABLED=true in logger.h to see logging.
|
|
*/
|
|
simdjson_inline void log_start_value(const char *type) const noexcept;
|
|
/**
|
|
* Log the end of a multipart value.
|
|
*
|
|
* Set LOG_ENABLED=true in logger.h to see logging.
|
|
*/
|
|
simdjson_inline void log_end_value(const char *type) const noexcept;
|
|
/**
|
|
* Log an error.
|
|
*
|
|
* Set LOG_ENABLED=true in logger.h to see logging.
|
|
*/
|
|
simdjson_inline void log_error(const char *error) const noexcept;
|
|
|
|
template<typename V>
|
|
simdjson_warn_unused simdjson_inline error_code visit_root_primitive(V &visitor, const uint8_t *value) noexcept;
|
|
template<typename V>
|
|
simdjson_warn_unused simdjson_inline error_code visit_primitive(V &visitor, const uint8_t *value) noexcept;
|
|
};
|
|
|
|
template<bool STREAMING, typename V>
|
|
simdjson_warn_unused simdjson_inline error_code json_iterator::walk_document(V &visitor) noexcept {
|
|
logger::log_start();
|
|
|
|
//
|
|
// Start the document
|
|
//
|
|
if (at_eof()) { return EMPTY; }
|
|
log_start_value("document");
|
|
SIMDJSON_TRY( visitor.visit_document_start(*this) );
|
|
|
|
//
|
|
// Read first value
|
|
//
|
|
{
|
|
auto value = advance();
|
|
|
|
// Make sure the outer object or array is closed before continuing; otherwise, there are ways we
|
|
// could get into memory corruption. See https://github.com/simdjson/simdjson/issues/906
|
|
if (!STREAMING) {
|
|
switch (*value) {
|
|
case '{': if (last_structural() != '}') { log_value("starting brace unmatched"); return TAPE_ERROR; }; break;
|
|
case '[': if (last_structural() != ']') { log_value("starting bracket unmatched"); return TAPE_ERROR; }; break;
|
|
}
|
|
}
|
|
|
|
switch (*value) {
|
|
case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin;
|
|
case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin;
|
|
default: SIMDJSON_TRY( visitor.visit_root_primitive(*this, value) ); break;
|
|
}
|
|
}
|
|
goto document_end;
|
|
|
|
//
|
|
// Object parser states
|
|
//
|
|
object_begin:
|
|
log_start_value("object");
|
|
depth++;
|
|
if (depth >= dom_parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; }
|
|
dom_parser.is_array[depth] = false;
|
|
SIMDJSON_TRY( visitor.visit_object_start(*this) );
|
|
|
|
{
|
|
auto key = advance();
|
|
if (*key != '"') { log_error("Object does not start with a key"); return TAPE_ERROR; }
|
|
SIMDJSON_TRY( visitor.increment_count(*this) );
|
|
SIMDJSON_TRY( visitor.visit_key(*this, key) );
|
|
}
|
|
|
|
object_field:
|
|
if (simdjson_unlikely( *advance() != ':' )) { log_error("Missing colon after key in object"); return TAPE_ERROR; }
|
|
{
|
|
auto value = advance();
|
|
switch (*value) {
|
|
case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin;
|
|
case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin;
|
|
default: SIMDJSON_TRY( visitor.visit_primitive(*this, value) ); break;
|
|
}
|
|
}
|
|
|
|
object_continue:
|
|
switch (*advance()) {
|
|
case ',':
|
|
SIMDJSON_TRY( visitor.increment_count(*this) );
|
|
{
|
|
auto key = advance();
|
|
if (simdjson_unlikely( *key != '"' )) { log_error("Key string missing at beginning of field in object"); return TAPE_ERROR; }
|
|
SIMDJSON_TRY( visitor.visit_key(*this, key) );
|
|
}
|
|
goto object_field;
|
|
case '}': log_end_value("object"); SIMDJSON_TRY( visitor.visit_object_end(*this) ); goto scope_end;
|
|
default: log_error("No comma between object fields"); return TAPE_ERROR;
|
|
}
|
|
|
|
scope_end:
|
|
depth--;
|
|
if (depth == 0) { goto document_end; }
|
|
if (dom_parser.is_array[depth]) { goto array_continue; }
|
|
goto object_continue;
|
|
|
|
//
|
|
// Array parser states
|
|
//
|
|
array_begin:
|
|
log_start_value("array");
|
|
depth++;
|
|
if (depth >= dom_parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; }
|
|
dom_parser.is_array[depth] = true;
|
|
SIMDJSON_TRY( visitor.visit_array_start(*this) );
|
|
SIMDJSON_TRY( visitor.increment_count(*this) );
|
|
|
|
array_value:
|
|
{
|
|
auto value = advance();
|
|
switch (*value) {
|
|
case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin;
|
|
case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin;
|
|
default: SIMDJSON_TRY( visitor.visit_primitive(*this, value) ); break;
|
|
}
|
|
}
|
|
|
|
array_continue:
|
|
switch (*advance()) {
|
|
case ',': SIMDJSON_TRY( visitor.increment_count(*this) ); goto array_value;
|
|
case ']': log_end_value("array"); SIMDJSON_TRY( visitor.visit_array_end(*this) ); goto scope_end;
|
|
default: log_error("Missing comma between array values"); return TAPE_ERROR;
|
|
}
|
|
|
|
document_end:
|
|
log_end_value("document");
|
|
SIMDJSON_TRY( visitor.visit_document_end(*this) );
|
|
|
|
dom_parser.next_structural_index = uint32_t(next_structural - &dom_parser.structural_indexes[0]);
|
|
|
|
// If we didn't make it to the end, it's an error
|
|
if ( !STREAMING && dom_parser.next_structural_index != dom_parser.n_structural_indexes ) {
|
|
log_error("More than one JSON value at the root of the document, or extra characters at the end of the JSON!");
|
|
return TAPE_ERROR;
|
|
}
|
|
|
|
return SUCCESS;
|
|
|
|
} // walk_document()
|
|
|
|
// Binds the iterator to the parser's buffer and positions it on the
// structural index numbered start_structural_index.
simdjson_inline json_iterator::json_iterator(dom_parser_implementation &_dom_parser, size_t start_structural_index)
  : buf(_dom_parser.buf),
    next_structural(&_dom_parser.structural_indexes[start_structural_index]),
    dom_parser(_dom_parser)
{}
|
|
|
|
// Returns a pointer to the next token in buf; the position is unchanged.
simdjson_inline const uint8_t *json_iterator::peek() const noexcept {
  const uint32_t offset = *next_structural;
  return buf + offset;
}
|
|
// Returns a pointer to the next token in buf and steps to the following
// structural index.
simdjson_inline const uint8_t *json_iterator::advance() noexcept {
  const uint32_t offset = *next_structural;
  ++next_structural;
  return buf + offset;
}
|
|
// Bytes left in the document counted from the most recently consumed token
// (the structural index just before next_structural).
simdjson_inline size_t json_iterator::remaining_len() const noexcept {
  return dom_parser.len - next_structural[-1];
}
|
|
|
|
// True when next_structural sits one past the last structural index.
simdjson_inline bool json_iterator::at_eof() const noexcept {
  const uint32_t *end_of_indexes = &dom_parser.structural_indexes[dom_parser.n_structural_indexes];
  return next_structural == end_of_indexes;
}
|
|
// True when nothing has been consumed yet, i.e. next_structural still points
// at the first structural index.
simdjson_inline bool json_iterator::at_beginning() const noexcept {
  const uint32_t *first_index = dom_parser.structural_indexes.get();
  return next_structural == first_index;
}
|
|
// The input byte located at the final structural index.
simdjson_inline uint8_t json_iterator::last_structural() const noexcept {
  const uint32_t last_offset = dom_parser.structural_indexes[dom_parser.n_structural_indexes - 1];
  return buf[last_offset];
}
|
|
|
|
// Logs a plain row: no title prefix and no detail text.
simdjson_inline void json_iterator::log_value(const char *type) const noexcept {
  const char *no_prefix = "";
  const char *no_detail = "";
  logger::log_line(*this, no_prefix, type, no_detail);
}
|
|
|
|
// Logs a "+" row at the current depth, then increases the log indentation.
simdjson_inline void json_iterator::log_start_value(const char *type) const noexcept {
  logger::log_line(*this, "+", type, "");
  if (logger::LOG_ENABLED) {
    ++logger::log_depth;
  }
}
|
|
|
|
// Decreases the log indentation first, then logs a "-" row, so the row lines
// up with the matching "+" row.
simdjson_inline void json_iterator::log_end_value(const char *type) const noexcept {
  if (logger::LOG_ENABLED) {
    --logger::log_depth;
  }
  logger::log_line(*this, "-", type, "");
}
|
|
|
|
// Logs a row titled "ERROR" with the message in the detail column.
simdjson_inline void json_iterator::log_error(const char *error) const noexcept {
  const char *no_prefix = "";
  logger::log_line(*this, no_prefix, "ERROR", error);
}
|
|
|
|
template<typename V>
|
|
simdjson_warn_unused simdjson_inline error_code json_iterator::visit_root_primitive(V &visitor, const uint8_t *value) noexcept {
|
|
switch (*value) {
|
|
case '"': return visitor.visit_root_string(*this, value);
|
|
case 't': return visitor.visit_root_true_atom(*this, value);
|
|
case 'f': return visitor.visit_root_false_atom(*this, value);
|
|
case 'n': return visitor.visit_root_null_atom(*this, value);
|
|
case '-':
|
|
case '0': case '1': case '2': case '3': case '4':
|
|
case '5': case '6': case '7': case '8': case '9':
|
|
return visitor.visit_root_number(*this, value);
|
|
default:
|
|
log_error("Document starts with a non-value character");
|
|
return TAPE_ERROR;
|
|
}
|
|
}
|
|
template<typename V>
|
|
simdjson_warn_unused simdjson_inline error_code json_iterator::visit_primitive(V &visitor, const uint8_t *value) noexcept {
|
|
// Use the fact that most scalars are going to be either strings or numbers.
|
|
if(*value == '"') {
|
|
return visitor.visit_string(*this, value);
|
|
} else if (((*value - '0') < 10) || (*value == '-')) {
|
|
return visitor.visit_number(*this, value);
|
|
}
|
|
// true, false, null are uncommon.
|
|
switch (*value) {
|
|
case 't': return visitor.visit_true_atom(*this, value);
|
|
case 'f': return visitor.visit_false_atom(*this, value);
|
|
case 'n': return visitor.visit_null_atom(*this, value);
|
|
default:
|
|
log_error("Non-value found when value was expected!");
|
|
return TAPE_ERROR;
|
|
}
|
|
}
|
|
|
|
} // namespace stage2
|
|
} // unnamed namespace
|
|
} // namespace westmere
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE2_JSON_ITERATOR_H
|
|
/* end file generic/stage2/json_iterator.h for westmere */
|
|
/* including generic/stage2/stringparsing.h for westmere: #include <generic/stage2/stringparsing.h> */
|
|
/* begin file generic/stage2/stringparsing.h for westmere */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE2_STRINGPARSING_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE2_STRINGPARSING_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/generic/jsoncharutils.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
// This file contains the common code every implementation uses
|
|
// It is intended to be included multiple times and compiled multiple times
|
|
|
|
namespace simdjson {
|
|
namespace westmere {
|
|
namespace {
|
|
/// @private
|
|
namespace stringparsing {
|
|
|
|
// begin copypasta
|
|
// These chars yield themselves: " \ /
|
|
// b -> backspace, f -> formfeed, n -> newline, r -> cr, t -> horizontal tab
|
|
// u not handled in this table as it's complex
|
|
// Lookup table indexed by the byte that follows a backslash in a JSON string.
// A non-zero entry is the byte the escape sequence stands for; zero marks a byte that is
// not a valid single-character escape ('u' is deliberately zero: it is handled separately).
//   \" -> 0x22   \/ -> 0x2f   \\ -> 0x5c
//   \b -> 0x08   \f -> 0x0c   \n -> 0x0a   \r -> 0x0d   \t -> 0x09
static const uint8_t escape_map[256] = {
  0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0,    // 0x0.
  0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0,    // 0x1.
  0, 0, 0x22, 0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0x2f, // 0x2.  "  /
  0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0,    // 0x3.

  0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0,    // 0x4.
  0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0x5c, 0, 0,    0,    // 0x5.  backslash
  0, 0, 0x08, 0, 0,    0, 0x0c, 0, 0, 0, 0, 0, 0,    0, 0x0a, 0,    // 0x6.  b  f  n
  0, 0, 0x0d, 0, 0x09, 0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0,    // 0x7.  r  t

  0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0,    // 0x8.
  0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0,    // 0x9.
  0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0,    // 0xa.
  0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0,    // 0xb.

  0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0,    // 0xc.
  0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0,    // 0xd.
  0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0,    // 0xe.
  0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0,    // 0xf.
};
|
|
|
|
// handle a unicode codepoint
|
|
// write appropriate values into dest
|
|
// src will advance 6 bytes or 12 bytes
|
|
// dest will advance a variable amount (return via pointer)
|
|
// return true if the unicode codepoint was valid
|
|
// We work in little-endian then swap at write time
|
|
simdjson_warn_unused
|
|
simdjson_inline bool handle_unicode_codepoint(const uint8_t **src_ptr,
|
|
uint8_t **dst_ptr, bool allow_replacement) {
|
|
// Use the default Unicode Character 'REPLACEMENT CHARACTER' (U+FFFD)
|
|
constexpr uint32_t substitution_code_point = 0xfffd;
|
|
// jsoncharutils::hex_to_u32_nocheck fills high 16 bits of the return value with 1s if the
|
|
// conversion is not valid; we defer the check for this to inside the
|
|
// multilingual plane check.
|
|
uint32_t code_point = jsoncharutils::hex_to_u32_nocheck(*src_ptr + 2);
|
|
*src_ptr += 6;
|
|
|
|
// If we found a high surrogate, we must
|
|
// check for low surrogate for characters
|
|
// outside the Basic
|
|
// Multilingual Plane.
|
|
if (code_point >= 0xd800 && code_point < 0xdc00) {
|
|
const uint8_t *src_data = *src_ptr;
|
|
/* Compiler optimizations convert this to a single 16-bit load and compare on most platforms */
|
|
if (((src_data[0] << 8) | src_data[1]) != ((static_cast<uint8_t> ('\\') << 8) | static_cast<uint8_t> ('u'))) {
|
|
if(!allow_replacement) { return false; }
|
|
code_point = substitution_code_point;
|
|
} else {
|
|
uint32_t code_point_2 = jsoncharutils::hex_to_u32_nocheck(src_data + 2);
|
|
|
|
// We have already checked that the high surrogate is valid and
|
|
// (code_point - 0xd800) < 1024.
|
|
//
|
|
// Check that code_point_2 is in the range 0xdc00..0xdfff
|
|
// and that code_point_2 was parsed from valid hex.
|
|
uint32_t low_bit = code_point_2 - 0xdc00;
|
|
if (low_bit >> 10) {
|
|
if(!allow_replacement) { return false; }
|
|
code_point = substitution_code_point;
|
|
} else {
|
|
code_point = (((code_point - 0xd800) << 10) | low_bit) + 0x10000;
|
|
*src_ptr += 6;
|
|
}
|
|
|
|
}
|
|
} else if (code_point >= 0xdc00 && code_point <= 0xdfff) {
|
|
// If we encounter a low surrogate (not preceded by a high surrogate)
|
|
// then we have an error.
|
|
if(!allow_replacement) { return false; }
|
|
code_point = substitution_code_point;
|
|
}
|
|
size_t offset = jsoncharutils::codepoint_to_utf8(code_point, *dst_ptr);
|
|
*dst_ptr += offset;
|
|
return offset > 0;
|
|
}
|
|
|
|
|
|
// handle a unicode codepoint using the wobbly convention
|
|
// https://simonsapin.github.io/wtf-8/
|
|
// write appropriate values into dest
|
|
// src will advance 6 bytes or 12 bytes
|
|
// dest will advance a variable amount (return via pointer)
|
|
// return true if the unicode codepoint was valid
|
|
// We work in little-endian then swap at write time
|
|
simdjson_warn_unused
|
|
simdjson_inline bool handle_unicode_codepoint_wobbly(const uint8_t **src_ptr,
|
|
uint8_t **dst_ptr) {
|
|
// It is not ideal that this function is nearly identical to handle_unicode_codepoint.
|
|
//
|
|
// jsoncharutils::hex_to_u32_nocheck fills high 16 bits of the return value with 1s if the
|
|
// conversion is not valid; we defer the check for this to inside the
|
|
// multilingual plane check.
|
|
uint32_t code_point = jsoncharutils::hex_to_u32_nocheck(*src_ptr + 2);
|
|
*src_ptr += 6;
|
|
// If we found a high surrogate, we must
|
|
// check for low surrogate for characters
|
|
// outside the Basic
|
|
// Multilingual Plane.
|
|
if (code_point >= 0xd800 && code_point < 0xdc00) {
|
|
const uint8_t *src_data = *src_ptr;
|
|
/* Compiler optimizations convert this to a single 16-bit load and compare on most platforms */
|
|
if (((src_data[0] << 8) | src_data[1]) == ((static_cast<uint8_t> ('\\') << 8) | static_cast<uint8_t> ('u'))) {
|
|
uint32_t code_point_2 = jsoncharutils::hex_to_u32_nocheck(src_data + 2);
|
|
uint32_t low_bit = code_point_2 - 0xdc00;
|
|
if ((low_bit >> 10) == 0) {
|
|
code_point =
|
|
(((code_point - 0xd800) << 10) | low_bit) + 0x10000;
|
|
*src_ptr += 6;
|
|
}
|
|
}
|
|
}
|
|
|
|
size_t offset = jsoncharutils::codepoint_to_utf8(code_point, *dst_ptr);
|
|
*dst_ptr += offset;
|
|
return offset > 0;
|
|
}
|
|
|
|
|
|
/**
|
|
* Unescape a valid UTF-8 string from src to dst, stopping at a final unescaped quote. There
|
|
* must be an unescaped quote terminating the string. It returns the final output
|
|
* position as pointer. In case of error (e.g., the string has bad escaped codes),
|
|
* then null_nullptrptr is returned. It is assumed that the output buffer is large
|
|
* enough. E.g., if src points at 'joe"', then dst needs to have four free bytes +
|
|
* SIMDJSON_PADDING bytes.
|
|
*/
|
|
simdjson_warn_unused simdjson_inline uint8_t *parse_string(const uint8_t *src, uint8_t *dst, bool allow_replacement) {
|
|
while (1) {
|
|
// Copy the next n bytes, and find the backslash and quote in them.
|
|
auto bs_quote = backslash_and_quote::copy_and_find(src, dst);
|
|
// If the next thing is the end quote, copy and return
|
|
if (bs_quote.has_quote_first()) {
|
|
// we encountered quotes first. Move dst to point to quotes and exit
|
|
return dst + bs_quote.quote_index();
|
|
}
|
|
if (bs_quote.has_backslash()) {
|
|
/* find out where the backspace is */
|
|
auto bs_dist = bs_quote.backslash_index();
|
|
uint8_t escape_char = src[bs_dist + 1];
|
|
/* we encountered backslash first. Handle backslash */
|
|
if (escape_char == 'u') {
|
|
/* move src/dst up to the start; they will be further adjusted
|
|
within the unicode codepoint handling code. */
|
|
src += bs_dist;
|
|
dst += bs_dist;
|
|
if (!handle_unicode_codepoint(&src, &dst, allow_replacement)) {
|
|
return nullptr;
|
|
}
|
|
} else {
|
|
/* simple 1:1 conversion. Will eat bs_dist+2 characters in input and
|
|
* write bs_dist+1 characters to output
|
|
* note this may reach beyond the part of the buffer we've actually
|
|
* seen. I think this is ok */
|
|
uint8_t escape_result = escape_map[escape_char];
|
|
if (escape_result == 0u) {
|
|
return nullptr; /* bogus escape value is an error */
|
|
}
|
|
dst[bs_dist] = escape_result;
|
|
src += bs_dist + 2;
|
|
dst += bs_dist + 1;
|
|
}
|
|
} else {
|
|
/* they are the same. Since they can't co-occur, it means we
|
|
* encountered neither. */
|
|
src += backslash_and_quote::BYTES_PROCESSED;
|
|
dst += backslash_and_quote::BYTES_PROCESSED;
|
|
}
|
|
}
|
|
}
|
|
|
|
/**
 * Variant of parse_string that decodes \u escapes with handle_unicode_codepoint_wobbly.
 * It unescapes from src to dst until the first unescaped quote and returns the output
 * position that corresponds to that quote, or nullptr on a bad escape sequence.
 */
simdjson_warn_unused simdjson_inline uint8_t *parse_wobbly_string(const uint8_t *src, uint8_t *dst) {
  // This intentionally mirrors parse_string.
  for (;;) {
    // Copy the next chunk of bytes and locate the first backslash and quote in it.
    auto found = backslash_and_quote::copy_and_find(src, dst);

    if (found.has_quote_first()) {
      // The closing quote comes first: everything before it is already copied.
      return dst + found.quote_index();
    }

    if (!found.has_backslash()) {
      // Neither a quote nor a backslash in this chunk: move on to the next one.
      src += backslash_and_quote::BYTES_PROCESSED;
      dst += backslash_and_quote::BYTES_PROCESSED;
      continue;
    }

    // A backslash comes first; find out where it is and what follows it.
    const auto backslash_at = found.backslash_index();
    const uint8_t escape_char = src[backslash_at + 1];

    if (escape_char == 'u') {
      // Position src/dst on the backslash; the code point handler adjusts them further.
      src += backslash_at;
      dst += backslash_at;
      if (!handle_unicode_codepoint_wobbly(&src, &dst)) {
        return nullptr;
      }
      continue;
    }

    // Simple 1:1 conversion: consumes backslash_at + 2 input bytes and produces
    // backslash_at + 1 output bytes. This may reach beyond the part of the buffer
    // inspected so far, which the original authors considered acceptable.
    const uint8_t escape_result = escape_map[escape_char];
    if (escape_result == 0u) {
      return nullptr; // bogus escape value is an error
    }
    dst[backslash_at] = escape_result;
    src += backslash_at + 2;
    dst += backslash_at + 1;
  }
}
|
|
|
|
} // namespace stringparsing
|
|
} // unnamed namespace
|
|
} // namespace westmere
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE2_STRINGPARSING_H
|
|
/* end file generic/stage2/stringparsing.h for westmere */
|
|
/* including generic/stage2/structural_iterator.h for westmere: #include <generic/stage2/structural_iterator.h> */
|
|
/* begin file generic/stage2/structural_iterator.h for westmere */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE2_STRUCTURAL_ITERATOR_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE2_STRUCTURAL_ITERATOR_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/generic/dom_parser_implementation.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace westmere {
|
|
namespace {
|
|
namespace stage2 {
|
|
|
|
class structural_iterator {
|
|
public:
|
|
const uint8_t* const buf;
|
|
uint32_t *next_structural;
|
|
dom_parser_implementation &dom_parser;
|
|
|
|
// Start a structural
|
|
simdjson_inline structural_iterator(dom_parser_implementation &_dom_parser, size_t start_structural_index)
|
|
: buf{_dom_parser.buf},
|
|
next_structural{&_dom_parser.structural_indexes[start_structural_index]},
|
|
dom_parser{_dom_parser} {
|
|
}
|
|
// Get the buffer position of the current structural character
|
|
simdjson_inline const uint8_t* current() {
|
|
return &buf[*(next_structural-1)];
|
|
}
|
|
// Get the current structural character
|
|
simdjson_inline char current_char() {
|
|
return buf[*(next_structural-1)];
|
|
}
|
|
// Get the next structural character without advancing
|
|
simdjson_inline char peek_next_char() {
|
|
return buf[*next_structural];
|
|
}
|
|
simdjson_inline const uint8_t* peek() {
|
|
return &buf[*next_structural];
|
|
}
|
|
simdjson_inline const uint8_t* advance() {
|
|
return &buf[*(next_structural++)];
|
|
}
|
|
simdjson_inline char advance_char() {
|
|
return buf[*(next_structural++)];
|
|
}
|
|
simdjson_inline size_t remaining_len() {
|
|
return dom_parser.len - *(next_structural-1);
|
|
}
|
|
|
|
simdjson_inline bool at_end() {
|
|
return next_structural == &dom_parser.structural_indexes[dom_parser.n_structural_indexes];
|
|
}
|
|
simdjson_inline bool at_beginning() {
|
|
return next_structural == dom_parser.structural_indexes.get();
|
|
}
|
|
};
|
|
|
|
} // namespace stage2
|
|
} // unnamed namespace
|
|
} // namespace westmere
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE2_STRUCTURAL_ITERATOR_H
|
|
/* end file generic/stage2/structural_iterator.h for westmere */
|
|
/* including generic/stage2/tape_builder.h for westmere: #include <generic/stage2/tape_builder.h> */
|
|
/* begin file generic/stage2/tape_builder.h for westmere */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE2_TAPE_BUILDER_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE2_TAPE_BUILDER_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/json_iterator.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/stringparsing.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/tape_writer.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/dom/document.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/generic/atomparsing.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/generic/dom_parser_implementation.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/generic/numberparsing.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
|
|
namespace simdjson {
|
|
namespace westmere {
|
|
namespace {
|
|
namespace stage2 {
|
|
|
|
struct tape_builder {
|
|
template<bool STREAMING>
|
|
simdjson_warn_unused static simdjson_inline error_code parse_document(
|
|
dom_parser_implementation &dom_parser,
|
|
dom::document &doc) noexcept;
|
|
|
|
/** Called when a non-empty document starts. */
|
|
simdjson_warn_unused simdjson_inline error_code visit_document_start(json_iterator &iter) noexcept;
|
|
/** Called when a non-empty document ends without error. */
|
|
simdjson_warn_unused simdjson_inline error_code visit_document_end(json_iterator &iter) noexcept;
|
|
|
|
/** Called when a non-empty array starts. */
|
|
simdjson_warn_unused simdjson_inline error_code visit_array_start(json_iterator &iter) noexcept;
|
|
/** Called when a non-empty array ends. */
|
|
simdjson_warn_unused simdjson_inline error_code visit_array_end(json_iterator &iter) noexcept;
|
|
/** Called when an empty array is found. */
|
|
simdjson_warn_unused simdjson_inline error_code visit_empty_array(json_iterator &iter) noexcept;
|
|
|
|
/** Called when a non-empty object starts. */
|
|
simdjson_warn_unused simdjson_inline error_code visit_object_start(json_iterator &iter) noexcept;
|
|
/**
|
|
* Called when a key in a field is encountered.
|
|
*
|
|
* primitive, visit_object_start, visit_empty_object, visit_array_start, or visit_empty_array
|
|
* will be called after this with the field value.
|
|
*/
|
|
simdjson_warn_unused simdjson_inline error_code visit_key(json_iterator &iter, const uint8_t *key) noexcept;
|
|
/** Called when a non-empty object ends. */
|
|
simdjson_warn_unused simdjson_inline error_code visit_object_end(json_iterator &iter) noexcept;
|
|
/** Called when an empty object is found. */
|
|
simdjson_warn_unused simdjson_inline error_code visit_empty_object(json_iterator &iter) noexcept;
|
|
|
|
/**
|
|
* Called when a string, number, boolean or null is found.
|
|
*/
|
|
simdjson_warn_unused simdjson_inline error_code visit_primitive(json_iterator &iter, const uint8_t *value) noexcept;
|
|
/**
|
|
* Called when a string, number, boolean or null is found at the top level of a document (i.e.
|
|
* when there is no array or object and the entire document is a single string, number, boolean or
|
|
* null.
|
|
*
|
|
* This is separate from primitive() because simdjson's normal primitive parsing routines assume
|
|
* there is at least one more token after the value, which is only true in an array or object.
|
|
*/
|
|
simdjson_warn_unused simdjson_inline error_code visit_root_primitive(json_iterator &iter, const uint8_t *value) noexcept;
|
|
|
|
simdjson_warn_unused simdjson_inline error_code visit_string(json_iterator &iter, const uint8_t *value, bool key = false) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code visit_number(json_iterator &iter, const uint8_t *value) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code visit_true_atom(json_iterator &iter, const uint8_t *value) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code visit_false_atom(json_iterator &iter, const uint8_t *value) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code visit_null_atom(json_iterator &iter, const uint8_t *value) noexcept;
|
|
|
|
simdjson_warn_unused simdjson_inline error_code visit_root_string(json_iterator &iter, const uint8_t *value) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code visit_root_number(json_iterator &iter, const uint8_t *value) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code visit_root_true_atom(json_iterator &iter, const uint8_t *value) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code visit_root_false_atom(json_iterator &iter, const uint8_t *value) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code visit_root_null_atom(json_iterator &iter, const uint8_t *value) noexcept;
|
|
|
|
/** Called each time a new field or element in an array or object is found. */
|
|
simdjson_warn_unused simdjson_inline error_code increment_count(json_iterator &iter) noexcept;
|
|
|
|
/** Next location to write to tape */
|
|
tape_writer tape;
|
|
private:
|
|
/** Next write location in the string buf for stage 2 parsing */
|
|
uint8_t *current_string_buf_loc;
|
|
|
|
simdjson_inline tape_builder(dom::document &doc) noexcept;
|
|
|
|
simdjson_inline uint32_t next_tape_index(json_iterator &iter) const noexcept;
|
|
simdjson_inline void start_container(json_iterator &iter) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code end_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code empty_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept;
|
|
simdjson_inline uint8_t *on_start_string(json_iterator &iter) noexcept;
|
|
simdjson_inline void on_end_string(uint8_t *dst) noexcept;
|
|
}; // struct tape_builder
|
|
|
|
template<bool STREAMING>
|
|
simdjson_warn_unused simdjson_inline error_code tape_builder::parse_document(
|
|
dom_parser_implementation &dom_parser,
|
|
dom::document &doc) noexcept {
|
|
dom_parser.doc = &doc;
|
|
json_iterator iter(dom_parser, STREAMING ? dom_parser.next_structural_index : 0);
|
|
tape_builder builder(doc);
|
|
return iter.walk_document<STREAMING>(builder);
|
|
}
|
|
|
|
/** Root scalar: let the iterator pick the matching visit_root_* callback of this builder. */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_primitive(json_iterator &iter, const uint8_t *value) noexcept {
  tape_builder &self = *this;
  return iter.visit_root_primitive(self, value);
}
|
|
/** Nested scalar: let the iterator pick the matching visit_* callback of this builder. */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_primitive(json_iterator &iter, const uint8_t *value) noexcept {
  tape_builder &self = *this;
  return iter.visit_primitive(self, value);
}
|
|
/** Record an empty object as a START_OBJECT / END_OBJECT pair on the tape. */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_empty_object(json_iterator &iter) noexcept {
  using internal::tape_type;
  return empty_container(iter, tape_type::START_OBJECT, tape_type::END_OBJECT);
}
|
|
/** Record an empty array as a START_ARRAY / END_ARRAY pair on the tape. */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_empty_array(json_iterator &iter) noexcept {
  using internal::tape_type;
  return empty_container(iter, tape_type::START_ARRAY, tape_type::END_ARRAY);
}
|
|
|
|
/** Open the document container (see start_container); always succeeds. */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_document_start(json_iterator &iter) noexcept {
  this->start_container(iter);
  return SUCCESS;
}
|
|
/** Open an object container (see start_container); always succeeds. */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_object_start(json_iterator &iter) noexcept {
  this->start_container(iter);
  return SUCCESS;
}
|
|
/** Open an array container (see start_container); always succeeds. */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_array_start(json_iterator &iter) noexcept {
  this->start_container(iter);
  return SUCCESS;
}
|
|
|
|
/** Close the object opened at the current depth (see end_container). */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_object_end(json_iterator &iter) noexcept {
  using internal::tape_type;
  return end_container(iter, tape_type::START_OBJECT, tape_type::END_OBJECT);
}
|
|
/** Close the array opened at the current depth (see end_container). */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_array_end(json_iterator &iter) noexcept {
  using internal::tape_type;
  return end_container(iter, tape_type::START_ARRAY, tape_type::END_ARRAY);
}
|
|
/**
 * Finish the document: append a closing ROOT element that refers to tape index 0, then
 * write the opening ROOT element at index 0 so that it refers to the next free tape index.
 */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_document_end(json_iterator &iter) noexcept {
  using internal::tape_type;
  constexpr uint32_t root_index = 0;
  // Closing element first, so that next_tape_index() below already accounts for it.
  tape.append(root_index, tape_type::ROOT);
  tape_writer::write(iter.dom_parser.doc->tape[root_index], next_tape_index(iter), tape_type::ROOT);
  return SUCCESS;
}
|
|
/** A field key is stored exactly like a string; `key = true` only affects the log label. */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_key(json_iterator &iter, const uint8_t *key) noexcept {
  constexpr bool is_key = true;
  return visit_string(iter, key, is_key);
}
|
|
|
|
/** Bump the element/field counter of the container open at the iterator's current depth. */
simdjson_warn_unused simdjson_inline error_code tape_builder::increment_count(json_iterator &iter) noexcept {
  auto &container = iter.dom_parser.open_containers[iter.depth];
  ++container.count;
  return SUCCESS;
}
|
|
|
|
/** Start writing at the beginning of the document's tape and string buffer. */
simdjson_inline tape_builder::tape_builder(dom::document &doc) noexcept
  : tape{doc.tape.get()},
    current_string_buf_loc{doc.string_buf.get()} {
}
|
|
|
|
/**
 * Unescape the string whose opening quote is at `value` into the string buffer and record
 * it on the tape. Returns STRING_ERROR when stringparsing::parse_string reports failure.
 * `key` only selects the label used for logging.
 */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_string(json_iterator &iter, const uint8_t *value, bool key) noexcept {
  iter.log_value(key ? "key" : "string");
  uint8_t *const write_start = on_start_string(iter);
  // value + 1 skips the opening quote. Replacement of invalid escapes is not allowed here.
  uint8_t *const write_end = stringparsing::parse_string(value + 1, write_start, false);
  if (write_end == nullptr) {
    iter.log_error("Invalid escape in string");
    return STRING_ERROR;
  }
  on_end_string(write_end);
  return SUCCESS;
}
|
|
|
|
/** A root-level string is handled exactly like a nested (non-key) string. */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_string(json_iterator &iter, const uint8_t *value) noexcept {
  return this->visit_string(iter, value);
}
|
|
|
|
/** Log the value, then let numberparsing::parse_number parse it onto the tape. */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_number(json_iterator &iter, const uint8_t *value) noexcept {
  iter.log_value("number");
  const error_code result = numberparsing::parse_number(value, tape);
  return result;
}
|
|
|
|
/**
 * Parse a number that is the whole document.
 *
 * The remaining input is copied into a temporary buffer that is padded with spaces before
 * being parsed. The input itself should already be padded up to len + SIMDJSON_PADDING,
 * but at this stage there is no control over what the padding contains. A trailing null
 * character (C string) is common, and while 9\0 must not be accepted inside a JSON
 * document, the string "9\0" by itself is fine. Padding a private copy with spaces makes
 * the single-value case well defined.
 *
 * The copy is relatively expensive, but it only happens for documents that consist of a
 * single number. Returns MEMALLOC when the temporary buffer cannot be allocated.
 */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_number(json_iterator &iter, const uint8_t *value) noexcept {
  const auto remaining = iter.remaining_len();
  std::unique_ptr<uint8_t[]> padded(new (std::nothrow) uint8_t[remaining + SIMDJSON_PADDING]);
  if (!padded) { return MEMALLOC; }
  uint8_t *const buffer = padded.get();
  std::memcpy(buffer, value, remaining);
  std::memset(buffer + remaining, ' ', SIMDJSON_PADDING);
  return visit_number(iter, buffer);
}
|
|
|
|
/** Validate a nested `true` atom and append TRUE_VALUE to the tape; T_ATOM_ERROR if invalid. */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_true_atom(json_iterator &iter, const uint8_t *value) noexcept {
  iter.log_value("true");
  const bool valid = atomparsing::is_valid_true_atom(value);
  if (!valid) {
    return T_ATOM_ERROR;
  }
  tape.append(0, internal::tape_type::TRUE_VALUE);
  return SUCCESS;
}
|
|
|
|
/**
 * Validate a root-level `true` atom, passing the remaining input length to the validator,
 * and append TRUE_VALUE to the tape; T_ATOM_ERROR if invalid.
 */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_true_atom(json_iterator &iter, const uint8_t *value) noexcept {
  iter.log_value("true");
  const bool valid = atomparsing::is_valid_true_atom(value, iter.remaining_len());
  if (!valid) {
    return T_ATOM_ERROR;
  }
  tape.append(0, internal::tape_type::TRUE_VALUE);
  return SUCCESS;
}
|
|
|
|
/** Validate a nested `false` atom and append FALSE_VALUE to the tape; F_ATOM_ERROR if invalid. */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_false_atom(json_iterator &iter, const uint8_t *value) noexcept {
  iter.log_value("false");
  const bool valid = atomparsing::is_valid_false_atom(value);
  if (!valid) {
    return F_ATOM_ERROR;
  }
  tape.append(0, internal::tape_type::FALSE_VALUE);
  return SUCCESS;
}
|
|
|
|
/**
 * Validate a root-level `false` atom, passing the remaining input length to the validator,
 * and append FALSE_VALUE to the tape; F_ATOM_ERROR if invalid.
 */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_false_atom(json_iterator &iter, const uint8_t *value) noexcept {
  iter.log_value("false");
  const bool valid = atomparsing::is_valid_false_atom(value, iter.remaining_len());
  if (!valid) {
    return F_ATOM_ERROR;
  }
  tape.append(0, internal::tape_type::FALSE_VALUE);
  return SUCCESS;
}
|
|
|
|
/** Validate a nested `null` atom and append NULL_VALUE to the tape; N_ATOM_ERROR if invalid. */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_null_atom(json_iterator &iter, const uint8_t *value) noexcept {
  iter.log_value("null");
  const bool valid = atomparsing::is_valid_null_atom(value);
  if (!valid) {
    return N_ATOM_ERROR;
  }
  tape.append(0, internal::tape_type::NULL_VALUE);
  return SUCCESS;
}
|
|
|
|
/**
 * Validate a root-level `null` atom, passing the remaining input length to the validator,
 * and append NULL_VALUE to the tape; N_ATOM_ERROR if invalid.
 */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_null_atom(json_iterator &iter, const uint8_t *value) noexcept {
  iter.log_value("null");
  const bool valid = atomparsing::is_valid_null_atom(value, iter.remaining_len());
  if (!valid) {
    return N_ATOM_ERROR;
  }
  tape.append(0, internal::tape_type::NULL_VALUE);
  return SUCCESS;
}
|
|
|
|
// private:
|
|
|
|
/** Index of the next tape slot, i.e. the write position relative to the start of the document's tape. */
simdjson_inline uint32_t tape_builder::next_tape_index(json_iterator &iter) const noexcept {
  const auto *tape_begin = iter.dom_parser.doc->tape.get();
  return static_cast<uint32_t>(tape.next_tape_loc - tape_begin);
}
|
|
|
|
/**
 * Append the two tape elements of an empty container: the `start` element carries the index
 * just past the pair (its own index + 2), the `end` element carries the index of `start`.
 */
simdjson_warn_unused simdjson_inline error_code tape_builder::empty_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept {
  const auto opening_index = next_tape_index(iter);
  const auto past_the_pair = opening_index + 2;
  tape.append(past_the_pair, start);
  tape.append(opening_index, end);
  return SUCCESS;
}
|
|
|
|
/**
 * Remember where the container opening at the current depth begins on the tape, reset its
 * element count, and reserve the tape slot of its start element.
 */
simdjson_inline void tape_builder::start_container(json_iterator &iter) noexcept {
  auto &container = iter.dom_parser.open_containers[iter.depth];
  container.tape_index = next_tape_index(iter);
  container.count = 0;
  // The start element is not written yet; that happens when the container ends.
  tape.skip();
}
|
|
|
|
/**
 * Close the container open at the current depth: append its `end` element (pointing back
 * at the start slot) and fill in the previously reserved `start` element (pointing past the
 * end element and carrying the saturated element count in the bits above bit 31).
 */
simdjson_warn_unused simdjson_inline error_code tape_builder::end_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept {
  // Ending element: refers to the start location.
  const uint32_t opening_index = iter.dom_parser.open_containers[iter.depth].tape_index;
  tape.append(opening_index, end);

  // The count can exceed 24 bits, so it is saturated; by convention a stored count of
  // 0xffffff means "0xffffff or more" (undetermined).
  uint32_t saturated_count = iter.dom_parser.open_containers[iter.depth].count;
  if (saturated_count > 0xFFFFFF) {
    saturated_count = 0xFFFFFF;
  }

  // Start element: refers to the location after the end element and includes the count.
  const auto payload = next_tape_index(iter) | (uint64_t(saturated_count) << 32);
  tape_writer::write(iter.dom_parser.doc->tape[opening_index], payload, start);
  return SUCCESS;
}
|
|
|
|
/**
 * Append a STRING tape element holding the offset of the next string record in the string
 * buffer, and return where the string's characters are to be written: just past the
 * sizeof(uint32_t) bytes that on_end_string fills with the length.
 */
simdjson_inline uint8_t *tape_builder::on_start_string(json_iterator &iter) noexcept {
  const auto record_offset = current_string_buf_loc - iter.dom_parser.doc->string_buf.get();
  tape.append(record_offset, internal::tape_type::STRING);
  uint8_t *const characters = current_string_buf_loc + sizeof(uint32_t);
  return characters;
}
|
|
|
|
/**
 * Complete the string record that ends at `dst`: store its length in the sizeof(uint32_t)
 * bytes at the start of the record, null-terminate it, and move the write position just
 * past the terminator.
 */
simdjson_inline void tape_builder::on_end_string(uint8_t *dst) noexcept {
  uint8_t *const characters = current_string_buf_loc + sizeof(uint32_t);
  // TODO check for overflow in case someone has a crazy string (>=4GB?)
  // But only add the overflow check when the document itself exceeds 4GB
  // Currently unneeded because we refuse to parse docs larger or equal to 4GB.
  uint32_t str_length = uint32_t(dst - characters);
  memcpy(current_string_buf_loc, &str_length, sizeof(uint32_t));
  // The null terminator is a convenience for users who expect C strings; it costs one byte.
  *dst = 0;
  current_string_buf_loc = dst + 1;
}
|
|
|
|
} // namespace stage2
|
|
} // unnamed namespace
|
|
} // namespace westmere
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE2_TAPE_BUILDER_H
|
|
/* end file generic/stage2/tape_builder.h for westmere */
|
|
/* end file generic/stage2/amalgamated.h for westmere */
|
|
|
|
//
|
|
// Stage 1
|
|
//
|
|
|
|
namespace simdjson {
|
|
namespace westmere {
|
|
|
|
/**
 * Create a westmere dom_parser_implementation in `dst` and size it for `capacity` and
 * `max_depth`. Returns MEMALLOC when the object cannot be allocated, otherwise the first
 * error reported by set_capacity / set_max_depth, or SUCCESS.
 */
simdjson_warn_unused error_code implementation::create_dom_parser_implementation(
  size_t capacity,
  size_t max_depth,
  std::unique_ptr<internal::dom_parser_implementation>& dst
) const noexcept {
  dst.reset( new (std::nothrow) dom_parser_implementation() );
  if (!dst) { return MEMALLOC; }

  auto capacity_error = dst->set_capacity(capacity);
  if (capacity_error) { return capacity_error; }

  auto depth_error = dst->set_max_depth(max_depth);
  if (depth_error) { return depth_error; }

  return SUCCESS;
}
|
|
|
|
namespace {
|
|
|
|
using namespace simd;
|
|
|
|
/**
 * Classify 64 input bytes, returning one bitmask for whitespace and one for the JSON
 * operators ( , : [ ] { } ).
 *
 * Both masks are computed with a 16-entry byte shuffle (_mm_shuffle_epi8) that uses the
 * input bytes themselves as the shuffle control, followed by an equality test.
 */
simdjson_inline json_character_block json_character_block::classify(const simd::simd8x64<uint8_t>& in) {
  // These lookups rely on the fact that anything < 127 will match the lower 4 bits, which
  // is why the generic lookup_16 cannot be used. A slot holds the whitespace byte whose low
  // nibble equals the slot index (' ', '\t', '\n', '\r'); every other slot holds a filler
  // value whose own low nibble differs from the slot index, so it cannot compare equal.
  const auto whitespace_table = simd8<uint8_t>::repeat_16(' ', 100, 100, 100, 17, 100, 113, 2, 100, '\t', '\n', 112, 100, '\r', 100, 100);

  // The 6 operators (:,[]{}) have these values:
  //
  // , 2C
  // : 3A
  // [ 5B
  // { 7B
  // ] 5D
  // } 7D
  //
  // After | 0x20 turns [ and ] into { and }, the lower 4 bits of each operator are unique.
  // A 4-bit lookup therefore names the only operator the byte could be, and the byte
  // (with | 0x20 applied) is compared against it.
  //
  // Every other slot holds 0, which cannot match because of the | 0x20.
  //
  // NOTE: Due to the | 0x20, this ALSO treats <FF> and <SUB> (control characters 0C and 1A)
  // like , and :. This gets caught in stage 2, which checks the actual character to ensure
  // the right operators are in the right places.
  const auto op_table = simd8<uint8_t>::repeat_16(
    0, 0, 0, 0,
    0, 0, 0, 0,
    0, 0, ':', '{', // : = 3A, [ = 5B, { = 7B
    ',', '}', 0, 0  // , = 2C, ] = 5D, } = 7D
  );

  // Whitespace and operators are computed separately. Since everything here is
  // aggressively inlined, a caller that uses only one of the two (minifying only needs
  // whitespace) can hopefully have the other computation optimized away.
  const uint64_t whitespace_mask = in.eq({
    _mm_shuffle_epi8(whitespace_table, in.chunks[0]),
    _mm_shuffle_epi8(whitespace_table, in.chunks[1]),
    _mm_shuffle_epi8(whitespace_table, in.chunks[2]),
    _mm_shuffle_epi8(whitespace_table, in.chunks[3])
  });

  // Turn [ and ] into { and }
  const simd8x64<uint8_t> with_bit5_set{
    in.chunks[0] | 0x20,
    in.chunks[1] | 0x20,
    in.chunks[2] | 0x20,
    in.chunks[3] | 0x20
  };
  const uint64_t op_mask = with_bit5_set.eq({
    _mm_shuffle_epi8(op_table, in.chunks[0]),
    _mm_shuffle_epi8(op_table, in.chunks[1]),
    _mm_shuffle_epi8(op_table, in.chunks[2]),
    _mm_shuffle_epi8(op_table, in.chunks[3])
  });

  return { whitespace_mask, op_mask };
}
|
|
|
|
// Reports whether the 64-byte block is pure ASCII: the four chunks are OR-ed
// into a single vector, which is then asked whether it is ASCII.
simdjson_inline bool is_ascii(const simd8x64<uint8_t>& input) {
  const simd8<uint8_t> combined = input.reduce_or();
  return combined.is_ascii();
}
|
|
|
|
// Flags the lanes that must hold a UTF-8 continuation byte, given the input
// shifted back by one (prev1), two (prev2) and three (prev3) bytes.
simdjson_unused simdjson_inline simd8<bool> must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
  // Saturating subtraction leaves a non-zero lane only for a lead byte that is
  // large enough:
  //   prev1: only 11______ will be > 0
  //   prev2: only 111_____ will be > 0
  //   prev3: only 1111____ will be > 0
  const simd8<uint8_t> after_2byte_lead = prev1.saturating_sub(0xc0u-1);
  const simd8<uint8_t> after_3byte_lead = prev2.saturating_sub(0xe0u-1);
  const simd8<uint8_t> after_4byte_lead = prev3.saturating_sub(0xf0u-1);
  const simd8<uint8_t> any_lead = after_2byte_lead | after_3byte_lead | after_4byte_lead;
  // The caller needs a bool (all 1's). Every value produced by the
  // subtractions is <= 64, so a signed comparison is fine.
  return simd8<int8_t>(any_lead) > int8_t(0);
}
|
|
|
|
// Marks the lanes that must be the 2nd or 3rd continuation byte of a 3- or
// 4-byte UTF-8 sequence. A marked lane has its top bit (0x80) set; the result
// is a byte vector, not an all-ones boolean mask.
simdjson_inline simd8<uint8_t> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
  // Only 111_____ will be >= 0x80 after the subtraction.
  const simd8<uint8_t> follows_3byte_lead = prev2.saturating_sub(0xe0u-0x80);
  // Only 1111____ will be >= 0x80 after the subtraction.
  const simd8<uint8_t> follows_4byte_lead = prev3.saturating_sub(0xf0u-0x80);
  return follows_3byte_lead | follows_4byte_lead;
}
|
|
|
|
} // unnamed namespace
|
|
} // namespace westmere
|
|
} // namespace simdjson
|
|
|
|
//
|
|
// Stage 2
|
|
//
|
|
|
|
//
|
|
// Implementation-specific overrides
|
|
//
|
|
|
|
namespace simdjson {
|
|
namespace westmere {
|
|
|
|
// westmere entry point for minification: forwards all arguments to the
// stage-1 json_minifier, instantiated with a step size of 64.
simdjson_warn_unused error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept {
  const error_code status = westmere::stage1::json_minifier::minify<64>(buf, len, dst, dst_len);
  return status;
}
|
|
|
|
// Stage 1: remembers the input buffer and its length on the parser, then runs
// the structural indexer (step size 64) over it in the requested mode.
simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, stage1_mode streaming) noexcept {
  // Stored on the parser before indexing starts.
  this->len = _len;
  this->buf = _buf;
  const error_code status = westmere::stage1::json_structural_indexer::index<64>(_buf, _len, *this, streaming);
  return status;
}
|
|
|
|
// westmere entry point for UTF-8 validation: delegates to the generic stage-1
// validator and returns its verdict unchanged.
simdjson_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
  const bool verdict = westmere::stage1::generic_validate_utf8(buf, len);
  return verdict;
}
|
|
|
|
// Stage 2: runs the tape builder over this parser, writing into _doc. This is
// the parse_document<false> instantiation; stage2_next uses <true>.
simdjson_warn_unused error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept {
  const error_code status = stage2::tape_builder::parse_document<false>(*this, _doc);
  return status;
}
|
|
|
|
// Stage 2 variant that runs the tape builder with parse_document<true>
// (stage2 uses <false>), writing into _doc.
simdjson_warn_unused error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept {
  const error_code status = stage2::tape_builder::parse_document<true>(*this, _doc);
  return status;
}
|
|
|
|
// Forwards to the westmere string parser and returns the pointer it produces.
// src, dst and replacement_char are passed through untouched.
simdjson_warn_unused uint8_t *dom_parser_implementation::parse_string(const uint8_t *src, uint8_t *dst, bool replacement_char) const noexcept {
  uint8_t *const result = westmere::stringparsing::parse_string(src, dst, replacement_char);
  return result;
}
|
|
|
|
// Forwards to the westmere "wobbly" string parser and returns the pointer it
// produces. src and dst are passed through untouched.
simdjson_warn_unused uint8_t *dom_parser_implementation::parse_wobbly_string(const uint8_t *src, uint8_t *dst) const noexcept {
  uint8_t *const result = westmere::stringparsing::parse_wobbly_string(src, dst);
  return result;
}
|
|
|
|
// Full parse: runs stage 1 in regular mode and, only if that succeeded,
// stage 2 into _doc. A stage-1 error is returned as is, without running
// stage 2.
simdjson_warn_unused error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept {
  const error_code stage1_status = stage1(_buf, _len, stage1_mode::regular);
  if (!stage1_status) {
    return stage2(_doc);
  }
  return stage1_status;
}
|
|
|
|
} // namespace westmere
|
|
} // namespace simdjson
|
|
|
|
/* including simdjson/westmere/end.h: #include <simdjson/westmere/end.h> */
|
|
/* begin file simdjson/westmere/end.h */
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/westmere/base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
#if !SIMDJSON_CAN_ALWAYS_RUN_WESTMERE
|
|
SIMDJSON_UNTARGET_REGION
|
|
#endif
|
|
|
|
/* undefining SIMDJSON_IMPLEMENTATION from "westmere" */
|
|
#undef SIMDJSON_IMPLEMENTATION
|
|
/* end file simdjson/westmere/end.h */
|
|
|
|
#endif // SIMDJSON_SRC_WESTMERE_CPP
|
|
/* end file westmere.cpp */
|
|
#endif
|
|
#if SIMDJSON_IMPLEMENTATION_LSX
|
|
/* including lsx.cpp: #include <lsx.cpp> */
|
|
/* begin file lsx.cpp */
|
|
#ifndef SIMDJSON_SRC_LSX_CPP
|
|
#define SIMDJSON_SRC_LSX_CPP
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include <base.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
/* including simdjson/lsx.h: #include <simdjson/lsx.h> */
|
|
/* begin file simdjson/lsx.h */
|
|
#ifndef SIMDJSON_LSX_H
|
|
#define SIMDJSON_LSX_H
|
|
|
|
/* including simdjson/lsx/begin.h: #include "simdjson/lsx/begin.h" */
|
|
/* begin file simdjson/lsx/begin.h */
|
|
/* defining SIMDJSON_IMPLEMENTATION to "lsx" */
|
|
#define SIMDJSON_IMPLEMENTATION lsx
|
|
/* including simdjson/lsx/base.h: #include "simdjson/lsx/base.h" */
|
|
/* begin file simdjson/lsx/base.h */
|
|
#ifndef SIMDJSON_LSX_BASE_H
|
|
#define SIMDJSON_LSX_BASE_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
/**
 * Implementation for LSX.
 *
 * Only forward declarations live here; the definitions follow in the other
 * lsx headers.
 */
namespace lsx {

class implementation;

namespace {
namespace simd {

// 16-byte SIMD vector wrapper, specialized per element type.
template <typename T>
struct simd8;

// Four simd8<T> registers covering one 64-byte block.
template <typename T>
struct simd8x64;

} // namespace simd
} // unnamed namespace

} // namespace lsx
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_LSX_BASE_H
|
|
/* end file simdjson/lsx/base.h */
|
|
/* including simdjson/lsx/intrinsics.h: #include "simdjson/lsx/intrinsics.h" */
|
|
/* begin file simdjson/lsx/intrinsics.h */
|
|
#ifndef SIMDJSON_LSX_INTRINSICS_H
|
|
#define SIMDJSON_LSX_INTRINSICS_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/lsx/base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
// This should be the correct header whether
|
|
// you use visual studio or other compilers.
|
|
#include <lsxintrin.h>
|
|
|
|
static_assert(sizeof(__m128i) <= simdjson::SIMDJSON_PADDING, "insufficient padding for LoongArch SX");
|
|
|
|
#endif // SIMDJSON_LSX_INTRINSICS_H
|
|
/* end file simdjson/lsx/intrinsics.h */
|
|
/* including simdjson/lsx/bitmanipulation.h: #include "simdjson/lsx/bitmanipulation.h" */
|
|
/* begin file simdjson/lsx/bitmanipulation.h */
|
|
#ifndef SIMDJSON_LSX_BITMANIPULATION_H
|
|
#define SIMDJSON_LSX_BITMANIPULATION_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/lsx/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/lsx/intrinsics.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/lsx/bitmask.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace lsx {
|
|
namespace {
|
|
|
|
// Returns the number of trailing zero bits of input_num, via __builtin_ctzll.
//
// Callers sometimes pass zero, for which the builtin's result is undefined;
// the algorithms do not use the returned value in that case. Sanitizers
// cannot tell, so the undefined-behavior sanitizer is disabled here.
SIMDJSON_NO_SANITIZE_UNDEFINED
// The function may also be used safely when not all bytes of the input have
// been initialized, so the memory sanitizer is disabled as well.
// See issue https://github.com/simdjson/simdjson/issues/1965
SIMDJSON_NO_SANITIZE_MEMORY
simdjson_inline int trailing_zeroes(uint64_t input_num) {
  const int zero_bits = __builtin_ctzll(input_num);
  return zero_bits;
}
|
|
|
|
/* Clears the lowest set bit of input_num. Unsigned arithmetic wraps, so an
   input of zero simply yields zero. */
simdjson_inline uint64_t clear_lowest_bit(uint64_t input_num) {
  // Subtracting one flips the lowest set bit and every bit below it; the AND
  // keeps only the bits above.
  const uint64_t minus_one = input_num - 1;
  return minus_one & input_num;
}
|
|
|
|
/* Returns the number of leading zero bits of input_num, via __builtin_clzll.
   The result might be undefined when input_num is zero. */
simdjson_inline int leading_zeroes(uint64_t input_num) {
  const int zero_bits = __builtin_clzll(input_num);
  return zero_bits;
}
|
|
|
|
/* result might be undefined when input_num is zero */
|
|
simdjson_inline int count_ones(uint64_t input_num) {
|
|
return __lsx_vpickve2gr_w(__lsx_vpcnt_d(__m128i(v2u64{input_num, 0})), 0);
|
|
}
|
|
|
|
// Stores value1 + value2 (wrapped to 64 bits) in *result and returns true when
// the addition overflowed, false otherwise.
simdjson_inline bool add_overflow(uint64_t value1, uint64_t value2, uint64_t *result) {
  // The builtin takes an unsigned long long pointer for its output.
  auto *const sum_out = reinterpret_cast<unsigned long long *>(result);
  const bool overflowed = __builtin_uaddll_overflow(value1, value2, sum_out);
  return overflowed;
}
|
|
|
|
} // unnamed namespace
|
|
} // namespace lsx
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_LSX_BITMANIPULATION_H
|
|
/* end file simdjson/lsx/bitmanipulation.h */
|
|
/* including simdjson/lsx/bitmask.h: #include "simdjson/lsx/bitmask.h" */
|
|
/* begin file simdjson/lsx/bitmask.h */
|
|
#ifndef SIMDJSON_LSX_BITMASK_H
|
|
#define SIMDJSON_LSX_BITMASK_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/lsx/base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace lsx {
|
|
namespace {
|
|
|
|
//
// Computes a "cumulative bitwise xor": bit i of the result is the xor of bits
// 0..i of the input, so the output flips each time a 1 is encountered.
//
// For example, prefix_xor(00100100) == 00011100
//
simdjson_inline uint64_t prefix_xor(uint64_t bitmask) {
  uint64_t running = bitmask;
  // Doubling shifts (1, 2, 4, 8, 16, 32) fold all 64 bits in six steps.
  for (unsigned shift = 1; shift < 64; shift <<= 1) {
    running ^= running << shift;
  }
  return running;
}
|
|
|
|
} // unnamed namespace
|
|
} // namespace lsx
|
|
} // namespace simdjson
|
|
|
|
#endif
|
|
/* end file simdjson/lsx/bitmask.h */
|
|
/* including simdjson/lsx/numberparsing_defs.h: #include "simdjson/lsx/numberparsing_defs.h" */
|
|
/* begin file simdjson/lsx/numberparsing_defs.h */
|
|
#ifndef SIMDJSON_LSX_NUMBERPARSING_DEFS_H
|
|
#define SIMDJSON_LSX_NUMBERPARSING_DEFS_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/lsx/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/lsx/intrinsics.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/numberparsing_tables.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
#include <cstring>
|
|
|
|
namespace simdjson {
|
|
namespace lsx {
|
|
namespace numberparsing {
|
|
|
|
// No suitable vector instruction is used here; the eight ASCII digits are
// converted with a scalar (SWAR) routine instead.
// credit: https://johnnylee-sde.github.io/Fast-numeric-string-to-int/
/** @private */
static simdjson_inline uint32_t parse_eight_digits_unrolled(const uint8_t *chars) {
  // Load the 8 characters as one 64-bit word (memcpy avoids alignment issues).
  uint64_t packed;
  std::memcpy(&packed, chars, sizeof(packed));

  // Step 1: strip the ASCII bias and merge neighbouring digits into 2-digit
  // values (10 * first + second).
  const uint64_t digit_pairs = ((packed & 0x0F0F0F0F0F0F0F0F) * 2561) >> 8;
  // Step 2: merge neighbouring 2-digit values into 4-digit values.
  const uint64_t digit_quads = ((digit_pairs & 0x00FF00FF00FF00FF) * 6553601) >> 16;
  // Step 3: merge the two 4-digit values into the final 8-digit number.
  const uint64_t all_digits = ((digit_quads & 0x0000FFFF0000FFFF) * 42949672960001) >> 32;
  return uint32_t(all_digits);
}
|
|
|
|
// Multiplies two 64-bit values into the full 128-bit product, returned as its
// low and high 64-bit halves.
simdjson_inline internal::value128 full_multiplication(uint64_t value1, uint64_t value2) {
  // Widen one operand first so the multiplication is carried out in 128 bits.
  const __uint128_t product = static_cast<__uint128_t>(value1) * value2;
  internal::value128 halves;
  halves.high = uint64_t(product >> 64);
  halves.low = uint64_t(product);
  return halves;
}
|
|
|
|
} // namespace numberparsing
|
|
} // namespace lsx
|
|
} // namespace simdjson
|
|
|
|
#define SIMDJSON_SWAR_NUMBER_PARSING 1
|
|
|
|
#endif // SIMDJSON_LSX_NUMBERPARSING_DEFS_H
|
|
/* end file simdjson/lsx/numberparsing_defs.h */
|
|
/* including simdjson/lsx/simd.h: #include "simdjson/lsx/simd.h" */
|
|
/* begin file simdjson/lsx/simd.h */
|
|
#ifndef SIMDJSON_LSX_SIMD_H
|
|
#define SIMDJSON_LSX_SIMD_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/lsx/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/lsx/bitmanipulation.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/simdprune_tables.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace lsx {
|
|
namespace {
|
|
namespace simd {
|
|
|
|
// Forward-declared so they can be used by splat and friends.
|
|
template<typename Child>
|
|
struct base {
|
|
__m128i value;
|
|
|
|
// Zero constructor
|
|
simdjson_inline base() : value{__m128i()} {}
|
|
|
|
// Conversion from SIMD register
|
|
simdjson_inline base(const __m128i _value) : value(_value) {}
|
|
|
|
// Conversion to SIMD register
|
|
simdjson_inline operator const __m128i&() const { return this->value; }
|
|
simdjson_inline operator __m128i&() { return this->value; }
|
|
simdjson_inline operator const v16i8&() const { return (v16i8&)this->value; }
|
|
simdjson_inline operator v16i8&() { return (v16i8&)this->value; }
|
|
|
|
// Bit operations
|
|
simdjson_inline Child operator|(const Child other) const { return __lsx_vor_v(*this, other); }
|
|
simdjson_inline Child operator&(const Child other) const { return __lsx_vand_v(*this, other); }
|
|
simdjson_inline Child operator^(const Child other) const { return __lsx_vxor_v(*this, other); }
|
|
simdjson_inline Child bit_andnot(const Child other) const { return __lsx_vandn_v(other, *this); }
|
|
simdjson_inline Child& operator|=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast | other; return *this_cast; }
|
|
simdjson_inline Child& operator&=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast & other; return *this_cast; }
|
|
simdjson_inline Child& operator^=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast ^ other; return *this_cast; }
|
|
};
|
|
|
|
// Forward-declared so they can be used by splat and friends.
|
|
template<typename T>
|
|
struct simd8;
|
|
|
|
template<typename T, typename Mask=simd8<bool>>
|
|
struct base8: base<simd8<T>> {
|
|
simdjson_inline base8() : base<simd8<T>>() {}
|
|
simdjson_inline base8(const __m128i _value) : base<simd8<T>>(_value) {}
|
|
|
|
friend simdjson_really_inline Mask operator==(const simd8<T> lhs, const simd8<T> rhs) { return __lsx_vseq_b(lhs, rhs); }
|
|
|
|
static const int SIZE = sizeof(base<simd8<T>>::value);
|
|
|
|
template<int N=1>
|
|
simdjson_inline simd8<T> prev(const simd8<T> prev_chunk) const {
|
|
return __lsx_vor_v(__lsx_vbsll_v(*this, N), __lsx_vbsrl_v(prev_chunk, 16 - N));
|
|
}
|
|
};
|
|
|
|
// SIMD byte mask type (returned by things like eq and gt)
|
|
template<>
|
|
struct simd8<bool>: base8<bool> {
|
|
static simdjson_inline simd8<bool> splat(bool _value) {
|
|
return __lsx_vreplgr2vr_b(uint8_t(-(!!_value)));
|
|
}
|
|
|
|
simdjson_inline simd8() : base8() {}
|
|
simdjson_inline simd8(const __m128i _value) : base8<bool>(_value) {}
|
|
// Splat constructor
|
|
simdjson_inline simd8(bool _value) : base8<bool>(splat(_value)) {}
|
|
|
|
simdjson_inline int to_bitmask() const { return __lsx_vpickve2gr_w(__lsx_vmskltz_b(*this), 0); }
|
|
simdjson_inline bool any() const { return 0 == __lsx_vpickve2gr_hu(__lsx_vmsknz_b(*this), 0); }
|
|
simdjson_inline simd8<bool> operator~() const { return *this ^ true; }
|
|
};
|
|
|
|
template<typename T>
|
|
struct base8_numeric: base8<T> {
|
|
static simdjson_inline simd8<T> splat(T _value) { return __lsx_vreplgr2vr_b(_value); }
|
|
static simdjson_inline simd8<T> zero() { return __lsx_vldi(0); }
|
|
static simdjson_inline simd8<T> load(const T values[16]) {
|
|
return __lsx_vld(reinterpret_cast<const __m128i *>(values), 0);
|
|
}
|
|
// Repeat 16 values as many times as necessary (usually for lookup tables)
|
|
static simdjson_inline simd8<T> repeat_16(
|
|
T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7,
|
|
T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15
|
|
) {
|
|
return simd8<T>(
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15
|
|
);
|
|
}
|
|
|
|
simdjson_inline base8_numeric() : base8<T>() {}
|
|
simdjson_inline base8_numeric(const __m128i _value) : base8<T>(_value) {}
|
|
|
|
// Store to array
|
|
simdjson_inline void store(T dst[16]) const {
|
|
return __lsx_vst(*this, reinterpret_cast<__m128i *>(dst), 0);
|
|
}
|
|
|
|
// Addition/subtraction are the same for signed and unsigned
|
|
simdjson_inline simd8<T> operator+(const simd8<T> other) const { return __lsx_vadd_b(*this, other); }
|
|
simdjson_inline simd8<T> operator-(const simd8<T> other) const { return __lsx_vsub_b(*this, other); }
|
|
simdjson_inline simd8<T>& operator+=(const simd8<T> other) { *this = *this + other; return *static_cast<simd8<T>*>(this); }
|
|
simdjson_inline simd8<T>& operator-=(const simd8<T> other) { *this = *this - other; return *static_cast<simd8<T>*>(this); }
|
|
|
|
// Override to distinguish from bool version
|
|
simdjson_inline simd8<T> operator~() const { return *this ^ 0xFFu; }
|
|
|
|
// Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values)
|
|
template<typename L>
|
|
simdjson_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
|
|
return __lsx_vshuf_b(lookup_table, lookup_table, *this);
|
|
}
|
|
|
|
// Copies to 'output" all bytes corresponding to a 0 in the mask (interpreted as a bitset).
|
|
// Passing a 0 value for mask would be equivalent to writing out every byte to output.
|
|
// Only the first 16 - count_ones(mask) bytes of the result are significant but 16 bytes
|
|
// get written.
|
|
template<typename L>
|
|
simdjson_inline void compress(uint16_t mask, L * output) const {
|
|
using internal::thintable_epi8;
|
|
using internal::BitsSetTable256mul2;
|
|
using internal::pshufb_combine_table;
|
|
// this particular implementation was inspired by haswell
|
|
// lsx do it in 2 steps, first 8 bytes and then second 8 bytes...
|
|
uint8_t mask1 = uint8_t(mask); // least significant 8 bits
|
|
uint8_t mask2 = uint8_t(mask >> 8); // second least significant 8 bits
|
|
// next line just loads the 64-bit values thintable_epi8[mask1] and
|
|
// thintable_epi8[mask2] into a 128-bit register.
|
|
__m128i shufmask = {int64_t(thintable_epi8[mask1]), int64_t(thintable_epi8[mask2]) + 0x0808080808080808};
|
|
// this is the version "nearly pruned"
|
|
__m128i pruned = __lsx_vshuf_b(*this, *this, shufmask);
|
|
// we still need to put the pieces back together.
|
|
// we compute the popcount of the first words:
|
|
int pop1 = BitsSetTable256mul2[mask1];
|
|
// then load the corresponding mask
|
|
__m128i compactmask = __lsx_vldx(reinterpret_cast<void*>(reinterpret_cast<unsigned long>(pshufb_combine_table)), pop1 * 8);
|
|
__m128i answer = __lsx_vshuf_b(pruned, pruned, compactmask);
|
|
__lsx_vst(answer, reinterpret_cast<uint8_t*>(output), 0);
|
|
}
|
|
|
|
template<typename L>
|
|
simdjson_inline simd8<L> lookup_16(
|
|
L replace0, L replace1, L replace2, L replace3,
|
|
L replace4, L replace5, L replace6, L replace7,
|
|
L replace8, L replace9, L replace10, L replace11,
|
|
L replace12, L replace13, L replace14, L replace15) const {
|
|
return lookup_16(simd8<L>::repeat_16(
|
|
replace0, replace1, replace2, replace3,
|
|
replace4, replace5, replace6, replace7,
|
|
replace8, replace9, replace10, replace11,
|
|
replace12, replace13, replace14, replace15
|
|
));
|
|
}
|
|
};
|
|
|
|
// Signed bytes
|
|
template<>
|
|
struct simd8<int8_t> : base8_numeric<int8_t> {
|
|
simdjson_inline simd8() : base8_numeric<int8_t>() {}
|
|
simdjson_inline simd8(const __m128i _value) : base8_numeric<int8_t>(_value) {}
|
|
// Splat constructor
|
|
simdjson_inline simd8(int8_t _value) : simd8(splat(_value)) {}
|
|
// Array constructor
|
|
simdjson_inline simd8(const int8_t values[16]) : simd8(load(values)) {}
|
|
// Member-by-member initialization
|
|
simdjson_inline simd8(
|
|
int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
|
|
int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
|
|
) : simd8({
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15
|
|
}) {}
|
|
// Repeat 16 values as many times as necessary (usually for lookup tables)
|
|
simdjson_inline static simd8<int8_t> repeat_16(
|
|
int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
|
|
int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
|
|
) {
|
|
return simd8<int8_t>(
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15
|
|
);
|
|
}
|
|
|
|
// Order-sensitive comparisons
|
|
simdjson_inline simd8<int8_t> max_val(const simd8<int8_t> other) const { return __lsx_vmax_b(*this, other); }
|
|
simdjson_inline simd8<int8_t> min_val(const simd8<int8_t> other) const { return __lsx_vmin_b(*this, other); }
|
|
simdjson_inline simd8<bool> operator>(const simd8<int8_t> other) const { return __lsx_vslt_b(other, *this); }
|
|
simdjson_inline simd8<bool> operator<(const simd8<int8_t> other) const { return __lsx_vslt_b(*this, other); }
|
|
};
|
|
|
|
// Unsigned bytes
|
|
template<>
|
|
struct simd8<uint8_t>: base8_numeric<uint8_t> {
|
|
simdjson_inline simd8() : base8_numeric<uint8_t>() {}
|
|
simdjson_inline simd8(const __m128i _value) : base8_numeric<uint8_t>(_value) {}
|
|
// Splat constructor
|
|
simdjson_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
|
|
// Array constructor
|
|
simdjson_inline simd8(const uint8_t values[16]) : simd8(load(values)) {}
|
|
// Member-by-member initialization
|
|
simdjson_inline simd8(
|
|
uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
|
|
uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
|
|
) : simd8(__m128i(v16u8{
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15
|
|
})) {}
|
|
// Repeat 16 values as many times as necessary (usually for lookup tables)
|
|
simdjson_inline static simd8<uint8_t> repeat_16(
|
|
uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
|
|
uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
|
|
) {
|
|
return simd8<uint8_t>(
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15
|
|
);
|
|
}
|
|
|
|
// Saturated math
|
|
simdjson_inline simd8<uint8_t> saturating_add(const simd8<uint8_t> other) const { return __lsx_vsadd_bu(*this, other); }
|
|
simdjson_inline simd8<uint8_t> saturating_sub(const simd8<uint8_t> other) const { return __lsx_vssub_bu(*this, other); }
|
|
|
|
// Order-specific operations
|
|
simdjson_inline simd8<uint8_t> max_val(const simd8<uint8_t> other) const { return __lsx_vmax_bu(*this, other); }
|
|
simdjson_inline simd8<uint8_t> min_val(const simd8<uint8_t> other) const { return __lsx_vmin_bu(other, *this); }
|
|
// Same as >, but only guarantees true is nonzero (< guarantees true = -1)
|
|
simdjson_inline simd8<uint8_t> gt_bits(const simd8<uint8_t> other) const { return this->saturating_sub(other); }
|
|
// Same as <, but only guarantees true is nonzero (< guarantees true = -1)
|
|
simdjson_inline simd8<uint8_t> lt_bits(const simd8<uint8_t> other) const { return other.saturating_sub(*this); }
|
|
simdjson_inline simd8<bool> operator<=(const simd8<uint8_t> other) const { return other.max_val(*this) == other; }
|
|
simdjson_inline simd8<bool> operator>=(const simd8<uint8_t> other) const { return other.min_val(*this) == other; }
|
|
simdjson_inline simd8<bool> operator>(const simd8<uint8_t> other) const { return this->gt_bits(other).any_bits_set(); }
|
|
simdjson_inline simd8<bool> operator<(const simd8<uint8_t> other) const { return this->lt_bits(other).any_bits_set(); }
|
|
|
|
// Bit-specific operations
|
|
simdjson_inline simd8<bool> bits_not_set() const { return *this == uint8_t(0); }
|
|
simdjson_inline simd8<bool> bits_not_set(simd8<uint8_t> bits) const { return (*this & bits).bits_not_set(); }
|
|
simdjson_inline simd8<bool> any_bits_set() const { return ~this->bits_not_set(); }
|
|
simdjson_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const { return ~this->bits_not_set(bits); }
|
|
simdjson_inline bool is_ascii() const { return 0 == __lsx_vpickve2gr_w(__lsx_vmskltz_b(*this), 0); }
|
|
simdjson_inline bool bits_not_set_anywhere() const { return 0 == __lsx_vpickve2gr_hu(__lsx_vmsknz_b(*this), 0); }
|
|
simdjson_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); }
|
|
simdjson_inline bool bits_not_set_anywhere(simd8<uint8_t> bits) const {
|
|
return 0 == __lsx_vpickve2gr_hu(__lsx_vmsknz_b(__lsx_vand_v(*this, bits)), 0);
|
|
}
|
|
simdjson_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const { return !bits_not_set_anywhere(bits); }
|
|
template<int N>
|
|
simdjson_inline simd8<uint8_t> shr() const { return simd8<uint8_t>(__lsx_vsrli_b(*this, N)); }
|
|
template<int N>
|
|
simdjson_inline simd8<uint8_t> shl() const { return simd8<uint8_t>(__lsx_vslli_b(*this, N)); }
|
|
};
|
|
|
|
template<typename T>
|
|
struct simd8x64 {
|
|
static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
|
|
static_assert(NUM_CHUNKS == 4, "LSX kernel should use four registers per 64-byte block.");
|
|
const simd8<T> chunks[NUM_CHUNKS];
|
|
|
|
simd8x64(const simd8x64<T>& o) = delete; // no copy allowed
|
|
simd8x64<T>& operator=(const simd8<T>& other) = delete; // no assignment allowed
|
|
simd8x64() = delete; // no default constructor allowed
|
|
|
|
simdjson_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1, const simd8<T> chunk2, const simd8<T> chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {}
|
|
simdjson_inline simd8x64(const T ptr[64]) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+16), simd8<T>::load(ptr+32), simd8<T>::load(ptr+48)} {}
|
|
|
|
simdjson_inline uint64_t compress(uint64_t mask, T * output) const {
|
|
uint16_t mask1 = uint16_t(mask);
|
|
uint16_t mask2 = uint16_t(mask >> 16);
|
|
uint16_t mask3 = uint16_t(mask >> 32);
|
|
uint16_t mask4 = uint16_t(mask >> 48);
|
|
__m128i zcnt = __lsx_vpcnt_h(__m128i(v2u64{~mask, 0}));
|
|
uint64_t zcnt1 = __lsx_vpickve2gr_hu(zcnt, 0);
|
|
uint64_t zcnt2 = __lsx_vpickve2gr_hu(zcnt, 1);
|
|
uint64_t zcnt3 = __lsx_vpickve2gr_hu(zcnt, 2);
|
|
uint64_t zcnt4 = __lsx_vpickve2gr_hu(zcnt, 3);
|
|
uint8_t *voutput = reinterpret_cast<uint8_t*>(output);
|
|
// There should be a critical value which processes in scaler is faster.
|
|
if (zcnt1)
|
|
this->chunks[0].compress(mask1, reinterpret_cast<T*>(voutput));
|
|
voutput += zcnt1;
|
|
if (zcnt2)
|
|
this->chunks[1].compress(mask2, reinterpret_cast<T*>(voutput));
|
|
voutput += zcnt2;
|
|
if (zcnt3)
|
|
this->chunks[2].compress(mask3, reinterpret_cast<T*>(voutput));
|
|
voutput += zcnt3;
|
|
if (zcnt4)
|
|
this->chunks[3].compress(mask4, reinterpret_cast<T*>(voutput));
|
|
voutput += zcnt4;
|
|
return reinterpret_cast<uint64_t>(voutput) - reinterpret_cast<uint64_t>(output);
|
|
}
|
|
|
|
simdjson_inline void store(T ptr[64]) const {
|
|
this->chunks[0].store(ptr+sizeof(simd8<T>)*0);
|
|
this->chunks[1].store(ptr+sizeof(simd8<T>)*1);
|
|
this->chunks[2].store(ptr+sizeof(simd8<T>)*2);
|
|
this->chunks[3].store(ptr+sizeof(simd8<T>)*3);
|
|
}
|
|
|
|
simdjson_inline uint64_t to_bitmask() const {
|
|
__m128i mask1 = __lsx_vmskltz_b(this->chunks[0]);
|
|
__m128i mask2 = __lsx_vmskltz_b(this->chunks[1]);
|
|
__m128i mask3 = __lsx_vmskltz_b(this->chunks[2]);
|
|
__m128i mask4 = __lsx_vmskltz_b(this->chunks[3]);
|
|
mask1 = __lsx_vilvl_h(mask2, mask1);
|
|
mask2 = __lsx_vilvl_h(mask4, mask3);
|
|
return __lsx_vpickve2gr_du(__lsx_vilvl_w(mask2, mask1), 0);
|
|
}
|
|
|
|
simdjson_inline simd8<T> reduce_or() const {
|
|
return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]);
|
|
}
|
|
|
|
simdjson_inline uint64_t eq(const T m) const {
|
|
const simd8<T> mask = simd8<T>::splat(m);
|
|
return simd8x64<bool>(
|
|
this->chunks[0] == mask,
|
|
this->chunks[1] == mask,
|
|
this->chunks[2] == mask,
|
|
this->chunks[3] == mask
|
|
).to_bitmask();
|
|
}
|
|
|
|
simdjson_inline uint64_t eq(const simd8x64<uint8_t> &other) const {
|
|
return simd8x64<bool>(
|
|
this->chunks[0] == other.chunks[0],
|
|
this->chunks[1] == other.chunks[1],
|
|
this->chunks[2] == other.chunks[2],
|
|
this->chunks[3] == other.chunks[3]
|
|
).to_bitmask();
|
|
}
|
|
|
|
simdjson_inline uint64_t lteq(const T m) const {
|
|
const simd8<T> mask = simd8<T>::splat(m);
|
|
return simd8x64<bool>(
|
|
this->chunks[0] <= mask,
|
|
this->chunks[1] <= mask,
|
|
this->chunks[2] <= mask,
|
|
this->chunks[3] <= mask
|
|
).to_bitmask();
|
|
}
|
|
}; // struct simd8x64<T>
|
|
|
|
} // namespace simd
|
|
} // unnamed namespace
|
|
} // namespace lsx
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_LSX_SIMD_H
|
|
/* end file simdjson/lsx/simd.h */
|
|
/* including simdjson/lsx/stringparsing_defs.h: #include "simdjson/lsx/stringparsing_defs.h" */
|
|
/* begin file simdjson/lsx/stringparsing_defs.h */
|
|
#ifndef SIMDJSON_LSX_STRINGPARSING_DEFS_H
|
|
#define SIMDJSON_LSX_STRINGPARSING_DEFS_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/lsx/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/lsx/simd.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/lsx/bitmanipulation.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace lsx {
|
|
namespace {
|
|
|
|
using namespace simd;
|
|
|
|
// Holds backslashes and quotes locations.
|
|
struct backslash_and_quote {
|
|
public:
|
|
static constexpr uint32_t BYTES_PROCESSED = 32;
|
|
simdjson_inline static backslash_and_quote copy_and_find(const uint8_t *src, uint8_t *dst);
|
|
|
|
simdjson_inline bool has_quote_first() { return ((bs_bits - 1) & quote_bits) != 0; }
|
|
simdjson_inline bool has_backslash() { return bs_bits != 0; }
|
|
simdjson_inline int quote_index() { return trailing_zeroes(quote_bits); }
|
|
simdjson_inline int backslash_index() { return trailing_zeroes(bs_bits); }
|
|
|
|
uint32_t bs_bits;
|
|
uint32_t quote_bits;
|
|
}; // struct backslash_and_quote
|
|
|
|
// Copies BYTES_PROCESSED (32) bytes from src to dst and returns the positions
// of the backslashes and quotes found among them.
simdjson_inline backslash_and_quote backslash_and_quote::copy_and_find(const uint8_t *src, uint8_t *dst) {
  // this can read up to 31 bytes beyond the buffer size, but we require
  // SIMDJSON_PADDING of padding
  static_assert(SIMDJSON_PADDING >= (BYTES_PROCESSED - 1), "backslash and quote finder must process fewer than SIMDJSON_PADDING bytes");

  // Load both 16-byte halves before writing them out.
  simd8<uint8_t> low_half(src);
  simd8<uint8_t> high_half(src + sizeof(low_half));
  low_half.store(dst);
  high_half.store(dst + sizeof(low_half));

  // On LSX one 64-bit bitmask is much cheaper than several 16-bit ones, so the
  // four comparison results are packed into a single 64-byte mask: backslash
  // matches land in the low 32 bits, quote matches in the high 32 bits.
  const uint64_t packed_bits = simd8x64<bool>(
    low_half == '\\', high_half == '\\',
    low_half == '"',  high_half == '"'
  ).to_bitmask();
  return {
    uint32_t(packed_bits),      // bs_bits
    uint32_t(packed_bits >> 32) // quote_bits
  };
}
|
|
|
|
} // unnamed namespace
|
|
} // namespace lsx
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_LSX_STRINGPARSING_DEFS_H
|
|
/* end file simdjson/lsx/stringparsing_defs.h */
|
|
|
|
#define SIMDJSON_SKIP_BACKSLASH_SHORT_CIRCUIT 1
|
|
/* end file simdjson/lsx/begin.h */
|
|
/* including simdjson/generic/amalgamated.h for lsx: #include "simdjson/generic/amalgamated.h" */
|
|
/* begin file simdjson/generic/amalgamated.h for lsx */
|
|
#if defined(SIMDJSON_CONDITIONAL_INCLUDE) && !defined(SIMDJSON_GENERIC_DEPENDENCIES_H)
|
|
#error simdjson/generic/dependencies.h must be included before simdjson/generic/amalgamated.h!
|
|
#endif
|
|
|
|
/* including simdjson/generic/base.h for lsx: #include "simdjson/generic/base.h" */
|
|
/* begin file simdjson/generic/base.h for lsx */
|
|
#ifndef SIMDJSON_GENERIC_BASE_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_GENERIC_BASE_H */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/base.h" */
|
|
/* amalgamation skipped (editor-only): // If we haven't got an implementation yet, we're in the editor, editing a generic file! Just */
|
|
/* amalgamation skipped (editor-only): // use the most advanced one we can so the most possible stuff can be tested. */
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_IMPLEMENTATION */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/implementation_detection.h" */
|
|
/* amalgamation skipped (editor-only): #if SIMDJSON_IMPLEMENTATION_ICELAKE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/icelake/begin.h" */
|
|
/* amalgamation skipped (editor-only): #elif SIMDJSON_IMPLEMENTATION_HASWELL */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/haswell/begin.h" */
|
|
/* amalgamation skipped (editor-only): #elif SIMDJSON_IMPLEMENTATION_WESTMERE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/westmere/begin.h" */
|
|
/* amalgamation skipped (editor-only): #elif SIMDJSON_IMPLEMENTATION_ARM64 */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/arm64/begin.h" */
|
|
/* amalgamation skipped (editor-only): #elif SIMDJSON_IMPLEMENTATION_PPC64 */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/ppc64/begin.h" */
|
|
/* amalgamation skipped (editor-only): #elif SIMDJSON_IMPLEMENTATION_LSX */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/lsx/begin.h" */
|
|
/* amalgamation skipped (editor-only): #elif SIMDJSON_IMPLEMENTATION_LASX */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/lasx/begin.h" */
|
|
/* amalgamation skipped (editor-only): #elif SIMDJSON_IMPLEMENTATION_FALLBACK */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/fallback/begin.h" */
|
|
/* amalgamation skipped (editor-only): #else */
|
|
/* amalgamation skipped (editor-only): #error "All possible implementations (including fallback) have been disabled! simdjson will not run." */
|
|
/* amalgamation skipped (editor-only): #endif */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_IMPLEMENTATION */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace lsx {
|
|
|
|
struct open_container;
|
|
class dom_parser_implementation;
|
|
|
|
/**
 * The type of a JSON number
 */
enum class number_type {
  floating_point_number=1, ///< a binary64 number
  signed_integer,          ///< a signed integer that fits in a 64-bit word using two's complement
  unsigned_integer,        ///< a positive integer larger or equal to 1<<63
  big_integer              ///< a big integer that does not fit in a 64-bit word
};
|
|
|
|
} // namespace lsx
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_GENERIC_BASE_H
|
|
/* end file simdjson/generic/base.h for lsx */
|
|
/* including simdjson/generic/jsoncharutils.h for lsx: #include "simdjson/generic/jsoncharutils.h" */
|
|
/* begin file simdjson/generic/jsoncharutils.h for lsx */
|
|
#ifndef SIMDJSON_GENERIC_JSONCHARUTILS_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_GENERIC_JSONCHARUTILS_H */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/generic/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/jsoncharutils_tables.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/numberparsing_tables.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace lsx {
|
|
namespace {
|
|
namespace jsoncharutils {
|
|
|
|
// return non-zero if not a structural or whitespace char
|
|
// zero otherwise
|
|
simdjson_inline uint32_t is_not_structural_or_whitespace(uint8_t c) {
|
|
return internal::structural_or_whitespace_negated[c];
|
|
}
|
|
|
|
// Counterpart of is_not_structural_or_whitespace: looks the byte up in the
// non-negated table (presumably non-zero for structural or whitespace
// characters, zero otherwise — confirm against the table definition).
simdjson_inline uint32_t is_structural_or_whitespace(uint8_t c) {
  return internal::structural_or_whitespace[c];
}
|
|
|
|
// returns a value with the high 16 bits set if not valid
|
|
// otherwise returns the conversion of the 4 hex digits at src into the bottom
|
|
// 16 bits of the 32-bit return register
|
|
//
|
|
// see
|
|
// https://lemire.me/blog/2019/04/17/parsing-short-hexadecimal-strings-efficiently/
|
|
static inline uint32_t hex_to_u32_nocheck(
|
|
const uint8_t *src) { // strictly speaking, static inline is a C-ism
|
|
uint32_t v1 = internal::digit_to_val32[630 + src[0]];
|
|
uint32_t v2 = internal::digit_to_val32[420 + src[1]];
|
|
uint32_t v3 = internal::digit_to_val32[210 + src[2]];
|
|
uint32_t v4 = internal::digit_to_val32[0 + src[3]];
|
|
return v1 | v2 | v3 | v4;
|
|
}
|
|
|
|
// given a code point cp, writes to c
// the utf-8 code, outputting the length in
// bytes, if the length is zero, the code point
// is invalid
//
// c must have room for up to 4 bytes. Nothing is written when 0 is returned.
//
// This can possibly be made faster using pdep
// and clz and table lookups, but JSON documents
// have few escaped code points, and the following
// function looks cheap.
//
// Note: we assume that surrogates are treated separately; values in
// 0xD800..0xDFFF are NOT rejected here and are encoded as three bytes.
//
simdjson_inline size_t codepoint_to_utf8(uint32_t cp, uint8_t *c) {
  if (cp <= 0x7F) {
    // one byte: plain ASCII
    c[0] = uint8_t(cp);
    return 1;
  }
  if (cp <= 0x7FF) {
    // two bytes: 110xxxxx 10xxxxxx
    c[0] = uint8_t((cp >> 6) + 192);
    c[1] = uint8_t((cp & 63) + 128);
    return 2;
  }
  if (cp <= 0xFFFF) {
    // three bytes: 1110xxxx 10xxxxxx 10xxxxxx
    c[0] = uint8_t((cp >> 12) + 224);
    c[1] = uint8_t(((cp >> 6) & 63) + 128);
    c[2] = uint8_t((cp & 63) + 128);
    return 3;
  }
  if (cp <= 0x10FFFF) { // if you know you have a valid code point, this
                        // is not needed
    // four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    c[0] = uint8_t((cp >> 18) + 240);
    c[1] = uint8_t(((cp >> 12) & 63) + 128);
    c[2] = uint8_t(((cp >> 6) & 63) + 128);
    c[3] = uint8_t((cp & 63) + 128);
    return 4;
  }
  // will return 0 when the code point was too large.
  return 0;
}
|
|
|
|
#if SIMDJSON_IS_32BITS // _umul128 for x86, arm
|
|
// this is a slow emulation routine for 32-bit
//
// __emulu: full 64-bit product of two 32-bit unsigned values.
static simdjson_inline uint64_t __emulu(uint32_t x, uint32_t y) {
  return x * static_cast<uint64_t>(y);
}
// _umul128: 128-bit product of two 64-bit unsigned values, built from four
// 32x32->64 partial products. Writing ab = a:b and cd = c:d (high:low halves),
// the product is a*c*2^64 + (a*d + b*c)*2^32 + b*d.
// Returns the low 64 bits and stores the high 64 bits in *hi.
static simdjson_inline uint64_t _umul128(uint64_t ab, uint64_t cd, uint64_t *hi) {
  uint64_t ad = __emulu(static_cast<uint32_t>(ab >> 32), static_cast<uint32_t>(cd));
  uint64_t bd = __emulu(static_cast<uint32_t>(ab), static_cast<uint32_t>(cd));
  uint64_t adbc = ad + __emulu(static_cast<uint32_t>(ab), static_cast<uint32_t>(cd >> 32));
  // the sum of the two cross terms may overflow 64 bits; remember the carry
  uint64_t adbc_carry = (adbc < ad) ? 1 : 0;
  uint64_t lo = bd + (adbc << 32);
  // a wrapped low word means one more carry into the high word
  uint64_t lo_carry = (lo < bd) ? 1 : 0;
  *hi = __emulu(static_cast<uint32_t>(ab >> 32), static_cast<uint32_t>(cd >> 32)) + (adbc >> 32) +
        (adbc_carry << 32) + lo_carry;
  return lo;
}
|
|
#endif
|
|
|
|
} // namespace jsoncharutils
|
|
} // unnamed namespace
|
|
} // namespace lsx
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_GENERIC_JSONCHARUTILS_H
|
|
/* end file simdjson/generic/jsoncharutils.h for lsx */
|
|
/* including simdjson/generic/atomparsing.h for lsx: #include "simdjson/generic/atomparsing.h" */
|
|
/* begin file simdjson/generic/atomparsing.h for lsx */
|
|
#ifndef SIMDJSON_GENERIC_ATOMPARSING_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_GENERIC_ATOMPARSING_H */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/generic/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/generic/jsoncharutils.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
#include <cstring>
|
|
|
|
namespace simdjson {
|
|
namespace lsx {
|
|
namespace {
|
|
/// @private
|
|
namespace atomparsing {
|
|
|
|
// The string_to_uint32 is exclusively used to map literal strings to 32-bit values.
// We use memcpy instead of a pointer cast to avoid undefined behaviors since we cannot
// be certain that the character pointer will be properly aligned.
// You might think that using memcpy makes this function expensive, but you'd be wrong.
// All decent optimizing compilers (GCC, clang, Visual Studio) will compile string_to_uint32("false");
// to the compile-time constant 1936482662 (the value on a little-endian target).
// The first sizeof(uint32_t) bytes of str are read, so str must be at least that long.
simdjson_inline uint32_t string_to_uint32(const char* str) {
  uint32_t val;
  std::memcpy(&val, str, sizeof(uint32_t));
  return val;
}
|
|
|
|
|
|
// Again in str4ncmp we use a memcpy to avoid undefined behavior. The memcpy may appear expensive.
|
|
// Yet all decent optimizing compilers will compile memcpy to a single instruction, just about.
|
|
simdjson_warn_unused
|
|
simdjson_inline uint32_t str4ncmp(const uint8_t *src, const char* atom) {
|
|
uint32_t srcval; // we want to avoid unaligned 32-bit loads (undefined in C/C++)
|
|
static_assert(sizeof(uint32_t) <= SIMDJSON_PADDING, "SIMDJSON_PADDING must be larger than 4 bytes");
|
|
std::memcpy(&srcval, src, sizeof(uint32_t));
|
|
return srcval ^ string_to_uint32(atom);
|
|
}
|
|
|
|
// True when src starts with "true" and the byte right after it (src[4]) is a
// structural or whitespace character. Reads src[0..4].
simdjson_warn_unused
simdjson_inline bool is_valid_true_atom(const uint8_t *src) {
  return (str4ncmp(src, "true") | jsoncharutils::is_not_structural_or_whitespace(src[4])) == 0;
}

// Length-aware variant: len is the number of bytes available at src.
//  - more than 4 bytes: the terminator at src[4] is checked as well;
//  - exactly 4 bytes: only the four letters are compared;
//  - fewer than 4 bytes: cannot be "true".
simdjson_warn_unused
simdjson_inline bool is_valid_true_atom(const uint8_t *src, size_t len) {
  if (len > 4) { return is_valid_true_atom(src); }
  else if (len == 4) { return !str4ncmp(src, "true"); }
  else { return false; }
}
|
|
|
|
// True when src[1..4] is "alse" and the byte right after it (src[5]) is a
// structural or whitespace character. src[0] is not examined here —
// presumably the caller has already dispatched on the leading 'f'.
simdjson_warn_unused
simdjson_inline bool is_valid_false_atom(const uint8_t *src) {
  return (str4ncmp(src+1, "alse") | jsoncharutils::is_not_structural_or_whitespace(src[5])) == 0;
}

// Length-aware variant: len is the number of bytes available at src.
//  - more than 5 bytes: the terminator at src[5] is checked as well;
//  - exactly 5 bytes: only src[1..4] is compared with "alse";
//  - fewer than 5 bytes: cannot be "false".
simdjson_warn_unused
simdjson_inline bool is_valid_false_atom(const uint8_t *src, size_t len) {
  if (len > 5) { return is_valid_false_atom(src); }
  else if (len == 5) { return !str4ncmp(src+1, "alse"); }
  else { return false; }
}
|
|
|
|
// True when src starts with "null" and the byte right after it (src[4]) is a
// structural or whitespace character. Reads src[0..4].
simdjson_warn_unused
simdjson_inline bool is_valid_null_atom(const uint8_t *src) {
  return (str4ncmp(src, "null") | jsoncharutils::is_not_structural_or_whitespace(src[4])) == 0;
}

// Length-aware variant: len is the number of bytes available at src.
//  - more than 4 bytes: the terminator at src[4] is checked as well;
//  - exactly 4 bytes: only the four letters are compared;
//  - fewer than 4 bytes: cannot be "null".
simdjson_warn_unused
simdjson_inline bool is_valid_null_atom(const uint8_t *src, size_t len) {
  if (len > 4) { return is_valid_null_atom(src); }
  else if (len == 4) { return !str4ncmp(src, "null"); }
  else { return false; }
}
|
|
|
|
} // namespace atomparsing
|
|
} // unnamed namespace
|
|
} // namespace lsx
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_GENERIC_ATOMPARSING_H
|
|
/* end file simdjson/generic/atomparsing.h for lsx */
|
|
/* including simdjson/generic/dom_parser_implementation.h for lsx: #include "simdjson/generic/dom_parser_implementation.h" */
|
|
/* begin file simdjson/generic/dom_parser_implementation.h for lsx */
|
|
#ifndef SIMDJSON_GENERIC_DOM_PARSER_IMPLEMENTATION_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_GENERIC_DOM_PARSER_IMPLEMENTATION_H */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/generic/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/dom_parser_implementation.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace lsx {
|
|
|
|
// One entry of the stack of currently open arrays/objects.
// expectation: sizeof(open_container) = 64/8.
struct open_container {
  uint32_t tape_index; // where, on the tape, the scope ([,{) begins
  uint32_t count;      // how many elements in the scope
}; // struct open_container

static_assert(sizeof(open_container) == 64/8, "Open container must be 64 bits");
|
|
|
|
class dom_parser_implementation final : public internal::dom_parser_implementation {
|
|
public:
|
|
/** Tape location of each open { or [ */
|
|
std::unique_ptr<open_container[]> open_containers{};
|
|
/** Whether each open container is a [ or { */
|
|
std::unique_ptr<bool[]> is_array{};
|
|
/** Buffer passed to stage 1 */
|
|
const uint8_t *buf{};
|
|
/** Length passed to stage 1 */
|
|
size_t len{0};
|
|
/** Document passed to stage 2 */
|
|
dom::document *doc{};
|
|
|
|
inline dom_parser_implementation() noexcept;
|
|
inline dom_parser_implementation(dom_parser_implementation &&other) noexcept;
|
|
inline dom_parser_implementation &operator=(dom_parser_implementation &&other) noexcept;
|
|
dom_parser_implementation(const dom_parser_implementation &) = delete;
|
|
dom_parser_implementation &operator=(const dom_parser_implementation &) = delete;
|
|
|
|
simdjson_warn_unused error_code parse(const uint8_t *buf, size_t len, dom::document &doc) noexcept final;
|
|
simdjson_warn_unused error_code stage1(const uint8_t *buf, size_t len, stage1_mode partial) noexcept final;
|
|
simdjson_warn_unused error_code stage2(dom::document &doc) noexcept final;
|
|
simdjson_warn_unused error_code stage2_next(dom::document &doc) noexcept final;
|
|
simdjson_warn_unused uint8_t *parse_string(const uint8_t *src, uint8_t *dst, bool allow_replacement) const noexcept final;
|
|
simdjson_warn_unused uint8_t *parse_wobbly_string(const uint8_t *src, uint8_t *dst) const noexcept final;
|
|
inline simdjson_warn_unused error_code set_capacity(size_t capacity) noexcept final;
|
|
inline simdjson_warn_unused error_code set_max_depth(size_t max_depth) noexcept final;
|
|
private:
|
|
simdjson_inline simdjson_warn_unused error_code set_capacity_stage1(size_t capacity);
|
|
|
|
};
|
|
|
|
} // namespace lsx
|
|
} // namespace simdjson
|
|
|
|
namespace simdjson {
|
|
namespace lsx {
|
|
|
|
// Out-of-class defaulted special members: default construction, move
// construction and move assignment all use the compiler-generated behaviour.
inline dom_parser_implementation::dom_parser_implementation() noexcept = default;
inline dom_parser_implementation::dom_parser_implementation(dom_parser_implementation &&other) noexcept = default;
inline dom_parser_implementation &dom_parser_implementation::operator=(dom_parser_implementation &&other) noexcept = default;
|
|
|
|
// Leaving these here so they can be inlined if so desired
|
|
inline simdjson_warn_unused error_code dom_parser_implementation::set_capacity(size_t capacity) noexcept {
|
|
if(capacity > SIMDJSON_MAXSIZE_BYTES) { return CAPACITY; }
|
|
// Stage 1 index output
|
|
size_t max_structures = SIMDJSON_ROUNDUP_N(capacity, 64) + 2 + 7;
|
|
structural_indexes.reset( new (std::nothrow) uint32_t[max_structures] );
|
|
if (!structural_indexes) { _capacity = 0; return MEMALLOC; }
|
|
structural_indexes[0] = 0;
|
|
n_structural_indexes = 0;
|
|
|
|
_capacity = capacity;
|
|
return SUCCESS;
|
|
}
|
|
|
|
// Allocates the two stage 2 stacks (open_containers and is_array) with
// `max_depth` entries each.
// Returns MEMALLOC when either allocation fails (the recorded maximum depth is
// then reset to 0) and SUCCESS otherwise. Previously held stacks are released.
inline simdjson_warn_unused error_code dom_parser_implementation::set_max_depth(size_t max_depth) noexcept {
  // Stage 2 stacks
  open_containers.reset(new (std::nothrow) open_container[max_depth]);
  is_array.reset(new (std::nothrow) bool[max_depth]);
  if (!is_array || !open_containers) { _max_depth = 0; return MEMALLOC; }

  _max_depth = max_depth;
  return SUCCESS;
}
|
|
|
|
} // namespace lsx
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_GENERIC_DOM_PARSER_IMPLEMENTATION_H
|
|
/* end file simdjson/generic/dom_parser_implementation.h for lsx */
|
|
/* including simdjson/generic/implementation_simdjson_result_base.h for lsx: #include "simdjson/generic/implementation_simdjson_result_base.h" */
|
|
/* begin file simdjson/generic/implementation_simdjson_result_base.h for lsx */
|
|
#ifndef SIMDJSON_GENERIC_IMPLEMENTATION_SIMDJSON_RESULT_BASE_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_GENERIC_IMPLEMENTATION_SIMDJSON_RESULT_BASE_H */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/generic/base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace lsx {
|
|
|
|
// This is a near copy of include/error.h's implementation_simdjson_result_base, except it doesn't use std::pair
|
|
// so we can avoid inlining errors
|
|
// TODO reconcile these!
|
|
/**
|
|
* The result of a simdjson operation that could fail.
|
|
*
|
|
* Gives the option of reading error codes, or throwing an exception by casting to the desired result.
|
|
*
|
|
* This is a base class for implementations that want to add functions to the result type for
|
|
* chaining.
|
|
*
|
|
* Override like:
|
|
*
|
|
* struct simdjson_result<T> : public internal::implementation_simdjson_result_base<T> {
|
|
* simdjson_result() noexcept : internal::implementation_simdjson_result_base<T>() {}
|
|
* simdjson_result(error_code error) noexcept : internal::implementation_simdjson_result_base<T>(error) {}
|
|
* simdjson_result(T &&value) noexcept : internal::implementation_simdjson_result_base<T>(std::forward(value)) {}
|
|
* simdjson_result(T &&value, error_code error) noexcept : internal::implementation_simdjson_result_base<T>(value, error) {}
|
|
* // Your extra methods here
|
|
* }
|
|
*
|
|
* Then any method returning simdjson_result<T> will be chainable with your methods.
|
|
*/
|
|
template<typename T>
|
|
struct implementation_simdjson_result_base {
|
|
|
|
/**
|
|
* Create a new empty result with error = UNINITIALIZED.
|
|
*/
|
|
simdjson_inline implementation_simdjson_result_base() noexcept = default;
|
|
|
|
/**
|
|
* Create a new error result.
|
|
*/
|
|
simdjson_inline implementation_simdjson_result_base(error_code error) noexcept;
|
|
|
|
/**
|
|
* Create a new successful result.
|
|
*/
|
|
simdjson_inline implementation_simdjson_result_base(T &&value) noexcept;
|
|
|
|
/**
|
|
* Create a new result with both things (use if you don't want to branch when creating the result).
|
|
*/
|
|
simdjson_inline implementation_simdjson_result_base(T &&value, error_code error) noexcept;
|
|
|
|
/**
|
|
* Move the value and the error to the provided variables.
|
|
*
|
|
* @param value The variable to assign the value to. May not be set if there is an error.
|
|
* @param error The variable to assign the error to. Set to SUCCESS if there is no error.
|
|
*/
|
|
simdjson_inline void tie(T &value, error_code &error) && noexcept;
|
|
|
|
/**
|
|
* Move the value to the provided variable.
|
|
*
|
|
* @param value The variable to assign the value to. May not be set if there is an error.
|
|
*/
|
|
simdjson_inline error_code get(T &value) && noexcept;
|
|
|
|
/**
|
|
* The error.
|
|
*/
|
|
simdjson_inline error_code error() const noexcept;
|
|
|
|
#if SIMDJSON_EXCEPTIONS
|
|
|
|
/**
|
|
* Get the result value.
|
|
*
|
|
* @throw simdjson_error if there was an error.
|
|
*/
|
|
simdjson_inline T& value() & noexcept(false);
|
|
|
|
/**
|
|
* Take the result value (move it).
|
|
*
|
|
* @throw simdjson_error if there was an error.
|
|
*/
|
|
simdjson_inline T&& value() && noexcept(false);
|
|
|
|
/**
|
|
* Take the result value (move it).
|
|
*
|
|
* @throw simdjson_error if there was an error.
|
|
*/
|
|
simdjson_inline T&& take_value() && noexcept(false);
|
|
|
|
/**
|
|
* Cast to the value (will throw on error).
|
|
*
|
|
* @throw simdjson_error if there was an error.
|
|
*/
|
|
simdjson_inline operator T&&() && noexcept(false);
|
|
|
|
|
|
#endif // SIMDJSON_EXCEPTIONS
|
|
|
|
/**
|
|
* Get the result value. This function is safe if and only
|
|
* the error() method returns a value that evaluates to false.
|
|
*/
|
|
simdjson_inline const T& value_unsafe() const& noexcept;
|
|
/**
|
|
* Get the result value. This function is safe if and only
|
|
* the error() method returns a value that evaluates to false.
|
|
*/
|
|
simdjson_inline T& value_unsafe() & noexcept;
|
|
/**
|
|
* Take the result value (move it). This function is safe if and only
|
|
* the error() method returns a value that evaluates to false.
|
|
*/
|
|
simdjson_inline T&& value_unsafe() && noexcept;
|
|
protected:
|
|
/** users should never directly access first and second. **/
|
|
T first{}; /** Users should never directly access 'first'. **/
|
|
error_code second{UNINITIALIZED}; /** Users should never directly access 'second'. **/
|
|
}; // struct implementation_simdjson_result_base
|
|
|
|
} // namespace lsx
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_GENERIC_IMPLEMENTATION_SIMDJSON_RESULT_BASE_H
|
|
/* end file simdjson/generic/implementation_simdjson_result_base.h for lsx */
|
|
/* including simdjson/generic/numberparsing.h for lsx: #include "simdjson/generic/numberparsing.h" */
|
|
/* begin file simdjson/generic/numberparsing.h for lsx */
|
|
#ifndef SIMDJSON_GENERIC_NUMBERPARSING_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_GENERIC_NUMBERPARSING_H */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/generic/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/generic/jsoncharutils.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/numberparsing_tables.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
#include <limits>
|
|
#include <ostream>
|
|
#include <cstring>
|
|
|
|
namespace simdjson {
|
|
namespace lsx {
|
|
namespace numberparsing {
|
|
|
|
// Reporting/writing hooks used by the number parser.
// With JSON_TEST_NUMBERS defined, every macro first calls a found_*() hook
// (not defined in this section; presumably supplied by the test harness) and
// then performs the regular action. Without it, the macros reduce to the bare
// error code or the bare writer call and SRC is ignored.
#ifdef JSON_TEST_NUMBERS
#define INVALID_NUMBER(SRC) (found_invalid_number((SRC)), NUMBER_ERROR)
#define WRITE_INTEGER(VALUE, SRC, WRITER) (found_integer((VALUE), (SRC)), (WRITER).append_s64((VALUE)))
#define WRITE_UNSIGNED(VALUE, SRC, WRITER) (found_unsigned_integer((VALUE), (SRC)), (WRITER).append_u64((VALUE)))
#define WRITE_DOUBLE(VALUE, SRC, WRITER) (found_float((VALUE), (SRC)), (WRITER).append_double((VALUE)))
#define BIGINT_NUMBER(SRC) (found_invalid_number((SRC)), BIGINT_ERROR)
#else
#define INVALID_NUMBER(SRC) (NUMBER_ERROR)
#define WRITE_INTEGER(VALUE, SRC, WRITER) (WRITER).append_s64((VALUE))
#define WRITE_UNSIGNED(VALUE, SRC, WRITER) (WRITER).append_u64((VALUE))
#define WRITE_DOUBLE(VALUE, SRC, WRITER) (WRITER).append_double((VALUE))
#define BIGINT_NUMBER(SRC) (BIGINT_ERROR)
#endif
|
|
|
|
namespace {
|
|
|
|
// Convert a mantissa, an exponent and a sign bit into an ieee64 double.
// The real_exponent needs to be in [0, 2046] (technically real_exponent = 2047 would be acceptable).
// The mantissa should be in [0,1<<53). The bit at index (1ULL << 52) will be zeroed.
//
// Layout assembled below: bits 0..51 take the mantissa (with bit 52 cleared),
// real_exponent is shifted in starting at bit 52, and bit 63 is the sign.
// The bit pattern is moved into the double with memcpy to avoid type punning
// through pointers.
simdjson_inline double to_double(uint64_t mantissa, uint64_t real_exponent, bool negative) {
  uint64_t bits = mantissa & ~(1ULL << 52);
  bits |= real_exponent << 52;
  bits |= ((static_cast<uint64_t>(negative)) << 63);
  double d;
  std::memcpy(&d, &bits, sizeof(d));
  return d;
}
|
|
|
|
// Attempts to compute i * 10^(power) exactly; and if "negative" is
|
|
// true, negate the result.
|
|
// This function will only work in some cases, when it does not work, success is
|
|
// set to false. This should work *most of the time* (like 99% of the time).
|
|
// We assume that power is in the [smallest_power,
|
|
// largest_power] interval: the caller is responsible for this check.
|
|
simdjson_inline bool compute_float_64(int64_t power, uint64_t i, bool negative, double &d) {
|
|
// we start with a fast path
|
|
// It was described in
|
|
// Clinger WD. How to read floating point numbers accurately.
|
|
// ACM SIGPLAN Notices. 1990
|
|
#ifndef FLT_EVAL_METHOD
|
|
#error "FLT_EVAL_METHOD should be defined, please include cfloat."
|
|
#endif
|
|
#if (FLT_EVAL_METHOD != 1) && (FLT_EVAL_METHOD != 0)
|
|
// We cannot be certain that x/y is rounded to nearest.
|
|
if (0 <= power && power <= 22 && i <= 9007199254740991)
|
|
#else
|
|
if (-22 <= power && power <= 22 && i <= 9007199254740991)
|
|
#endif
|
|
{
|
|
// convert the integer into a double. This is lossless since
|
|
// 0 <= i <= 2^53 - 1.
|
|
d = double(i);
|
|
//
|
|
// The general idea is as follows.
|
|
// If 0 <= s < 2^53 and if 10^0 <= p <= 10^22 then
|
|
// 1) Both s and p can be represented exactly as 64-bit floating-point
|
|
// values
|
|
// (binary64).
|
|
// 2) Because s and p can be represented exactly as floating-point values,
|
|
// then s * p
|
|
// and s / p will produce correctly rounded values.
|
|
//
|
|
if (power < 0) {
|
|
d = d / simdjson::internal::power_of_ten[-power];
|
|
} else {
|
|
d = d * simdjson::internal::power_of_ten[power];
|
|
}
|
|
if (negative) {
|
|
d = -d;
|
|
}
|
|
return true;
|
|
}
|
|
// When 22 < power && power < 22 + 16, we could
|
|
// hope for another, secondary fast path. It was
|
|
// described by David M. Gay in "Correctly rounded
|
|
// binary-decimal and decimal-binary conversions." (1990)
|
|
// If you need to compute i * 10^(22 + x) for x < 16,
|
|
// first compute i * 10^x, if you know that result is exact
|
|
// (e.g., when i * 10^x < 2^53),
|
|
// then you can still proceed and do (i * 10^x) * 10^22.
|
|
// Is this worth your time?
|
|
// You need 22 < power *and* power < 22 + 16 *and* (i * 10^(x-22) < 2^53)
|
|
// for this second fast path to work.
|
|
// If you you have 22 < power *and* power < 22 + 16, and then you
|
|
// optimistically compute "i * 10^(x-22)", there is still a chance that you
|
|
// have wasted your time if i * 10^(x-22) >= 2^53. It makes the use cases of
|
|
// this optimization maybe less common than we would like. Source:
|
|
// http://www.exploringbinary.com/fast-path-decimal-to-floating-point-conversion/
|
|
// also used in RapidJSON: https://rapidjson.org/strtod_8h_source.html
|
|
|
|
// The fast path has now failed, so we are failing back on the slower path.
|
|
|
|
// In the slow path, we need to adjust i so that it is > 1<<63 which is always
|
|
// possible, except if i == 0, so we handle i == 0 separately.
|
|
if(i == 0) {
|
|
d = negative ? -0.0 : 0.0;
|
|
return true;
|
|
}
|
|
|
|
|
|
// The exponent is 1024 + 63 + power
|
|
// + floor(log(5**power)/log(2)).
|
|
// The 1024 comes from the ieee64 standard.
|
|
// The 63 comes from the fact that we use a 64-bit word.
|
|
//
|
|
// Computing floor(log(5**power)/log(2)) could be
|
|
// slow. Instead we use a fast function.
|
|
//
|
|
// For power in (-400,350), we have that
|
|
// (((152170 + 65536) * power ) >> 16);
|
|
// is equal to
|
|
// floor(log(5**power)/log(2)) + power when power >= 0
|
|
// and it is equal to
|
|
// ceil(log(5**-power)/log(2)) + power when power < 0
|
|
//
|
|
// The 65536 is (1<<16) and corresponds to
|
|
// (65536 * power) >> 16 ---> power
|
|
//
|
|
// ((152170 * power ) >> 16) is equal to
|
|
// floor(log(5**power)/log(2))
|
|
//
|
|
// Note that this is not magic: 152170/(1<<16) is
|
|
// approximatively equal to log(5)/log(2).
|
|
// The 1<<16 value is a power of two; we could use a
|
|
// larger power of 2 if we wanted to.
|
|
//
|
|
int64_t exponent = (((152170 + 65536) * power) >> 16) + 1024 + 63;
|
|
|
|
|
|
// We want the most significant bit of i to be 1. Shift if needed.
|
|
int lz = leading_zeroes(i);
|
|
i <<= lz;
|
|
|
|
|
|
// We are going to need to do some 64-bit arithmetic to get a precise product.
|
|
// We use a table lookup approach.
|
|
// It is safe because
|
|
// power >= smallest_power
|
|
// and power <= largest_power
|
|
// We recover the mantissa of the power, it has a leading 1. It is always
|
|
// rounded down.
|
|
//
|
|
// We want the most significant 64 bits of the product. We know
|
|
// this will be non-zero because the most significant bit of i is
|
|
// 1.
|
|
const uint32_t index = 2 * uint32_t(power - simdjson::internal::smallest_power);
|
|
// Optimization: It may be that materializing the index as a variable might confuse some compilers and prevent effective complex-addressing loads. (Done for code clarity.)
|
|
//
|
|
// The full_multiplication function computes the 128-bit product of two 64-bit words
|
|
// with a returned value of type value128 with a "low component" corresponding to the
|
|
// 64-bit least significant bits of the product and with a "high component" corresponding
|
|
// to the 64-bit most significant bits of the product.
|
|
simdjson::internal::value128 firstproduct = full_multiplication(i, simdjson::internal::power_of_five_128[index]);
|
|
// Both i and power_of_five_128[index] have their most significant bit set to 1 which
|
|
// implies that the either the most or the second most significant bit of the product
|
|
// is 1. We pack values in this manner for efficiency reasons: it maximizes the use
|
|
// we make of the product. It also makes it easy to reason about the product: there
|
|
// is 0 or 1 leading zero in the product.
|
|
|
|
// Unless the least significant 9 bits of the high (64-bit) part of the full
|
|
// product are all 1s, then we know that the most significant 55 bits are
|
|
// exact and no further work is needed. Having 55 bits is necessary because
|
|
// we need 53 bits for the mantissa but we have to have one rounding bit and
|
|
// we can waste a bit if the most significant bit of the product is zero.
|
|
if((firstproduct.high & 0x1FF) == 0x1FF) {
|
|
// We want to compute i * 5^q, but only care about the top 55 bits at most.
|
|
// Consider the scenario where q>=0. Then 5^q may not fit in 64-bits. Doing
|
|
// the full computation is wasteful. So we do what is called a "truncated
|
|
// multiplication".
|
|
// We take the most significant 64-bits, and we put them in
|
|
// power_of_five_128[index]. Usually, that's good enough to approximate i * 5^q
|
|
// to the desired approximation using one multiplication. Sometimes it does not suffice.
|
|
// Then we store the next most significant 64 bits in power_of_five_128[index + 1], and
|
|
// then we get a better approximation to i * 5^q.
|
|
//
|
|
// That's for when q>=0. The logic for q<0 is somewhat similar but it is somewhat
|
|
// more complicated.
|
|
//
|
|
// There is an extra layer of complexity in that we need more than 55 bits of
|
|
// accuracy in the round-to-even scenario.
|
|
//
|
|
// The full_multiplication function computes the 128-bit product of two 64-bit words
|
|
// with a returned value of type value128 with a "low component" corresponding to the
|
|
// 64-bit least significant bits of the product and with a "high component" corresponding
|
|
// to the 64-bit most significant bits of the product.
|
|
simdjson::internal::value128 secondproduct = full_multiplication(i, simdjson::internal::power_of_five_128[index + 1]);
|
|
firstproduct.low += secondproduct.high;
|
|
if(secondproduct.high > firstproduct.low) { firstproduct.high++; }
|
|
// As it has been proven by Noble Mushtak and Daniel Lemire in "Fast Number Parsing Without
|
|
// Fallback" (https://arxiv.org/abs/2212.06644), at this point we are sure that the product
|
|
// is sufficiently accurate, and more computation is not needed.
|
|
}
|
|
uint64_t lower = firstproduct.low;
|
|
uint64_t upper = firstproduct.high;
|
|
// The final mantissa should be 53 bits with a leading 1.
|
|
// We shift it so that it occupies 54 bits with a leading 1.
|
|
///////
|
|
uint64_t upperbit = upper >> 63;
|
|
uint64_t mantissa = upper >> (upperbit + 9);
|
|
lz += int(1 ^ upperbit);
|
|
|
|
// Here we have mantissa < (1<<54).
|
|
int64_t real_exponent = exponent - lz;
|
|
if (simdjson_unlikely(real_exponent <= 0)) { // we have a subnormal?
|
|
// Here have that real_exponent <= 0 so -real_exponent >= 0
|
|
if(-real_exponent + 1 >= 64) { // if we have more than 64 bits below the minimum exponent, you have a zero for sure.
|
|
d = negative ? -0.0 : 0.0;
|
|
return true;
|
|
}
|
|
// next line is safe because -real_exponent + 1 < 0
|
|
mantissa >>= -real_exponent + 1;
|
|
// Thankfully, we can't have both "round-to-even" and subnormals because
|
|
// "round-to-even" only occurs for powers close to 0.
|
|
mantissa += (mantissa & 1); // round up
|
|
mantissa >>= 1;
|
|
// There is a weird scenario where we don't have a subnormal but just.
|
|
// Suppose we start with 2.2250738585072013e-308, we end up
|
|
// with 0x3fffffffffffff x 2^-1023-53 which is technically subnormal
|
|
// whereas 0x40000000000000 x 2^-1023-53 is normal. Now, we need to round
|
|
// up 0x3fffffffffffff x 2^-1023-53 and once we do, we are no longer
|
|
// subnormal, but we can only know this after rounding.
|
|
// So we only declare a subnormal if we are smaller than the threshold.
|
|
real_exponent = (mantissa < (uint64_t(1) << 52)) ? 0 : 1;
|
|
d = to_double(mantissa, real_exponent, negative);
|
|
return true;
|
|
}
|
|
// We have to round to even. The "to even" part
|
|
// is only a problem when we are right in between two floats
|
|
// which we guard against.
|
|
// If we have lots of trailing zeros, we may fall right between two
|
|
// floating-point values.
|
|
//
|
|
// The round-to-even cases take the form of a number 2m+1 which is in (2^53,2^54]
|
|
// times a power of two. That is, it is right between a number with binary significand
|
|
// m and another number with binary significand m+1; and it must be the case
|
|
// that it cannot be represented by a float itself.
|
|
//
|
|
// We must have that w * 10 ^q == (2m+1) * 2^p for some power of two 2^p.
|
|
// Recall that 10^q = 5^q * 2^q.
|
|
// When q >= 0, we must have that (2m+1) is divisible by 5^q, so 5^q <= 2^54. We have that
|
|
// 5^23 <= 2^54 and it is the last power of five to qualify, so q <= 23.
|
|
// When q<0, we have w >= (2m+1) x 5^{-q}. We must have that w<2^{64} so
|
|
// (2m+1) x 5^{-q} < 2^{64}. We have that 2m+1>2^{53}. Hence, we must have
|
|
// 2^{53} x 5^{-q} < 2^{64}.
|
|
// Hence we have 5^{-q} < 2^{11} or q >= -4.
|
|
//
|
|
// We require lower <= 1 and not lower == 0 because we could not prove that
|
|
// that lower == 0 is implied; but we could prove that lower <= 1 is a necessary and sufficient test.
|
|
if (simdjson_unlikely((lower <= 1) && (power >= -4) && (power <= 23) && ((mantissa & 3) == 1))) {
|
|
if((mantissa << (upperbit + 64 - 53 - 2)) == upper) {
|
|
mantissa &= ~1; // flip it so that we do not round up
|
|
}
|
|
}
|
|
|
|
mantissa += mantissa & 1;
|
|
mantissa >>= 1;
|
|
|
|
// Here we have mantissa < (1<<53), unless there was an overflow
|
|
if (mantissa >= (1ULL << 53)) {
|
|
//////////
|
|
// This will happen when parsing values such as 7.2057594037927933e+16
|
|
////////
|
|
mantissa = (1ULL << 52);
|
|
real_exponent++;
|
|
}
|
|
mantissa &= ~(1ULL << 52);
|
|
// we have to check that real_exponent is in range, otherwise we bail out
|
|
if (simdjson_unlikely(real_exponent > 2046)) {
|
|
// We have an infinite value!!! We could actually throw an error here if we could.
|
|
return false;
|
|
}
|
|
d = to_double(mantissa, real_exponent, negative);
|
|
return true;
|
|
}
|
|
|
|
// We call a fallback floating-point parser that might be slow. Note
|
|
// it will accept JSON numbers, but the JSON spec. is more restrictive so
|
|
// before you call parse_float_fallback, you need to have validated the input
|
|
// string with the JSON grammar.
|
|
// It will return an error (false) if the parsed number is infinite.
|
|
// The string parsing itself always succeeds. We know that there is at least
|
|
// one digit.
|
|
static bool parse_float_fallback(const uint8_t *ptr, double *outDouble) {
|
|
*outDouble = simdjson::internal::from_chars(reinterpret_cast<const char *>(ptr));
|
|
// We do not accept infinite values.
|
|
|
|
// Detecting finite values in a portable manner is ridiculously hard, ideally
|
|
// we would want to do:
|
|
// return !std::isfinite(*outDouble);
|
|
// but that mysteriously fails under legacy/old libc++ libraries, see
|
|
// https://github.com/simdjson/simdjson/issues/1286
|
|
//
|
|
// Therefore, fall back to this solution (the extra parens are there
|
|
// to handle that max may be a macro on windows).
|
|
return !(*outDouble > (std::numeric_limits<double>::max)() || *outDouble < std::numeric_limits<double>::lowest());
|
|
}
|
|
|
|
// Slow-path floating-point parser for a number delimited by [ptr, end_ptr).
//
// Same contract as the single-pointer overload, except that the end of the
// input is passed explicitly to simdjson::internal::from_chars. The parsed
// value is stored in *outDouble; the function returns false when that value is
// infinite and true otherwise.
static bool parse_float_fallback(const uint8_t *ptr, const uint8_t *end_ptr, double *outDouble) {
  const char *const first = reinterpret_cast<const char *>(ptr);
  const char *const last = reinterpret_cast<const char *>(end_ptr);
  const double parsed = simdjson::internal::from_chars(first, last);
  *outDouble = parsed;
  // Infinite values are rejected.
  //
  // Detecting finite values portably is ridiculously hard. Ideally we would
  // rely on std::isfinite, but that mysteriously fails under legacy/old libc++
  // libraries, see https://github.com/simdjson/simdjson/issues/1286
  //
  // We therefore compare against the representable range instead. The extra
  // parentheses around "max" are there because max may be a macro on Windows.
  const bool above_range = parsed > (std::numeric_limits<double>::max)();
  const bool below_range = parsed < std::numeric_limits<double>::lowest();
  return !(above_range || below_range);
}
|
|
|
|
// check quickly whether the next 8 chars are made of digits
|
|
// at a glance, it looks better than Mula's
|
|
// http://0x80.pl/articles/swar-digits-validate.html
|
|
simdjson_inline bool is_made_of_eight_digits_fast(const uint8_t *chars) {
|
|
uint64_t val;
|
|
// this can read up to 7 bytes beyond the buffer size, but we require
|
|
// SIMDJSON_PADDING of padding
|
|
static_assert(7 <= SIMDJSON_PADDING, "SIMDJSON_PADDING must be bigger than 7");
|
|
std::memcpy(&val, chars, 8);
|
|
// a branchy method might be faster:
|
|
// return (( val & 0xF0F0F0F0F0F0F0F0 ) == 0x3030303030303030)
|
|
// && (( (val + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0 ) ==
|
|
// 0x3030303030303030);
|
|
return (((val & 0xF0F0F0F0F0F0F0F0) |
|
|
(((val + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0) >> 4)) ==
|
|
0x3333333333333333);
|
|
}
|
|
|
|
template<typename I>
|
|
SIMDJSON_NO_SANITIZE_UNDEFINED // We deliberately allow overflow here and check later
|
|
simdjson_inline bool parse_digit(const uint8_t c, I &i) {
|
|
const uint8_t digit = static_cast<uint8_t>(c - '0');
|
|
if (digit > 9) {
|
|
return false;
|
|
}
|
|
// PERF NOTE: multiplication by 10 is cheaper than arbitrary integer multiplication
|
|
i = 10 * i + digit; // might overflow, we will handle the overflow later
|
|
return true;
|
|
}
|
|
|
|
// Returns true when c is one of the ASCII characters '0' through '9'.
simdjson_inline bool is_digit(const uint8_t c) {
  // Characters below '0' wrap around to large values in the uint8_t cast, so a
  // single comparison covers both ends of the range.
  const uint8_t offset_from_zero = static_cast<uint8_t>(c - '0');
  return offset_from_zero < 10;
}
|
|
|
|
// Parses the digits that follow the decimal separator.
//
// On entry p points just past the '.'. The fractional digits are accumulated
// into i as if the whole number were an integer, p is advanced past them, and
// exponent is set to minus the number of fractional digits. Returns
// INVALID_NUMBER(src) when there is no digit after the separator, SUCCESS
// otherwise.
simdjson_inline error_code parse_decimal_after_separator(simdjson_unused const uint8_t *const src, const uint8_t *&p, uint64_t &i, int64_t &exponent) {
  // We keep up the fiction that we are parsing an integer. As long as the
  // accumulated integer fits in 53 bits, it can later be converted back into a
  // float in a lossless manner.
  const uint8_t *const fraction_start = p;

#ifdef SIMDJSON_SWAR_NUMBER_PARSING
#if SIMDJSON_SWAR_NUMBER_PARSING
  // Consuming 8 digits at once helps when there are lots of decimals,
  // which turns out to be frequent enough.
  if (is_made_of_eight_digits_fast(p)) {
    i = i * 100000000 + parse_eight_digits_unrolled(p);
    p += 8;
  }
#endif // SIMDJSON_SWAR_NUMBER_PARSING
#endif // #ifdef SIMDJSON_SWAR_NUMBER_PARSING
  // Unrolling the first digit makes a small difference on some implementations (e.g. westmere)
  if (parse_digit(*p, i)) { ++p; }
  while (parse_digit(*p, i)) { ++p; }
  exponent = fraction_start - p;
  // A decimal separator without digits (123.) is illegal.
  if (exponent == 0) {
    return INVALID_NUMBER(src);
  }
  return SUCCESS;
}
|
|
|
|
// Parses the exponent part of a number and adds it to `exponent`.
//
// On entry p points just past the 'e'/'E'. An optional sign and the exponent
// digits are consumed, p is advanced past them, and the signed exponent value
// is added to `exponent`. Returns INVALID_NUMBER(src) when no digit follows,
// SUCCESS otherwise.
simdjson_inline error_code parse_exponent(simdjson_unused const uint8_t *const src, const uint8_t *&p, int64_t &exponent) {
  // Exponent sign: -123.456e[-]78
  const bool negative_exponent = ('-' == *p);
  if (negative_exponent || '+' == *p) { p++; } // a leading '+' is skipped as well

  // Exponent digits: -123.456e-[78]
  auto first_digit = p;
  int64_t magnitude = 0;
  while (parse_digit(*p, magnitude)) { ++p; }
  // parse_digit may overflow. In particular, it could overflow to INT64_MIN,
  // and - INT64_MIN cannot be computed. Hence the overflow check *must* happen
  // before the magnitude is negated.

  // Performance notes: it may seem like combining the two simdjson_unlikely
  // checks below into a single simdjson_unlikely path would be faster. The
  // reasoning is sound, but the compiler may not oblige and may, in fact,
  // generate two distinct paths in any case. It might be possible to do
  // uint64_t(p - first_digit - 1) >= 18 but it could end up trading off
  // instructions for a simdjson_likely branch, an inconclusive gain.

  // No digit at all is an error.
  if (simdjson_unlikely(p == first_digit)) {
    return INVALID_NUMBER(src);
  }
  // At this point the magnitude is a valid positive exponent, except that it
  // may have overflowed when there were more than 18 digits.
  if (simdjson_unlikely(p > first_digit+18)) {
    // Leading zeroes do not count: 1e000000000000000000001 is technically
    // valid and does not overflow.
    while (*first_digit == '0') { first_digit++; }
    // 19 digits could overflow int64_t and is kind of absurd anyway. Exponents
    // smaller than -999,999,999,999,999,999 or bigger than
    // 999,999,999,999,999,999 are not supported, so we can truncate.
    // Note that 999999999999999999 is assuredly too large: the maximal ieee64
    // value before infinity is ~1.8e308 and the smallest subnormal is ~5e-324,
    // so we could actually truncate at 324.
    // There is no reason to fail per se at this point in time:
    // e.g., 0e999999999999999999999 is a fine number.
    if (p > first_digit+18) { magnitude = 999999999999999999; }
  }
  // The magnitude is now a sane, positive, signed integer that is
  // <= 999,999,999,999,999,999. As long as 'exponent' is in
  // [-8223372036854775808, 8223372036854775808], the update below cannot
  // overflow. Because 'exponent' is bounded in magnitude by the size of the
  // JSON input, we are fine in this universe.
  if (negative_exponent) {
    exponent -= magnitude;
  } else {
    exponent += magnitude;
  }
  return SUCCESS;
}
|
|
|
|
// Returns true when the text starting at src, limited to max_length bytes,
// is an optional '-' followed by either a single '0' or a run of digits, and
// is then either exhausted or followed by a structural/whitespace character.
simdjson_inline bool check_if_integer(const uint8_t *const src, size_t max_length) {
  const uint8_t *const limit = src + max_length;
  const bool has_minus = (*src == '-'); // we can always read at least one character after the '-'
  const uint8_t *cursor = src + uint8_t(has_minus);
  if (cursor == limit) { return false; }
  if (*cursor == '0') {
    // A leading zero must stand alone.
    ++cursor;
  } else {
    while (cursor != limit && is_digit(*cursor)) { ++cursor; }
  }
  // Either the input is exhausted, or the next character must terminate the number.
  return (cursor == limit) || !jsoncharutils::is_not_structural_or_whitespace(*cursor);
}
|
|
|
|
// Returns digit_count minus the number of leading '0' and '.' characters found
// within the first digit_count characters at start_digits.
//
// This is used to decide whether the accumulated 64-bit integer may have
// overflowed: an input such as 0.0000somenumber has many characters but few
// significant digits. When a '.' appears after the first significant digit it
// is still counted, so the result can exceed the true number of significant
// digits by one.
//
// The scan never goes beyond start_digits + digit_count. Without that bound,
// an input whose digit span is entirely zeros and is immediately followed by
// another '.' or more zeros would keep scanning past the span and make the
// unsigned subtraction below wrap around.
simdjson_inline size_t significant_digits(const uint8_t * start_digits, size_t digit_count) {
  const uint8_t *const end_digits = start_digits + digit_count;
  const uint8_t *start = start_digits;
  while ((start != end_digits) && ((*start == '0') || (*start == '.'))) { ++start; }
  return digit_count - size_t(start - start_digits);
}
|
|
|
|
} // unnamed namespace
|
|
|
|
/** @private */
|
|
static error_code slow_float_parsing(simdjson_unused const uint8_t * src, double* answer) {
|
|
if (parse_float_fallback(src, answer)) {
|
|
return SUCCESS;
|
|
}
|
|
return INVALID_NUMBER(src);
|
|
}
|
|
|
|
/** @private */
|
|
template<typename W>
|
|
simdjson_inline error_code write_float(const uint8_t *const src, bool negative, uint64_t i, const uint8_t * start_digits, size_t digit_count, int64_t exponent, W &writer) {
|
|
// If we frequently had to deal with long strings of digits,
|
|
// we could extend our code by using a 128-bit integer instead
|
|
// of a 64-bit integer. However, this is uncommon in practice.
|
|
//
|
|
// 9999999999999999999 < 2**64 so we can accommodate 19 digits.
|
|
// If we have a decimal separator, then digit_count - 1 is the number of digits, but we
|
|
// may not have a decimal separator!
|
|
if (simdjson_unlikely(digit_count > 19 && significant_digits(start_digits, digit_count) > 19)) {
|
|
// Ok, chances are good that we had an overflow!
|
|
// this is almost never going to get called!!!
|
|
// we start anew, going slowly!!!
|
|
// This will happen in the following examples:
|
|
// 10000000000000000000000000000000000000000000e+308
|
|
// 3.1415926535897932384626433832795028841971693993751
|
|
//
|
|
// NOTE: We do not pass a reference to the to slow_float_parsing. If we passed our writer
|
|
// reference to it, it would force it to be stored in memory, preventing the compiler from
|
|
// picking it apart and putting into registers. i.e. if we pass it as reference,
|
|
// it gets slow.
|
|
double d;
|
|
error_code error = slow_float_parsing(src, &d);
|
|
writer.append_double(d);
|
|
return error;
|
|
}
|
|
// NOTE: it's weird that the simdjson_unlikely() only wraps half the if, but it seems to get slower any other
|
|
// way we've tried: https://github.com/simdjson/simdjson/pull/990#discussion_r448497331
|
|
// To future reader: we'd love if someone found a better way, or at least could explain this result!
|
|
if (simdjson_unlikely(exponent < simdjson::internal::smallest_power) || (exponent > simdjson::internal::largest_power)) {
|
|
//
|
|
// Important: smallest_power is such that it leads to a zero value.
|
|
// Observe that 18446744073709551615e-343 == 0, i.e. (2**64 - 1) e -343 is zero
|
|
// so something x 10^-343 goes to zero, but not so with something x 10^-342.
|
|
static_assert(simdjson::internal::smallest_power <= -342, "smallest_power is not small enough");
|
|
//
|
|
if((exponent < simdjson::internal::smallest_power) || (i == 0)) {
|
|
// E.g. Parse "-0.0e-999" into the same value as "-0.0". See https://en.wikipedia.org/wiki/Signed_zero
|
|
WRITE_DOUBLE(negative ? -0.0 : 0.0, src, writer);
|
|
return SUCCESS;
|
|
} else { // (exponent > largest_power) and (i != 0)
|
|
// We have, for sure, an infinite value and simdjson refuses to parse infinite values.
|
|
return INVALID_NUMBER(src);
|
|
}
|
|
}
|
|
double d;
|
|
if (!compute_float_64(exponent, i, negative, d)) {
|
|
// we are almost never going to get here.
|
|
if (!parse_float_fallback(src, &d)) { return INVALID_NUMBER(src); }
|
|
}
|
|
WRITE_DOUBLE(d, src, writer);
|
|
return SUCCESS;
|
|
}
|
|
|
|
// parse the number at src
|
|
// define JSON_TEST_NUMBERS for unit testing
|
|
//
|
|
// It is assumed that the number is followed by a structural ({,},],[) character
|
|
// or a white space character. If that is not the case (e.g., when the JSON
|
|
// document is made of a single number), then it is necessary to copy the
|
|
// content and append a space before calling this function.
|
|
//
|
|
// Our objective is accurate parsing (ULP of 0) at high speed.
|
|
template<typename W>
|
|
simdjson_inline error_code parse_number(const uint8_t *const src, W &writer);
|
|
|
|
// for performance analysis, it is sometimes useful to skip parsing
|
|
#ifdef SIMDJSON_SKIPNUMBERPARSING
|
|
|
|
template<typename W>
|
|
simdjson_inline error_code parse_number(const uint8_t *const, W &writer) {
|
|
writer.append_s64(0); // always write zero
|
|
return SUCCESS; // always succeeds
|
|
}
|
|
|
|
// Stubs compiled when SIMDJSON_SKIPNUMBERPARSING is defined: each one ignores
// its input and returns a fixed value.

// Always yields 0.
simdjson_unused simdjson_inline simdjson_result<uint64_t> parse_unsigned(const uint8_t * const src) noexcept {
  return 0;
}

// Always yields 0.
simdjson_unused simdjson_inline simdjson_result<int64_t> parse_integer(const uint8_t * const src) noexcept {
  return 0;
}

// Always yields 0.
simdjson_unused simdjson_inline simdjson_result<double> parse_double(const uint8_t * const src) noexcept {
  return 0;
}

// Always yields 0.
simdjson_unused simdjson_inline simdjson_result<uint64_t> parse_unsigned_in_string(const uint8_t * const src) noexcept {
  return 0;
}

// Always yields 0.
simdjson_unused simdjson_inline simdjson_result<int64_t> parse_integer_in_string(const uint8_t * const src) noexcept {
  return 0;
}

// Always yields 0.
simdjson_unused simdjson_inline simdjson_result<double> parse_double_in_string(const uint8_t * const src) noexcept {
  return 0;
}

// Always reports a non-negative number.
simdjson_unused simdjson_inline bool is_negative(const uint8_t * src) noexcept {
  return false;
}

// Always reports "not an integer".
simdjson_unused simdjson_inline simdjson_result<bool> is_integer(const uint8_t * src) noexcept {
  return false;
}

// Always reports a signed integer.
simdjson_unused simdjson_inline simdjson_result<number_type> get_number_type(const uint8_t * src) noexcept {
  return number_type::signed_integer;
}
|
|
#else
|
|
|
|
// parse the number at src
|
|
// define JSON_TEST_NUMBERS for unit testing
|
|
//
|
|
// It is assumed that the number is followed by a structural ({,},],[) character
|
|
// or a white space character. If that is not the case (e.g., when the JSON
|
|
// document is made of a single number), then it is necessary to copy the
|
|
// content and append a space before calling this function.
|
|
//
|
|
// Our objective is accurate parsing (ULP of 0) at high speed.
|
|
template<typename W>
|
|
simdjson_inline error_code parse_number(const uint8_t *const src, W &writer) {
|
|
|
|
//
|
|
// Check for minus sign
|
|
//
|
|
bool negative = (*src == '-');
|
|
const uint8_t *p = src + uint8_t(negative);
|
|
|
|
//
|
|
// Parse the integer part.
|
|
//
|
|
// PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
|
|
const uint8_t *const start_digits = p;
|
|
uint64_t i = 0;
|
|
while (parse_digit(*p, i)) { p++; }
|
|
|
|
// If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
|
|
// Optimization note: size_t is expected to be unsigned.
|
|
size_t digit_count = size_t(p - start_digits);
|
|
if (digit_count == 0 || ('0' == *start_digits && digit_count > 1)) { return INVALID_NUMBER(src); }
|
|
|
|
//
|
|
// Handle floats if there is a . or e (or both)
|
|
//
|
|
int64_t exponent = 0;
|
|
bool is_float = false;
|
|
if ('.' == *p) {
|
|
is_float = true;
|
|
++p;
|
|
SIMDJSON_TRY( parse_decimal_after_separator(src, p, i, exponent) );
|
|
digit_count = int(p - start_digits); // used later to guard against overflows
|
|
}
|
|
if (('e' == *p) || ('E' == *p)) {
|
|
is_float = true;
|
|
++p;
|
|
SIMDJSON_TRY( parse_exponent(src, p, exponent) );
|
|
}
|
|
if (is_float) {
|
|
const bool dirty_end = jsoncharutils::is_not_structural_or_whitespace(*p);
|
|
SIMDJSON_TRY( write_float(src, negative, i, start_digits, digit_count, exponent, writer) );
|
|
if (dirty_end) { return INVALID_NUMBER(src); }
|
|
return SUCCESS;
|
|
}
|
|
|
|
// The longest negative 64-bit number is 19 digits.
|
|
// The longest positive 64-bit number is 20 digits.
|
|
// We do it this way so we don't trigger this branch unless we must.
|
|
size_t longest_digit_count = negative ? 19 : 20;
|
|
if (digit_count > longest_digit_count) { return BIGINT_NUMBER(src); }
|
|
if (digit_count == longest_digit_count) {
|
|
if (negative) {
|
|
// Anything negative above INT64_MAX+1 is invalid
|
|
if (i > uint64_t(INT64_MAX)+1) { return BIGINT_NUMBER(src); }
|
|
WRITE_INTEGER(~i+1, src, writer);
|
|
if (jsoncharutils::is_not_structural_or_whitespace(*p)) { return INVALID_NUMBER(src); }
|
|
return SUCCESS;
|
|
// Positive overflow check:
|
|
// - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the
|
|
// biggest uint64_t.
|
|
// - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX.
|
|
// If we got here, it's a 20 digit number starting with the digit "1".
|
|
// - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller
|
|
// than 1,553,255,926,290,448,384.
|
|
// - That is smaller than the smallest possible 20-digit number the user could write:
|
|
// 10,000,000,000,000,000,000.
|
|
// - Therefore, if the number is positive and lower than that, it's overflow.
|
|
// - The value we are looking at is less than or equal to INT64_MAX.
|
|
//
|
|
} else if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INVALID_NUMBER(src); }
|
|
}
|
|
|
|
// Write unsigned if it does not fit in a signed integer.
|
|
if (i > uint64_t(INT64_MAX)) {
|
|
WRITE_UNSIGNED(i, src, writer);
|
|
} else {
|
|
WRITE_INTEGER(negative ? (~i+1) : i, src, writer);
|
|
}
|
|
if (jsoncharutils::is_not_structural_or_whitespace(*p)) { return INVALID_NUMBER(src); }
|
|
return SUCCESS;
|
|
}
|
|
|
|
// Inlineable functions
|
|
namespace {
|
|
|
|
// This table can be used to characterize the final character of an integer
|
|
// string. For JSON structural character and allowable white space characters,
|
|
// we return SUCCESS. For 'e', '.' and 'E', we return INCORRECT_TYPE. Otherwise
|
|
// we return NUMBER_ERROR.
|
|
// Optimization note: we could easily reduce the size of the table by half (to 128)
|
|
// at the cost of an extra branch.
|
|
// Optimization note: we want the values to use at most 8 bits (not, e.g., 32 bits):
|
|
static_assert(error_code(uint8_t(NUMBER_ERROR))== NUMBER_ERROR, "bad NUMBER_ERROR cast");
|
|
static_assert(error_code(uint8_t(SUCCESS))== SUCCESS, "bad NUMBER_ERROR cast");
|
|
static_assert(error_code(uint8_t(INCORRECT_TYPE))== INCORRECT_TYPE, "bad NUMBER_ERROR cast");
|
|
|
|
const uint8_t integer_string_finisher[256] = {
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, SUCCESS,
|
|
SUCCESS, NUMBER_ERROR, NUMBER_ERROR, SUCCESS, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, SUCCESS, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, SUCCESS,
|
|
NUMBER_ERROR, INCORRECT_TYPE, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, SUCCESS, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, INCORRECT_TYPE,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, SUCCESS, NUMBER_ERROR, SUCCESS, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, INCORRECT_TYPE, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, SUCCESS, NUMBER_ERROR,
|
|
SUCCESS, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR};
|
|
|
|
// Parse any number from 0 to 18,446,744,073,709,551,615
|
|
simdjson_unused simdjson_inline simdjson_result<uint64_t> parse_unsigned(const uint8_t * const src) noexcept {
|
|
const uint8_t *p = src;
|
|
//
|
|
// Parse the integer part.
|
|
//
|
|
// PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
|
|
const uint8_t *const start_digits = p;
|
|
uint64_t i = 0;
|
|
while (parse_digit(*p, i)) { p++; }
|
|
|
|
// If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
|
|
// Optimization note: size_t is expected to be unsigned.
|
|
size_t digit_count = size_t(p - start_digits);
|
|
// The longest positive 64-bit number is 20 digits.
|
|
// We do it this way so we don't trigger this branch unless we must.
|
|
// Optimization note: the compiler can probably merge
|
|
// ((digit_count == 0) || (digit_count > 20))
|
|
// into a single branch since digit_count is unsigned.
|
|
if ((digit_count == 0) || (digit_count > 20)) { return INCORRECT_TYPE; }
|
|
// Here digit_count > 0.
|
|
if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; }
|
|
// We can do the following...
|
|
// if (!jsoncharutils::is_structural_or_whitespace(*p)) {
|
|
// return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR;
|
|
// }
|
|
// as a single table lookup:
|
|
if (integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); }
|
|
|
|
if (digit_count == 20) {
|
|
// Positive overflow check:
|
|
// - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the
|
|
// biggest uint64_t.
|
|
// - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX.
|
|
// If we got here, it's a 20 digit number starting with the digit "1".
|
|
// - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller
|
|
// than 1,553,255,926,290,448,384.
|
|
// - That is smaller than the smallest possible 20-digit number the user could write:
|
|
// 10,000,000,000,000,000,000.
|
|
// - Therefore, if the number is positive and lower than that, it's overflow.
|
|
// - The value we are looking at is less than or equal to INT64_MAX.
|
|
//
|
|
if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INCORRECT_TYPE; }
|
|
}
|
|
|
|
return i;
|
|
}
|
|
|
|
|
|
// Parse any number from 0 to 18,446,744,073,709,551,615
|
|
// Never read at src_end or beyond
|
|
simdjson_unused simdjson_inline simdjson_result<uint64_t> parse_unsigned(const uint8_t * const src, const uint8_t * const src_end) noexcept {
|
|
const uint8_t *p = src;
|
|
//
|
|
// Parse the integer part.
|
|
//
|
|
// PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
|
|
const uint8_t *const start_digits = p;
|
|
uint64_t i = 0;
|
|
while ((p != src_end) && parse_digit(*p, i)) { p++; }
|
|
|
|
// If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
|
|
// Optimization note: size_t is expected to be unsigned.
|
|
size_t digit_count = size_t(p - start_digits);
|
|
// The longest positive 64-bit number is 20 digits.
|
|
// We do it this way so we don't trigger this branch unless we must.
|
|
// Optimization note: the compiler can probably merge
|
|
// ((digit_count == 0) || (digit_count > 20))
|
|
// into a single branch since digit_count is unsigned.
|
|
if ((digit_count == 0) || (digit_count > 20)) { return INCORRECT_TYPE; }
|
|
// Here digit_count > 0.
|
|
if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; }
|
|
// We can do the following...
|
|
// if (!jsoncharutils::is_structural_or_whitespace(*p)) {
|
|
// return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR;
|
|
// }
|
|
// as a single table lookup:
|
|
if ((p != src_end) && integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); }
|
|
|
|
if (digit_count == 20) {
|
|
// Positive overflow check:
|
|
// - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the
|
|
// biggest uint64_t.
|
|
// - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX.
|
|
// If we got here, it's a 20 digit number starting with the digit "1".
|
|
// - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller
|
|
// than 1,553,255,926,290,448,384.
|
|
// - That is smaller than the smallest possible 20-digit number the user could write:
|
|
// 10,000,000,000,000,000,000.
|
|
// - Therefore, if the number is positive and lower than that, it's overflow.
|
|
// - The value we are looking at is less than or equal to INT64_MAX.
|
|
//
|
|
if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INCORRECT_TYPE; }
|
|
}
|
|
|
|
return i;
|
|
}
|
|
|
|
// Parse any number from 0 to 18,446,744,073,709,551,615
|
|
simdjson_unused simdjson_inline simdjson_result<uint64_t> parse_unsigned_in_string(const uint8_t * const src) noexcept {
|
|
const uint8_t *p = src + 1;
|
|
//
|
|
// Parse the integer part.
|
|
//
|
|
// PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
|
|
const uint8_t *const start_digits = p;
|
|
uint64_t i = 0;
|
|
while (parse_digit(*p, i)) { p++; }
|
|
|
|
// If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
|
|
// Optimization note: size_t is expected to be unsigned.
|
|
size_t digit_count = size_t(p - start_digits);
|
|
// The longest positive 64-bit number is 20 digits.
|
|
// We do it this way so we don't trigger this branch unless we must.
|
|
// Optimization note: the compiler can probably merge
|
|
// ((digit_count == 0) || (digit_count > 20))
|
|
// into a single branch since digit_count is unsigned.
|
|
if ((digit_count == 0) || (digit_count > 20)) { return INCORRECT_TYPE; }
|
|
// Here digit_count > 0.
|
|
if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; }
|
|
// We can do the following...
|
|
// if (!jsoncharutils::is_structural_or_whitespace(*p)) {
|
|
// return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR;
|
|
// }
|
|
// as a single table lookup:
|
|
if (*p != '"') { return NUMBER_ERROR; }
|
|
|
|
if (digit_count == 20) {
|
|
// Positive overflow check:
|
|
// - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the
|
|
// biggest uint64_t.
|
|
// - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX.
|
|
// If we got here, it's a 20 digit number starting with the digit "1".
|
|
// - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller
|
|
// than 1,553,255,926,290,448,384.
|
|
// - That is smaller than the smallest possible 20-digit number the user could write:
|
|
// 10,000,000,000,000,000,000.
|
|
// - Therefore, if the number is positive and lower than that, it's overflow.
|
|
// - The value we are looking at is less than or equal to INT64_MAX.
|
|
//
|
|
// Note: we use src[1] and not src[0] because src[0] is the quote character in this
|
|
// instance.
|
|
if (src[1] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INCORRECT_TYPE; }
|
|
}
|
|
|
|
return i;
|
|
}
|
|
|
|
// Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
|
|
simdjson_unused simdjson_inline simdjson_result<int64_t> parse_integer(const uint8_t *src) noexcept {
|
|
//
|
|
// Check for minus sign
|
|
//
|
|
bool negative = (*src == '-');
|
|
const uint8_t *p = src + uint8_t(negative);
|
|
|
|
//
|
|
// Parse the integer part.
|
|
//
|
|
// PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
|
|
const uint8_t *const start_digits = p;
|
|
uint64_t i = 0;
|
|
while (parse_digit(*p, i)) { p++; }
|
|
|
|
// If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
|
|
// Optimization note: size_t is expected to be unsigned.
|
|
size_t digit_count = size_t(p - start_digits);
|
|
// We go from
|
|
// -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
|
|
// so we can never represent numbers that have more than 19 digits.
|
|
size_t longest_digit_count = 19;
|
|
// Optimization note: the compiler can probably merge
|
|
// ((digit_count == 0) || (digit_count > longest_digit_count))
|
|
// into a single branch since digit_count is unsigned.
|
|
if ((digit_count == 0) || (digit_count > longest_digit_count)) { return INCORRECT_TYPE; }
|
|
// Here digit_count > 0.
|
|
if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; }
|
|
// We can do the following...
|
|
// if (!jsoncharutils::is_structural_or_whitespace(*p)) {
|
|
// return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR;
|
|
// }
|
|
// as a single table lookup:
|
|
if(integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); }
|
|
// Negative numbers have can go down to - INT64_MAX - 1 whereas positive numbers are limited to INT64_MAX.
|
|
// Performance note: This check is only needed when digit_count == longest_digit_count but it is
|
|
// so cheap that we might as well always make it.
|
|
if(i > uint64_t(INT64_MAX) + uint64_t(negative)) { return INCORRECT_TYPE; }
|
|
return negative ? (~i+1) : i;
|
|
}
|
|
|
|
// Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
|
|
// Never read at src_end or beyond
|
|
simdjson_unused simdjson_inline simdjson_result<int64_t> parse_integer(const uint8_t * const src, const uint8_t * const src_end) noexcept {
|
|
//
|
|
// Check for minus sign
|
|
//
|
|
if(src == src_end) { return NUMBER_ERROR; }
|
|
bool negative = (*src == '-');
|
|
const uint8_t *p = src + uint8_t(negative);
|
|
|
|
//
|
|
// Parse the integer part.
|
|
//
|
|
// PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
|
|
const uint8_t *const start_digits = p;
|
|
uint64_t i = 0;
|
|
while ((p != src_end) && parse_digit(*p, i)) { p++; }
|
|
|
|
// If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
|
|
// Optimization note: size_t is expected to be unsigned.
|
|
size_t digit_count = size_t(p - start_digits);
|
|
// We go from
|
|
// -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
|
|
// so we can never represent numbers that have more than 19 digits.
|
|
size_t longest_digit_count = 19;
|
|
// Optimization note: the compiler can probably merge
|
|
// ((digit_count == 0) || (digit_count > longest_digit_count))
|
|
// into a single branch since digit_count is unsigned.
|
|
if ((digit_count == 0) || (digit_count > longest_digit_count)) { return INCORRECT_TYPE; }
|
|
// Here digit_count > 0.
|
|
if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; }
|
|
// We can do the following...
|
|
// if (!jsoncharutils::is_structural_or_whitespace(*p)) {
|
|
// return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR;
|
|
// }
|
|
// as a single table lookup:
|
|
if((p != src_end) && integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); }
|
|
// Negative numbers have can go down to - INT64_MAX - 1 whereas positive numbers are limited to INT64_MAX.
|
|
// Performance note: This check is only needed when digit_count == longest_digit_count but it is
|
|
// so cheap that we might as well always make it.
|
|
if(i > uint64_t(INT64_MAX) + uint64_t(negative)) { return INCORRECT_TYPE; }
|
|
return negative ? (~i+1) : i;
|
|
}
|
|
|
|
// Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
|
|
simdjson_unused simdjson_inline simdjson_result<int64_t> parse_integer_in_string(const uint8_t *src) noexcept {
|
|
//
|
|
// Check for minus sign
|
|
//
|
|
bool negative = (*(src + 1) == '-');
|
|
src += uint8_t(negative) + 1;
|
|
|
|
//
|
|
// Parse the integer part.
|
|
//
|
|
// PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
|
|
const uint8_t *const start_digits = src;
|
|
uint64_t i = 0;
|
|
while (parse_digit(*src, i)) { src++; }
|
|
|
|
// If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
|
|
// Optimization note: size_t is expected to be unsigned.
|
|
size_t digit_count = size_t(src - start_digits);
|
|
// We go from
|
|
// -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
|
|
// so we can never represent numbers that have more than 19 digits.
|
|
size_t longest_digit_count = 19;
|
|
// Optimization note: the compiler can probably merge
|
|
// ((digit_count == 0) || (digit_count > longest_digit_count))
|
|
// into a single branch since digit_count is unsigned.
|
|
if ((digit_count == 0) || (digit_count > longest_digit_count)) { return INCORRECT_TYPE; }
|
|
// Here digit_count > 0.
|
|
if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; }
|
|
// We can do the following...
|
|
// if (!jsoncharutils::is_structural_or_whitespace(*src)) {
|
|
// return (*src == '.' || *src == 'e' || *src == 'E') ? INCORRECT_TYPE : NUMBER_ERROR;
|
|
// }
|
|
// as a single table lookup:
|
|
if(*src != '"') { return NUMBER_ERROR; }
|
|
// Negative numbers have can go down to - INT64_MAX - 1 whereas positive numbers are limited to INT64_MAX.
|
|
// Performance note: This check is only needed when digit_count == longest_digit_count but it is
|
|
// so cheap that we might as well always make it.
|
|
if(i > uint64_t(INT64_MAX) + uint64_t(negative)) { return INCORRECT_TYPE; }
|
|
return negative ? (~i+1) : i;
|
|
}
|
|
|
|
// Parse a JSON number starting at `src` as a double.
//
// Returns INCORRECT_TYPE when there is no integer digit; NUMBER_ERROR for a
// leading zero followed by more integer digits, a '.' without digits, an
// exponent with zero or more than 19 digits, a number not followed by a
// structural or whitespace byte, or when the fallback parser fails.
simdjson_unused simdjson_inline simdjson_result<double> parse_double(const uint8_t * src) noexcept {
  // Optional minus sign; afterwards `src` points at the first digit.
  bool is_minus = (*src == '-');
  src += uint8_t(is_minus);

  // Integer part. The first digit is consumed on its own so that a leading
  // zero can be recognised.
  uint64_t mantissa = 0;
  const uint8_t *cursor = src;
  if (parse_digit(*cursor, mantissa)) { ++cursor; }
  const bool starts_with_zero = (mantissa == 0);
  while (parse_digit(*cursor, mantissa)) { ++cursor; }
  // No integer digits at all, or something like 0123 (a zero must stand alone).
  if (cursor == src) { return INCORRECT_TYPE; }
  if (starts_with_zero && (cursor != src + 1)) { return NUMBER_ERROR; }

  // Fractional part.
  int64_t exponent = 0;
  bool may_overflow;
  if (simdjson_likely(*cursor == '.')) {
    ++cursor;
    const uint8_t *const fraction_begin = cursor;
    if (!parse_digit(*cursor, mantissa)) { return NUMBER_ERROR; } // no decimal digits
    ++cursor;
    while (parse_digit(*cursor, mantissa)) { ++cursor; }
    exponent = -(cursor - fraction_begin);

    // More than 19 digits (not counting the '.') may have overflowed the mantissa.
    may_overflow = (cursor - src - 1) > 19;
    if (simdjson_unlikely(may_overflow && starts_with_zero)) {
      // The leading zeros of 0.000... do not count: re-check from the first
      // non-zero fractional digit.
      const uint8_t *significant = src + 2;
      while (*significant == '0') { ++significant; }
      may_overflow = (cursor - significant) > 19;
    }
  } else {
    may_overflow = (cursor - src) > 19;
  }

  // Exponent part.
  if (*cursor == 'e' || *cursor == 'E') {
    ++cursor;
    const bool exp_is_minus = (*cursor == '-');
    if (exp_is_minus || *cursor == '+') { ++cursor; }

    uint64_t exp_value = 0;
    const uint8_t *const exp_begin = cursor;
    while (parse_digit(*cursor, exp_value)) { ++cursor; }
    // No exponent digits, or 20+ exponent digits.
    const auto exp_digits = cursor - exp_begin;
    if (exp_digits == 0 || exp_digits > 19) { return NUMBER_ERROR; }

    exponent += exp_is_minus ? 0-exp_value : exp_value;
  }

  if (jsoncharutils::is_not_structural_or_whitespace(*cursor)) { return NUMBER_ERROR; }

  may_overflow = may_overflow || exponent < simdjson::internal::smallest_power || exponent > simdjson::internal::largest_power;

  // Assemble the double: try compute_float_64 first; when overflow is
  // possible or it reports failure, re-parse the full text (sign included)
  // with parse_float_fallback.
  double d;
  if (simdjson_likely(!may_overflow) && compute_float_64(exponent, mantissa, is_minus, d)) { return d; }
  if (!parse_float_fallback(src - uint8_t(is_minus), &d)) {
    return NUMBER_ERROR;
  }
  return d;
}
|
|
|
|
// Report whether the number text at `src` begins with a minus sign.
// Only the first byte is inspected; the rest of the number is not validated.
simdjson_unused simdjson_inline bool is_negative(const uint8_t * src) noexcept {
  return src[0] == '-';
}
|
|
|
|
// Report whether the number text at `src` is an integer: an optional minus
// sign and a run of decimal digits followed by a structural or whitespace byte.
// Returns NUMBER_ERROR when there are no digits, and false when the digits are
// followed by anything else (such as '.', 'e' or 'E').
simdjson_unused simdjson_inline simdjson_result<bool> is_integer(const uint8_t * src) noexcept {
  // Skip the optional sign.
  const uint8_t *const first_digit = src + uint8_t(*src == '-');

  // Walk over the digit run; the unsigned subtraction maps '0'..'9' to 0..9
  // and everything else to a value above 9.
  const uint8_t *cursor = first_digit;
  while (static_cast<uint8_t>(*cursor - '0') <= 9) { ++cursor; }

  if (cursor == first_digit) { return NUMBER_ERROR; }
  return jsoncharutils::is_structural_or_whitespace(*cursor) ? true : false;
}
|
|
|
|
// Classify the number text at `src` without fully parsing it.
//
// - NUMBER_ERROR when no digit follows the optional minus sign.
// - floating_point_number when the digit run is not followed by a structural
//   or whitespace byte (hopefully '.', 'e' or 'E').
// - signed_integer / unsigned_integer / big_integer for integers, decided by
//   digit count and a lexicographic comparison against the decimal text of
//   2**63 and 2**64.
simdjson_unused simdjson_inline simdjson_result<number_type> get_number_type(const uint8_t * src) noexcept {
  const bool is_minus = (*src == '-');
  const uint8_t *const first_digit = src + uint8_t(is_minus);

  // Walk over the digit run.
  const uint8_t *cursor = first_digit;
  while (static_cast<uint8_t>(*cursor - '0') <= 9) { ++cursor; }
  const size_t num_digits = size_t(cursor - first_digit);
  if (num_digits == 0) { return NUMBER_ERROR; }

  if (!jsoncharutils::is_structural_or_whitespace(*cursor)) {
    // Hopefully, we have 'e' or 'E' or '.'.
    return number_type::floating_point_number;
  }

  // We have an integer. Decimal text of 2**63 (19 digits) and 2**64 (20 digits).
  const char *const int64_limit = "9223372036854775808";
  const char *const uint64_limit = "18446744073709551616";

  if (simdjson_unlikely(num_digits > 20)) {
    return number_type::big_integer;
  }

  if (is_minus) {
    // A valid negative number is a signed integer unless its magnitude
    // exceeds 2**63.
    const bool below_int64_min =
        (num_digits > 19) || (num_digits == 19 && memcmp(first_digit, int64_limit, 19) > 0);
    if (simdjson_unlikely(below_int64_min)) { return number_type::big_integer; }
    return number_type::signed_integer;
  }

  if (num_digits == 20) {
    // 20 digits: at or above 2**64 is a big integer, otherwise it only fits
    // in an unsigned 64-bit integer.
    return (memcmp(first_digit, uint64_limit, 20) >= 0) ? number_type::big_integer
                                                       : number_type::unsigned_integer;
  }

  // Positive and below 2**64: values at or above 2**63 are unsigned
  // integers, everything else is a signed integer.
  if (num_digits == 19 && memcmp(first_digit, int64_limit, 19) >= 0) {
    return number_type::unsigned_integer;
  }
  return number_type::signed_integer;
}
|
|
|
|
// Never read at src_end or beyond
|
|
simdjson_unused simdjson_inline simdjson_result<double> parse_double(const uint8_t * src, const uint8_t * const src_end) noexcept {
|
|
if(src == src_end) { return NUMBER_ERROR; }
|
|
//
|
|
// Check for minus sign
|
|
//
|
|
bool negative = (*src == '-');
|
|
src += uint8_t(negative);
|
|
|
|
//
|
|
// Parse the integer part.
|
|
//
|
|
uint64_t i = 0;
|
|
const uint8_t *p = src;
|
|
if(p == src_end) { return NUMBER_ERROR; }
|
|
p += parse_digit(*p, i);
|
|
bool leading_zero = (i == 0);
|
|
while ((p != src_end) && parse_digit(*p, i)) { p++; }
|
|
// no integer digits, or 0123 (zero must be solo)
|
|
if ( p == src ) { return INCORRECT_TYPE; }
|
|
if ( (leading_zero && p != src+1)) { return NUMBER_ERROR; }
|
|
|
|
//
|
|
// Parse the decimal part.
|
|
//
|
|
int64_t exponent = 0;
|
|
bool overflow;
|
|
if (simdjson_likely((p != src_end) && (*p == '.'))) {
|
|
p++;
|
|
const uint8_t *start_decimal_digits = p;
|
|
if ((p == src_end) || !parse_digit(*p, i)) { return NUMBER_ERROR; } // no decimal digits
|
|
p++;
|
|
while ((p != src_end) && parse_digit(*p, i)) { p++; }
|
|
exponent = -(p - start_decimal_digits);
|
|
|
|
// Overflow check. More than 19 digits (minus the decimal) may be overflow.
|
|
overflow = p-src-1 > 19;
|
|
if (simdjson_unlikely(overflow && leading_zero)) {
|
|
// Skip leading 0.00000 and see if it still overflows
|
|
const uint8_t *start_digits = src + 2;
|
|
while (*start_digits == '0') { start_digits++; }
|
|
overflow = start_digits-src > 19;
|
|
}
|
|
} else {
|
|
overflow = p-src > 19;
|
|
}
|
|
|
|
//
|
|
// Parse the exponent
|
|
//
|
|
if ((p != src_end) && (*p == 'e' || *p == 'E')) {
|
|
p++;
|
|
if(p == src_end) { return NUMBER_ERROR; }
|
|
bool exp_neg = *p == '-';
|
|
p += exp_neg || *p == '+';
|
|
|
|
uint64_t exp = 0;
|
|
const uint8_t *start_exp_digits = p;
|
|
while ((p != src_end) && parse_digit(*p, exp)) { p++; }
|
|
// no exp digits, or 20+ exp digits
|
|
if (p-start_exp_digits == 0 || p-start_exp_digits > 19) { return NUMBER_ERROR; }
|
|
|
|
exponent += exp_neg ? 0-exp : exp;
|
|
}
|
|
|
|
if ((p != src_end) && jsoncharutils::is_not_structural_or_whitespace(*p)) { return NUMBER_ERROR; }
|
|
|
|
overflow = overflow || exponent < simdjson::internal::smallest_power || exponent > simdjson::internal::largest_power;
|
|
|
|
//
|
|
// Assemble (or slow-parse) the float
|
|
//
|
|
double d;
|
|
if (simdjson_likely(!overflow)) {
|
|
if (compute_float_64(exponent, i, negative, d)) { return d; }
|
|
}
|
|
if (!parse_float_fallback(src - uint8_t(negative), src_end, &d)) {
|
|
return NUMBER_ERROR;
|
|
}
|
|
return d;
|
|
}
|
|
|
|
// Parse a quoted JSON number (e.g. "1.5e3") as a double. `src` must point at
// the opening quote.
//
// Returns INCORRECT_TYPE when there is no integer digit; NUMBER_ERROR for a
// leading zero followed by more integer digits, a '.' without digits, an
// exponent with zero or more than 19 digits, a number not followed by the
// closing quote, or when the fallback parser fails.
simdjson_unused simdjson_inline simdjson_result<double> parse_double_in_string(const uint8_t * src) noexcept {
  // The optional minus sign sits right after the opening quote; afterwards
  // `src` points at the first digit.
  bool is_minus = (src[1] == '-');
  src += uint8_t(is_minus) + 1;

  // Integer part. The first digit is consumed on its own so that a leading
  // zero can be recognised.
  uint64_t mantissa = 0;
  const uint8_t *cursor = src;
  if (parse_digit(*cursor, mantissa)) { ++cursor; }
  const bool starts_with_zero = (mantissa == 0);
  while (parse_digit(*cursor, mantissa)) { ++cursor; }
  // No integer digits at all, or something like 0123 (a zero must stand alone).
  if (cursor == src) { return INCORRECT_TYPE; }
  if (starts_with_zero && (cursor != src + 1)) { return NUMBER_ERROR; }

  // Fractional part.
  int64_t exponent = 0;
  bool may_overflow;
  if (simdjson_likely(*cursor == '.')) {
    ++cursor;
    const uint8_t *const fraction_begin = cursor;
    if (!parse_digit(*cursor, mantissa)) { return NUMBER_ERROR; } // no decimal digits
    ++cursor;
    while (parse_digit(*cursor, mantissa)) { ++cursor; }
    exponent = -(cursor - fraction_begin);

    // More than 19 digits (not counting the '.') may have overflowed the mantissa.
    may_overflow = (cursor - src - 1) > 19;
    if (simdjson_unlikely(may_overflow && starts_with_zero)) {
      // The leading zeros of 0.000... do not count: re-check from the first
      // non-zero fractional digit.
      const uint8_t *significant = src + 2;
      while (*significant == '0') { ++significant; }
      may_overflow = (cursor - significant) > 19;
    }
  } else {
    may_overflow = (cursor - src) > 19;
  }

  // Exponent part.
  if (*cursor == 'e' || *cursor == 'E') {
    ++cursor;
    const bool exp_is_minus = (*cursor == '-');
    if (exp_is_minus || *cursor == '+') { ++cursor; }

    uint64_t exp_value = 0;
    const uint8_t *const exp_begin = cursor;
    while (parse_digit(*cursor, exp_value)) { ++cursor; }
    // No exponent digits, or 20+ exponent digits.
    const auto exp_digits = cursor - exp_begin;
    if (exp_digits == 0 || exp_digits > 19) { return NUMBER_ERROR; }

    exponent += exp_is_minus ? 0-exp_value : exp_value;
  }

  // Inside a string the only acceptable terminator is the closing quote.
  if (*cursor != '"') { return NUMBER_ERROR; }

  may_overflow = may_overflow || exponent < simdjson::internal::smallest_power || exponent > simdjson::internal::largest_power;

  // Assemble the double: try compute_float_64 first; when overflow is
  // possible or it reports failure, re-parse the text (sign included) with
  // parse_float_fallback.
  double d;
  if (simdjson_likely(!may_overflow) && compute_float_64(exponent, mantissa, is_minus, d)) { return d; }
  if (!parse_float_fallback(src - uint8_t(is_minus), &d)) {
    return NUMBER_ERROR;
  }
  return d;
}
|
|
|
|
} // unnamed namespace
|
|
#endif // SIMDJSON_SKIPNUMBERPARSING
|
|
|
|
} // namespace numberparsing
|
|
|
|
inline std::ostream& operator<<(std::ostream& out, number_type type) noexcept {
|
|
switch (type) {
|
|
case number_type::signed_integer: out << "integer in [-9223372036854775808,9223372036854775808)"; break;
|
|
case number_type::unsigned_integer: out << "unsigned integer in [9223372036854775808,18446744073709551616)"; break;
|
|
case number_type::floating_point_number: out << "floating-point number (binary64)"; break;
|
|
case number_type::big_integer: out << "big integer"; break;
|
|
default: SIMDJSON_UNREACHABLE();
|
|
}
|
|
return out;
|
|
}
|
|
|
|
} // namespace lsx
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_GENERIC_NUMBERPARSING_H
|
|
/* end file simdjson/generic/numberparsing.h for lsx */
|
|
|
|
/* including simdjson/generic/implementation_simdjson_result_base-inl.h for lsx: #include "simdjson/generic/implementation_simdjson_result_base-inl.h" */
|
|
/* begin file simdjson/generic/implementation_simdjson_result_base-inl.h for lsx */
|
|
#ifndef SIMDJSON_GENERIC_IMPLEMENTATION_SIMDJSON_RESULT_BASE_INL_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_GENERIC_IMPLEMENTATION_SIMDJSON_RESULT_BASE_INL_H */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/generic/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/generic/implementation_simdjson_result_base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace lsx {
|
|
|
|
//
|
|
// internal::implementation_simdjson_result_base<T> inline implementation
|
|
//
|
|
|
|
template<typename T>
|
|
simdjson_inline void implementation_simdjson_result_base<T>::tie(T &value, error_code &error) && noexcept {
|
|
error = this->second;
|
|
if (!error) {
|
|
value = std::forward<implementation_simdjson_result_base<T>>(*this).first;
|
|
}
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_warn_unused simdjson_inline error_code implementation_simdjson_result_base<T>::get(T &value) && noexcept {
|
|
error_code error;
|
|
std::forward<implementation_simdjson_result_base<T>>(*this).tie(value, error);
|
|
return error;
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_inline error_code implementation_simdjson_result_base<T>::error() const noexcept {
|
|
return this->second;
|
|
}
|
|
|
|
#if SIMDJSON_EXCEPTIONS
|
|
|
|
template<typename T>
|
|
simdjson_inline T& implementation_simdjson_result_base<T>::value() & noexcept(false) {
|
|
if (error()) { throw simdjson_error(error()); }
|
|
return this->first;
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_inline T&& implementation_simdjson_result_base<T>::value() && noexcept(false) {
|
|
return std::forward<implementation_simdjson_result_base<T>>(*this).take_value();
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_inline T&& implementation_simdjson_result_base<T>::take_value() && noexcept(false) {
|
|
if (error()) { throw simdjson_error(error()); }
|
|
return std::forward<T>(this->first);
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_inline implementation_simdjson_result_base<T>::operator T&&() && noexcept(false) {
|
|
return std::forward<implementation_simdjson_result_base<T>>(*this).take_value();
|
|
}
|
|
|
|
#endif // SIMDJSON_EXCEPTIONS
|
|
|
|
template<typename T>
|
|
simdjson_inline const T& implementation_simdjson_result_base<T>::value_unsafe() const& noexcept {
|
|
return this->first;
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_inline T& implementation_simdjson_result_base<T>::value_unsafe() & noexcept {
|
|
return this->first;
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_inline T&& implementation_simdjson_result_base<T>::value_unsafe() && noexcept {
|
|
return std::forward<T>(this->first);
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_inline implementation_simdjson_result_base<T>::implementation_simdjson_result_base(T &&value, error_code error) noexcept
|
|
: first{std::forward<T>(value)}, second{error} {}
|
|
template<typename T>
|
|
simdjson_inline implementation_simdjson_result_base<T>::implementation_simdjson_result_base(error_code error) noexcept
|
|
: implementation_simdjson_result_base(T{}, error) {}
|
|
template<typename T>
|
|
simdjson_inline implementation_simdjson_result_base<T>::implementation_simdjson_result_base(T &&value) noexcept
|
|
: implementation_simdjson_result_base(std::forward<T>(value), SUCCESS) {}
|
|
|
|
} // namespace lsx
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_GENERIC_IMPLEMENTATION_SIMDJSON_RESULT_BASE_INL_H
|
|
/* end file simdjson/generic/implementation_simdjson_result_base-inl.h for lsx */
|
|
/* end file simdjson/generic/amalgamated.h for lsx */
|
|
/* including simdjson/lsx/end.h: #include "simdjson/lsx/end.h" */
|
|
/* begin file simdjson/lsx/end.h */
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/lsx/base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
#undef SIMDJSON_SKIP_BACKSLASH_SHORT_CIRCUIT
|
|
/* undefining SIMDJSON_IMPLEMENTATION from "lsx" */
|
|
#undef SIMDJSON_IMPLEMENTATION
|
|
/* end file simdjson/lsx/end.h */
|
|
|
|
#endif // SIMDJSON_LSX_H
|
|
/* end file simdjson/lsx.h */
|
|
/* including simdjson/lsx/implementation.h: #include <simdjson/lsx/implementation.h> */
|
|
/* begin file simdjson/lsx/implementation.h */
|
|
#ifndef SIMDJSON_LSX_IMPLEMENTATION_H
|
|
#define SIMDJSON_LSX_IMPLEMENTATION_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/implementation.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/instruction_set.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace lsx {
|
|
|
|
/**
|
|
* @private
|
|
*/
|
|
class implementation final : public simdjson::implementation {
|
|
public:
|
|
simdjson_inline implementation() : simdjson::implementation("lsx", "LoongArch SX", internal::instruction_set::LSX) {}
|
|
simdjson_warn_unused error_code create_dom_parser_implementation(
|
|
size_t capacity,
|
|
size_t max_length,
|
|
std::unique_ptr<internal::dom_parser_implementation>& dst
|
|
) const noexcept final;
|
|
simdjson_warn_unused error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final;
|
|
simdjson_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final;
|
|
};
|
|
|
|
} // namespace lsx
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_LSX_IMPLEMENTATION_H
|
|
/* end file simdjson/lsx/implementation.h */
|
|
|
|
/* including simdjson/lsx/begin.h: #include <simdjson/lsx/begin.h> */
|
|
/* begin file simdjson/lsx/begin.h */
|
|
/* defining SIMDJSON_IMPLEMENTATION to "lsx" */
|
|
#define SIMDJSON_IMPLEMENTATION lsx
|
|
/* including simdjson/lsx/base.h: #include "simdjson/lsx/base.h" */
|
|
/* begin file simdjson/lsx/base.h */
|
|
#ifndef SIMDJSON_LSX_BASE_H
|
|
#define SIMDJSON_LSX_BASE_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
/**
|
|
* Implementation for LSX.
|
|
*/
|
|
namespace lsx {
|
|
|
|
class implementation;
|
|
|
|
namespace {
|
|
namespace simd {
|
|
template <typename T> struct simd8;
|
|
template <typename T> struct simd8x64;
|
|
} // namespace simd
|
|
} // unnamed namespace
|
|
|
|
} // namespace lsx
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_LSX_BASE_H
|
|
/* end file simdjson/lsx/base.h */
|
|
/* including simdjson/lsx/intrinsics.h: #include "simdjson/lsx/intrinsics.h" */
|
|
/* begin file simdjson/lsx/intrinsics.h */
|
|
#ifndef SIMDJSON_LSX_INTRINSICS_H
|
|
#define SIMDJSON_LSX_INTRINSICS_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/lsx/base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
// This should be the correct header whether
|
|
// you use visual studio or other compilers.
|
|
#include <lsxintrin.h>
|
|
|
|
static_assert(sizeof(__m128i) <= simdjson::SIMDJSON_PADDING, "insufficient padding for LoongArch SX");
|
|
|
|
#endif // SIMDJSON_LSX_INTRINSICS_H
|
|
/* end file simdjson/lsx/intrinsics.h */
|
|
/* including simdjson/lsx/bitmanipulation.h: #include "simdjson/lsx/bitmanipulation.h" */
|
|
/* begin file simdjson/lsx/bitmanipulation.h */
|
|
#ifndef SIMDJSON_LSX_BITMANIPULATION_H
|
|
#define SIMDJSON_LSX_BITMANIPULATION_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/lsx/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/lsx/intrinsics.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/lsx/bitmask.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace lsx {
|
|
namespace {
|
|
|
|
// We sometimes call trailing_zero on inputs that are zero,
|
|
// but the algorithms do not end up using the returned value.
|
|
// Sadly, sanitizers are not smart enough to figure it out.
|
|
SIMDJSON_NO_SANITIZE_UNDEFINED
|
|
// This function can be used safely even if not all bytes have been
|
|
// initialized.
|
|
// See issue https://github.com/simdjson/simdjson/issues/1965
|
|
SIMDJSON_NO_SANITIZE_MEMORY
|
|
simdjson_inline int trailing_zeroes(uint64_t input_num) {
|
|
return __builtin_ctzll(input_num);
|
|
}
|
|
|
|
/* result might be undefined when input_num is zero */
|
|
simdjson_inline uint64_t clear_lowest_bit(uint64_t input_num) {
|
|
return input_num & (input_num-1);
|
|
}
|
|
|
|
/* result might be undefined when input_num is zero */
|
|
simdjson_inline int leading_zeroes(uint64_t input_num) {
|
|
return __builtin_clzll(input_num);
|
|
}
|
|
|
|
/* result might be undefined when input_num is zero */
|
|
simdjson_inline int count_ones(uint64_t input_num) {
|
|
return __lsx_vpickve2gr_w(__lsx_vpcnt_d(__m128i(v2u64{input_num, 0})), 0);
|
|
}
|
|
|
|
// Add value1 and value2 with __builtin_uaddll_overflow: the sum is written
// to *result and the builtin's overflow flag is returned.
simdjson_inline bool add_overflow(uint64_t value1, uint64_t value2, uint64_t *result) {
  // The builtin wants an unsigned long long*, which may be a distinct type
  // from uint64_t*.
  unsigned long long *const sum_out = reinterpret_cast<unsigned long long *>(result);
  return __builtin_uaddll_overflow(value1, value2, sum_out);
}
|
|
|
|
} // unnamed namespace
|
|
} // namespace lsx
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_LSX_BITMANIPULATION_H
|
|
/* end file simdjson/lsx/bitmanipulation.h */
|
|
/* including simdjson/lsx/bitmask.h: #include "simdjson/lsx/bitmask.h" */
|
|
/* begin file simdjson/lsx/bitmask.h */
|
|
#ifndef SIMDJSON_LSX_BITMASK_H
|
|
#define SIMDJSON_LSX_BITMASK_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/lsx/base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace lsx {
|
|
namespace {
|
|
|
|
//
|
|
// Perform a "cumulative bitwise xor," flipping bits each time a 1 is encountered.
|
|
//
|
|
// For example, prefix_xor(00100100) == 00011100
|
|
//
|
|
simdjson_inline uint64_t prefix_xor(uint64_t bitmask) {
|
|
bitmask ^= bitmask << 1;
|
|
bitmask ^= bitmask << 2;
|
|
bitmask ^= bitmask << 4;
|
|
bitmask ^= bitmask << 8;
|
|
bitmask ^= bitmask << 16;
|
|
bitmask ^= bitmask << 32;
|
|
return bitmask;
|
|
}
|
|
|
|
} // unnamed namespace
|
|
} // namespace lsx
|
|
} // namespace simdjson
|
|
|
|
#endif
|
|
/* end file simdjson/lsx/bitmask.h */
|
|
/* including simdjson/lsx/numberparsing_defs.h: #include "simdjson/lsx/numberparsing_defs.h" */
|
|
/* begin file simdjson/lsx/numberparsing_defs.h */
|
|
#ifndef SIMDJSON_LSX_NUMBERPARSING_DEFS_H
|
|
#define SIMDJSON_LSX_NUMBERPARSING_DEFS_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/lsx/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/lsx/intrinsics.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/numberparsing_tables.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
#include <cstring>
|
|
|
|
namespace simdjson {
|
|
namespace lsx {
|
|
namespace numberparsing {
|
|
|
|
// we don't have appropriate instructions, so let us use a scalar function
|
|
// credit: https://johnnylee-sde.github.io/Fast-numeric-string-to-int/
|
|
/** @private */
|
|
static simdjson_inline uint32_t parse_eight_digits_unrolled(const uint8_t *chars) {
|
|
uint64_t val;
|
|
std::memcpy(&val, chars, sizeof(uint64_t));
|
|
val = (val & 0x0F0F0F0F0F0F0F0F) * 2561 >> 8;
|
|
val = (val & 0x00FF00FF00FF00FF) * 6553601 >> 16;
|
|
return uint32_t((val & 0x0000FFFF0000FFFF) * 42949672960001 >> 32);
|
|
}
|
|
|
|
// Multiply two 64-bit values into a full 128-bit product, returned as the
// low and high 64-bit halves of an internal::value128.
simdjson_inline internal::value128 full_multiplication(uint64_t value1, uint64_t value2) {
  // Widen one operand first so the multiplication is carried out in 128 bits.
  const __uint128_t product = static_cast<__uint128_t>(value1) * value2;

  internal::value128 halves;
  halves.low = uint64_t(product);
  halves.high = uint64_t(product >> 64);
  return halves;
}
|
|
|
|
} // namespace numberparsing
|
|
} // namespace lsx
|
|
} // namespace simdjson
|
|
|
|
#define SIMDJSON_SWAR_NUMBER_PARSING 1
|
|
|
|
#endif // SIMDJSON_LSX_NUMBERPARSING_DEFS_H
|
|
/* end file simdjson/lsx/numberparsing_defs.h */
|
|
/* including simdjson/lsx/simd.h: #include "simdjson/lsx/simd.h" */
|
|
/* begin file simdjson/lsx/simd.h */
|
|
#ifndef SIMDJSON_LSX_SIMD_H
|
|
#define SIMDJSON_LSX_SIMD_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/lsx/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/lsx/bitmanipulation.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/simdprune_tables.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace lsx {
|
|
namespace {
|
|
namespace simd {
|
|
|
|
// Forward-declared so they can be used by splat and friends.
|
|
template<typename Child>
|
|
struct base {
|
|
__m128i value;
|
|
|
|
// Zero constructor
|
|
simdjson_inline base() : value{__m128i()} {}
|
|
|
|
// Conversion from SIMD register
|
|
simdjson_inline base(const __m128i _value) : value(_value) {}
|
|
|
|
// Conversion to SIMD register
|
|
simdjson_inline operator const __m128i&() const { return this->value; }
|
|
simdjson_inline operator __m128i&() { return this->value; }
|
|
simdjson_inline operator const v16i8&() const { return (v16i8&)this->value; }
|
|
simdjson_inline operator v16i8&() { return (v16i8&)this->value; }
|
|
|
|
// Bit operations
|
|
simdjson_inline Child operator|(const Child other) const { return __lsx_vor_v(*this, other); }
|
|
simdjson_inline Child operator&(const Child other) const { return __lsx_vand_v(*this, other); }
|
|
simdjson_inline Child operator^(const Child other) const { return __lsx_vxor_v(*this, other); }
|
|
simdjson_inline Child bit_andnot(const Child other) const { return __lsx_vandn_v(other, *this); }
|
|
simdjson_inline Child& operator|=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast | other; return *this_cast; }
|
|
simdjson_inline Child& operator&=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast & other; return *this_cast; }
|
|
simdjson_inline Child& operator^=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast ^ other; return *this_cast; }
|
|
};
|
|
|
|
// Forward-declared so they can be used by splat and friends.
|
|
template<typename T>
|
|
struct simd8;
|
|
|
|
template<typename T, typename Mask=simd8<bool>>
|
|
struct base8: base<simd8<T>> {
|
|
simdjson_inline base8() : base<simd8<T>>() {}
|
|
simdjson_inline base8(const __m128i _value) : base<simd8<T>>(_value) {}
|
|
|
|
friend simdjson_really_inline Mask operator==(const simd8<T> lhs, const simd8<T> rhs) { return __lsx_vseq_b(lhs, rhs); }
|
|
|
|
static const int SIZE = sizeof(base<simd8<T>>::value);
|
|
|
|
template<int N=1>
|
|
simdjson_inline simd8<T> prev(const simd8<T> prev_chunk) const {
|
|
return __lsx_vor_v(__lsx_vbsll_v(*this, N), __lsx_vbsrl_v(prev_chunk, 16 - N));
|
|
}
|
|
};
|
|
|
|
// SIMD byte mask type (returned by things like eq and gt)
|
|
template<>
|
|
struct simd8<bool>: base8<bool> {
|
|
static simdjson_inline simd8<bool> splat(bool _value) {
|
|
return __lsx_vreplgr2vr_b(uint8_t(-(!!_value)));
|
|
}
|
|
|
|
simdjson_inline simd8() : base8() {}
|
|
simdjson_inline simd8(const __m128i _value) : base8<bool>(_value) {}
|
|
// Splat constructor
|
|
simdjson_inline simd8(bool _value) : base8<bool>(splat(_value)) {}
|
|
|
|
simdjson_inline int to_bitmask() const { return __lsx_vpickve2gr_w(__lsx_vmskltz_b(*this), 0); }
|
|
simdjson_inline bool any() const { return 0 == __lsx_vpickve2gr_hu(__lsx_vmsknz_b(*this), 0); }
|
|
simdjson_inline simd8<bool> operator~() const { return *this ^ true; }
|
|
};
|
|
|
|
template<typename T>
|
|
struct base8_numeric: base8<T> {
|
|
static simdjson_inline simd8<T> splat(T _value) { return __lsx_vreplgr2vr_b(_value); }
|
|
static simdjson_inline simd8<T> zero() { return __lsx_vldi(0); }
|
|
static simdjson_inline simd8<T> load(const T values[16]) {
|
|
return __lsx_vld(reinterpret_cast<const __m128i *>(values), 0);
|
|
}
|
|
// Repeat 16 values as many times as necessary (usually for lookup tables)
|
|
static simdjson_inline simd8<T> repeat_16(
|
|
T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7,
|
|
T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15
|
|
) {
|
|
return simd8<T>(
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15
|
|
);
|
|
}
|
|
|
|
simdjson_inline base8_numeric() : base8<T>() {}
|
|
simdjson_inline base8_numeric(const __m128i _value) : base8<T>(_value) {}
|
|
|
|
// Store to array
|
|
simdjson_inline void store(T dst[16]) const {
|
|
return __lsx_vst(*this, reinterpret_cast<__m128i *>(dst), 0);
|
|
}
|
|
|
|
// Addition/subtraction are the same for signed and unsigned
|
|
simdjson_inline simd8<T> operator+(const simd8<T> other) const { return __lsx_vadd_b(*this, other); }
|
|
simdjson_inline simd8<T> operator-(const simd8<T> other) const { return __lsx_vsub_b(*this, other); }
|
|
simdjson_inline simd8<T>& operator+=(const simd8<T> other) { *this = *this + other; return *static_cast<simd8<T>*>(this); }
|
|
simdjson_inline simd8<T>& operator-=(const simd8<T> other) { *this = *this - other; return *static_cast<simd8<T>*>(this); }
|
|
|
|
// Override to distinguish from bool version
|
|
simdjson_inline simd8<T> operator~() const { return *this ^ 0xFFu; }
|
|
|
|
// Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values)
|
|
template<typename L>
|
|
simdjson_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
|
|
return __lsx_vshuf_b(lookup_table, lookup_table, *this);
|
|
}
|
|
|
|
// Copies to 'output" all bytes corresponding to a 0 in the mask (interpreted as a bitset).
|
|
// Passing a 0 value for mask would be equivalent to writing out every byte to output.
|
|
// Only the first 16 - count_ones(mask) bytes of the result are significant but 16 bytes
|
|
// get written.
|
|
template<typename L>
|
|
simdjson_inline void compress(uint16_t mask, L * output) const {
|
|
using internal::thintable_epi8;
|
|
using internal::BitsSetTable256mul2;
|
|
using internal::pshufb_combine_table;
|
|
// this particular implementation was inspired by haswell
|
|
// lsx do it in 2 steps, first 8 bytes and then second 8 bytes...
|
|
uint8_t mask1 = uint8_t(mask); // least significant 8 bits
|
|
uint8_t mask2 = uint8_t(mask >> 8); // second least significant 8 bits
|
|
// next line just loads the 64-bit values thintable_epi8[mask1] and
|
|
// thintable_epi8[mask2] into a 128-bit register.
|
|
__m128i shufmask = {int64_t(thintable_epi8[mask1]), int64_t(thintable_epi8[mask2]) + 0x0808080808080808};
|
|
// this is the version "nearly pruned"
|
|
__m128i pruned = __lsx_vshuf_b(*this, *this, shufmask);
|
|
// we still need to put the pieces back together.
|
|
// we compute the popcount of the first words:
|
|
int pop1 = BitsSetTable256mul2[mask1];
|
|
// then load the corresponding mask
|
|
__m128i compactmask = __lsx_vldx(reinterpret_cast<void*>(reinterpret_cast<unsigned long>(pshufb_combine_table)), pop1 * 8);
|
|
__m128i answer = __lsx_vshuf_b(pruned, pruned, compactmask);
|
|
__lsx_vst(answer, reinterpret_cast<uint8_t*>(output), 0);
|
|
}
|
|
|
|
template<typename L>
|
|
simdjson_inline simd8<L> lookup_16(
|
|
L replace0, L replace1, L replace2, L replace3,
|
|
L replace4, L replace5, L replace6, L replace7,
|
|
L replace8, L replace9, L replace10, L replace11,
|
|
L replace12, L replace13, L replace14, L replace15) const {
|
|
return lookup_16(simd8<L>::repeat_16(
|
|
replace0, replace1, replace2, replace3,
|
|
replace4, replace5, replace6, replace7,
|
|
replace8, replace9, replace10, replace11,
|
|
replace12, replace13, replace14, replace15
|
|
));
|
|
}
|
|
};
|
|
|
|
// Signed bytes
|
|
template<>
|
|
struct simd8<int8_t> : base8_numeric<int8_t> {
|
|
simdjson_inline simd8() : base8_numeric<int8_t>() {}
|
|
simdjson_inline simd8(const __m128i _value) : base8_numeric<int8_t>(_value) {}
|
|
// Splat constructor
|
|
simdjson_inline simd8(int8_t _value) : simd8(splat(_value)) {}
|
|
// Array constructor
|
|
simdjson_inline simd8(const int8_t values[16]) : simd8(load(values)) {}
|
|
// Member-by-member initialization
|
|
simdjson_inline simd8(
|
|
int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
|
|
int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
|
|
) : simd8({
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15
|
|
}) {}
|
|
// Repeat 16 values as many times as necessary (usually for lookup tables)
|
|
simdjson_inline static simd8<int8_t> repeat_16(
|
|
int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
|
|
int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
|
|
) {
|
|
return simd8<int8_t>(
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15
|
|
);
|
|
}
|
|
|
|
// Order-sensitive comparisons
|
|
simdjson_inline simd8<int8_t> max_val(const simd8<int8_t> other) const { return __lsx_vmax_b(*this, other); }
|
|
simdjson_inline simd8<int8_t> min_val(const simd8<int8_t> other) const { return __lsx_vmin_b(*this, other); }
|
|
simdjson_inline simd8<bool> operator>(const simd8<int8_t> other) const { return __lsx_vslt_b(other, *this); }
|
|
simdjson_inline simd8<bool> operator<(const simd8<int8_t> other) const { return __lsx_vslt_b(*this, other); }
|
|
};
|
|
|
|
// Unsigned bytes
|
|
template<>
|
|
struct simd8<uint8_t>: base8_numeric<uint8_t> {
|
|
simdjson_inline simd8() : base8_numeric<uint8_t>() {}
|
|
simdjson_inline simd8(const __m128i _value) : base8_numeric<uint8_t>(_value) {}
|
|
// Splat constructor
|
|
simdjson_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
|
|
// Array constructor
|
|
simdjson_inline simd8(const uint8_t values[16]) : simd8(load(values)) {}
|
|
// Member-by-member initialization
|
|
simdjson_inline simd8(
|
|
uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
|
|
uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
|
|
) : simd8(__m128i(v16u8{
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15
|
|
})) {}
|
|
// Repeat 16 values as many times as necessary (usually for lookup tables)
|
|
simdjson_inline static simd8<uint8_t> repeat_16(
|
|
uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
|
|
uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
|
|
) {
|
|
return simd8<uint8_t>(
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15
|
|
);
|
|
}
|
|
|
|
// Saturated math
|
|
simdjson_inline simd8<uint8_t> saturating_add(const simd8<uint8_t> other) const { return __lsx_vsadd_bu(*this, other); }
|
|
simdjson_inline simd8<uint8_t> saturating_sub(const simd8<uint8_t> other) const { return __lsx_vssub_bu(*this, other); }
|
|
|
|
// Order-specific operations
|
|
simdjson_inline simd8<uint8_t> max_val(const simd8<uint8_t> other) const { return __lsx_vmax_bu(*this, other); }
|
|
simdjson_inline simd8<uint8_t> min_val(const simd8<uint8_t> other) const { return __lsx_vmin_bu(other, *this); }
|
|
// Same as >, but only guarantees true is nonzero (< guarantees true = -1)
|
|
simdjson_inline simd8<uint8_t> gt_bits(const simd8<uint8_t> other) const { return this->saturating_sub(other); }
|
|
// Same as <, but only guarantees true is nonzero (< guarantees true = -1)
|
|
simdjson_inline simd8<uint8_t> lt_bits(const simd8<uint8_t> other) const { return other.saturating_sub(*this); }
|
|
simdjson_inline simd8<bool> operator<=(const simd8<uint8_t> other) const { return other.max_val(*this) == other; }
|
|
simdjson_inline simd8<bool> operator>=(const simd8<uint8_t> other) const { return other.min_val(*this) == other; }
|
|
simdjson_inline simd8<bool> operator>(const simd8<uint8_t> other) const { return this->gt_bits(other).any_bits_set(); }
|
|
simdjson_inline simd8<bool> operator<(const simd8<uint8_t> other) const { return this->lt_bits(other).any_bits_set(); }
|
|
|
|
// Bit-specific operations
|
|
simdjson_inline simd8<bool> bits_not_set() const { return *this == uint8_t(0); }
|
|
simdjson_inline simd8<bool> bits_not_set(simd8<uint8_t> bits) const { return (*this & bits).bits_not_set(); }
|
|
simdjson_inline simd8<bool> any_bits_set() const { return ~this->bits_not_set(); }
|
|
simdjson_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const { return ~this->bits_not_set(bits); }
|
|
simdjson_inline bool is_ascii() const { return 0 == __lsx_vpickve2gr_w(__lsx_vmskltz_b(*this), 0); }
|
|
simdjson_inline bool bits_not_set_anywhere() const { return 0 == __lsx_vpickve2gr_hu(__lsx_vmsknz_b(*this), 0); }
|
|
simdjson_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); }
|
|
simdjson_inline bool bits_not_set_anywhere(simd8<uint8_t> bits) const {
|
|
return 0 == __lsx_vpickve2gr_hu(__lsx_vmsknz_b(__lsx_vand_v(*this, bits)), 0);
|
|
}
|
|
simdjson_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const { return !bits_not_set_anywhere(bits); }
|
|
template<int N>
|
|
simdjson_inline simd8<uint8_t> shr() const { return simd8<uint8_t>(__lsx_vsrli_b(*this, N)); }
|
|
template<int N>
|
|
simdjson_inline simd8<uint8_t> shl() const { return simd8<uint8_t>(__lsx_vslli_b(*this, N)); }
|
|
};
|
|
|
|
template<typename T>
|
|
struct simd8x64 {
|
|
static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
|
|
static_assert(NUM_CHUNKS == 4, "LSX kernel should use four registers per 64-byte block.");
|
|
const simd8<T> chunks[NUM_CHUNKS];
|
|
|
|
simd8x64(const simd8x64<T>& o) = delete; // no copy allowed
|
|
simd8x64<T>& operator=(const simd8<T>& other) = delete; // no assignment allowed
|
|
simd8x64() = delete; // no default constructor allowed
|
|
|
|
simdjson_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1, const simd8<T> chunk2, const simd8<T> chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {}
|
|
simdjson_inline simd8x64(const T ptr[64]) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+16), simd8<T>::load(ptr+32), simd8<T>::load(ptr+48)} {}
|
|
|
|
simdjson_inline uint64_t compress(uint64_t mask, T * output) const {
|
|
uint16_t mask1 = uint16_t(mask);
|
|
uint16_t mask2 = uint16_t(mask >> 16);
|
|
uint16_t mask3 = uint16_t(mask >> 32);
|
|
uint16_t mask4 = uint16_t(mask >> 48);
|
|
__m128i zcnt = __lsx_vpcnt_h(__m128i(v2u64{~mask, 0}));
|
|
uint64_t zcnt1 = __lsx_vpickve2gr_hu(zcnt, 0);
|
|
uint64_t zcnt2 = __lsx_vpickve2gr_hu(zcnt, 1);
|
|
uint64_t zcnt3 = __lsx_vpickve2gr_hu(zcnt, 2);
|
|
uint64_t zcnt4 = __lsx_vpickve2gr_hu(zcnt, 3);
|
|
uint8_t *voutput = reinterpret_cast<uint8_t*>(output);
|
|
// There should be a critical value which processes in scaler is faster.
|
|
if (zcnt1)
|
|
this->chunks[0].compress(mask1, reinterpret_cast<T*>(voutput));
|
|
voutput += zcnt1;
|
|
if (zcnt2)
|
|
this->chunks[1].compress(mask2, reinterpret_cast<T*>(voutput));
|
|
voutput += zcnt2;
|
|
if (zcnt3)
|
|
this->chunks[2].compress(mask3, reinterpret_cast<T*>(voutput));
|
|
voutput += zcnt3;
|
|
if (zcnt4)
|
|
this->chunks[3].compress(mask4, reinterpret_cast<T*>(voutput));
|
|
voutput += zcnt4;
|
|
return reinterpret_cast<uint64_t>(voutput) - reinterpret_cast<uint64_t>(output);
|
|
}
|
|
|
|
simdjson_inline void store(T ptr[64]) const {
|
|
this->chunks[0].store(ptr+sizeof(simd8<T>)*0);
|
|
this->chunks[1].store(ptr+sizeof(simd8<T>)*1);
|
|
this->chunks[2].store(ptr+sizeof(simd8<T>)*2);
|
|
this->chunks[3].store(ptr+sizeof(simd8<T>)*3);
|
|
}
|
|
|
|
simdjson_inline uint64_t to_bitmask() const {
|
|
__m128i mask1 = __lsx_vmskltz_b(this->chunks[0]);
|
|
__m128i mask2 = __lsx_vmskltz_b(this->chunks[1]);
|
|
__m128i mask3 = __lsx_vmskltz_b(this->chunks[2]);
|
|
__m128i mask4 = __lsx_vmskltz_b(this->chunks[3]);
|
|
mask1 = __lsx_vilvl_h(mask2, mask1);
|
|
mask2 = __lsx_vilvl_h(mask4, mask3);
|
|
return __lsx_vpickve2gr_du(__lsx_vilvl_w(mask2, mask1), 0);
|
|
}
|
|
|
|
simdjson_inline simd8<T> reduce_or() const {
|
|
return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]);
|
|
}
|
|
|
|
simdjson_inline uint64_t eq(const T m) const {
|
|
const simd8<T> mask = simd8<T>::splat(m);
|
|
return simd8x64<bool>(
|
|
this->chunks[0] == mask,
|
|
this->chunks[1] == mask,
|
|
this->chunks[2] == mask,
|
|
this->chunks[3] == mask
|
|
).to_bitmask();
|
|
}
|
|
|
|
simdjson_inline uint64_t eq(const simd8x64<uint8_t> &other) const {
|
|
return simd8x64<bool>(
|
|
this->chunks[0] == other.chunks[0],
|
|
this->chunks[1] == other.chunks[1],
|
|
this->chunks[2] == other.chunks[2],
|
|
this->chunks[3] == other.chunks[3]
|
|
).to_bitmask();
|
|
}
|
|
|
|
simdjson_inline uint64_t lteq(const T m) const {
|
|
const simd8<T> mask = simd8<T>::splat(m);
|
|
return simd8x64<bool>(
|
|
this->chunks[0] <= mask,
|
|
this->chunks[1] <= mask,
|
|
this->chunks[2] <= mask,
|
|
this->chunks[3] <= mask
|
|
).to_bitmask();
|
|
}
|
|
}; // struct simd8x64<T>
|
|
|
|
} // namespace simd
|
|
} // unnamed namespace
|
|
} // namespace lsx
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_LSX_SIMD_H
|
|
/* end file simdjson/lsx/simd.h */
|
|
/* including simdjson/lsx/stringparsing_defs.h: #include "simdjson/lsx/stringparsing_defs.h" */
|
|
/* begin file simdjson/lsx/stringparsing_defs.h */
|
|
#ifndef SIMDJSON_LSX_STRINGPARSING_DEFS_H
|
|
#define SIMDJSON_LSX_STRINGPARSING_DEFS_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/lsx/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/lsx/simd.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/lsx/bitmanipulation.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace lsx {
|
|
namespace {
|
|
|
|
using namespace simd;
|
|
|
|
// Holds backslashes and quotes locations.
|
|
struct backslash_and_quote {
|
|
public:
|
|
static constexpr uint32_t BYTES_PROCESSED = 32;
|
|
simdjson_inline static backslash_and_quote copy_and_find(const uint8_t *src, uint8_t *dst);
|
|
|
|
simdjson_inline bool has_quote_first() { return ((bs_bits - 1) & quote_bits) != 0; }
|
|
simdjson_inline bool has_backslash() { return bs_bits != 0; }
|
|
simdjson_inline int quote_index() { return trailing_zeroes(quote_bits); }
|
|
simdjson_inline int backslash_index() { return trailing_zeroes(bs_bits); }
|
|
|
|
uint32_t bs_bits;
|
|
uint32_t quote_bits;
|
|
}; // struct backslash_and_quote
|
|
|
|
simdjson_inline backslash_and_quote backslash_and_quote::copy_and_find(const uint8_t *src, uint8_t *dst) {
  // Two 16-byte loads always touch BYTES_PROCESSED bytes, so this can read up to 31 bytes
  // beyond the buffer size; SIMDJSON_PADDING bytes of padding are required for that.
  static_assert(SIMDJSON_PADDING >= (BYTES_PROCESSED - 1), "backslash and quote finder must process fewer than SIMDJSON_PADDING bytes");
  const simd8<uint8_t> lower(src);
  const simd8<uint8_t> upper(src + sizeof(lower));
  lower.store(dst);
  upper.store(dst + sizeof(lower));

  // Getting a 64-bit bitmask is much cheaper than multiple 16-bit bitmasks on LSX; therefore,
  // the four comparison results are packed into one 64-byte mask block and turned into a
  // single bitmask: backslashes land in the low 32 bits, quotes in the high 32 bits.
  const uint64_t combined = simd8x64<bool>(lower == '\\', upper == '\\', lower == '"', upper == '"').to_bitmask();
  backslash_and_quote found;
  found.bs_bits = uint32_t(combined);
  found.quote_bits = uint32_t(combined >> 32);
  return found;
}
|
|
|
|
} // unnamed namespace
|
|
} // namespace lsx
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_LSX_STRINGPARSING_DEFS_H
|
|
/* end file simdjson/lsx/stringparsing_defs.h */
|
|
|
|
#define SIMDJSON_SKIP_BACKSLASH_SHORT_CIRCUIT 1
|
|
/* end file simdjson/lsx/begin.h */
|
|
/* including generic/amalgamated.h for lsx: #include <generic/amalgamated.h> */
|
|
/* begin file generic/amalgamated.h for lsx */
|
|
#if defined(SIMDJSON_CONDITIONAL_INCLUDE) && !defined(SIMDJSON_SRC_GENERIC_DEPENDENCIES_H)
|
|
#error generic/dependencies.h must be included before generic/amalgamated.h!
|
|
#endif
|
|
|
|
/* including generic/base.h for lsx: #include <generic/base.h> */
|
|
/* begin file generic/base.h for lsx */
|
|
#ifndef SIMDJSON_SRC_GENERIC_BASE_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_BASE_H */
|
|
/* amalgamation skipped (editor-only): #include <base.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/generic/base.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace lsx {
|
|
namespace {
|
|
|
|
struct json_character_block;
|
|
|
|
} // unnamed namespace
|
|
} // namespace lsx
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_BASE_H
|
|
/* end file generic/base.h for lsx */
|
|
/* including generic/dom_parser_implementation.h for lsx: #include <generic/dom_parser_implementation.h> */
|
|
/* begin file generic/dom_parser_implementation.h for lsx */
|
|
#ifndef SIMDJSON_SRC_GENERIC_DOM_PARSER_IMPLEMENTATION_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_DOM_PARSER_IMPLEMENTATION_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/generic/dom_parser_implementation.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
// Interface a dom parser implementation must fulfill
|
|
namespace simdjson {
|
|
namespace lsx {
|
|
namespace {
|
|
|
|
simdjson_inline simd8<uint8_t> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3);
|
|
simdjson_inline bool is_ascii(const simd8x64<uint8_t>& input);
|
|
|
|
} // unnamed namespace
|
|
} // namespace lsx
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_DOM_PARSER_IMPLEMENTATION_H
|
|
/* end file generic/dom_parser_implementation.h for lsx */
|
|
/* including generic/json_character_block.h for lsx: #include <generic/json_character_block.h> */
|
|
/* begin file generic/json_character_block.h for lsx */
|
|
#ifndef SIMDJSON_SRC_GENERIC_JSON_CHARACTER_BLOCK_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_JSON_CHARACTER_BLOCK_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/base.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace lsx {
|
|
namespace {
|
|
|
|
struct json_character_block {
|
|
static simdjson_inline json_character_block classify(const simd::simd8x64<uint8_t>& in);
|
|
|
|
simdjson_inline uint64_t whitespace() const noexcept { return _whitespace; }
|
|
simdjson_inline uint64_t op() const noexcept { return _op; }
|
|
simdjson_inline uint64_t scalar() const noexcept { return ~(op() | whitespace()); }
|
|
|
|
uint64_t _whitespace;
|
|
uint64_t _op;
|
|
};
|
|
|
|
} // unnamed namespace
|
|
} // namespace lsx
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_JSON_CHARACTER_BLOCK_H
|
|
/* end file generic/json_character_block.h for lsx */
|
|
/* end file generic/amalgamated.h for lsx */
|
|
/* including generic/stage1/amalgamated.h for lsx: #include <generic/stage1/amalgamated.h> */
|
|
/* begin file generic/stage1/amalgamated.h for lsx */
|
|
// Stuff other things depend on
|
|
/* including generic/stage1/base.h for lsx: #include <generic/stage1/base.h> */
|
|
/* begin file generic/stage1/base.h for lsx */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_BASE_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_BASE_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/base.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace lsx {
|
|
namespace {
|
|
namespace stage1 {
|
|
|
|
class bit_indexer;
|
|
template<size_t STEP_SIZE>
|
|
struct buf_block_reader;
|
|
struct json_block;
|
|
class json_minifier;
|
|
class json_scanner;
|
|
struct json_string_block;
|
|
class json_string_scanner;
|
|
class json_structural_indexer;
|
|
|
|
} // namespace stage1
|
|
|
|
namespace utf8_validation {
|
|
struct utf8_checker;
|
|
} // namespace utf8_validation
|
|
|
|
using utf8_validation::utf8_checker;
|
|
|
|
} // unnamed namespace
|
|
} // namespace lsx
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE1_BASE_H
|
|
/* end file generic/stage1/base.h for lsx */
|
|
/* including generic/stage1/buf_block_reader.h for lsx: #include <generic/stage1/buf_block_reader.h> */
|
|
/* begin file generic/stage1/buf_block_reader.h for lsx */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_BUF_BLOCK_READER_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_BUF_BLOCK_READER_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/base.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
#include <cstring>
|
|
|
|
namespace simdjson {
|
|
namespace lsx {
|
|
namespace {
|
|
namespace stage1 {
|
|
|
|
// Walks through a buffer in block-sized increments, loading the last part with spaces
|
|
template<size_t STEP_SIZE>
|
|
struct buf_block_reader {
|
|
public:
|
|
simdjson_inline buf_block_reader(const uint8_t *_buf, size_t _len);
|
|
simdjson_inline size_t block_index();
|
|
simdjson_inline bool has_full_block() const;
|
|
simdjson_inline const uint8_t *full_block() const;
|
|
/**
|
|
* Get the last block, padded with spaces.
|
|
*
|
|
* There will always be a last block, with at least 1 byte, unless len == 0 (in which case this
|
|
* function fills the buffer with spaces and returns 0. In particular, if len == STEP_SIZE there
|
|
* will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding.
|
|
*
|
|
* @return the number of effective characters in the last block.
|
|
*/
|
|
simdjson_inline size_t get_remainder(uint8_t *dst) const;
|
|
simdjson_inline void advance();
|
|
private:
|
|
const uint8_t *buf;
|
|
const size_t len;
|
|
const size_t lenminusstep;
|
|
size_t idx;
|
|
};
|
|
|
|
// Routines to print masks and text for debugging bitmask operations
|
|
// Debug helper: renders one block of raw input as printable text. Bytes that compare
// below ' ' when viewed as int8_t are shown as '_'. The result lives in a static buffer
// that is overwritten by the next call.
simdjson_unused static char * format_input_text_64(const uint8_t *text) {
  constexpr size_t BLOCK_BYTES = sizeof(simd8x64<uint8_t>);
  static char buf[BLOCK_BYTES + 1];
  for (size_t pos = 0; pos != BLOCK_BYTES; ++pos) {
    const int8_t byte = int8_t(text[pos]);
    if (byte < ' ') {
      buf[pos] = '_';
    } else {
      buf[pos] = byte;
    }
  }
  buf[BLOCK_BYTES] = '\0';
  return buf;
}
|
|
|
|
// Routines to print masks and text for debugging bitmask operations
|
|
// Debug helper: renders an already-loaded block as printable text, replacing every
// char below ' ' with '_'. The result lives in a static buffer that is overwritten by
// the next call.
simdjson_unused static char * format_input_text(const simd8x64<uint8_t>& in) {
  constexpr size_t BLOCK_BYTES = sizeof(simd8x64<uint8_t>);
  static char buf[BLOCK_BYTES + 1];
  in.store(reinterpret_cast<uint8_t*>(buf));
  for (char *cursor = buf; cursor != buf + BLOCK_BYTES; ++cursor) {
    if (*cursor < ' ') { *cursor = '_'; }
  }
  buf[BLOCK_BYTES] = '\0';
  return buf;
}
|
|
|
|
// Debug helper: renders only the bytes of a block selected by mask. Positions whose mask
// bit is clear are printed as ' '; selected bytes at or below ' ' are printed as '_'.
// The result lives in a static buffer that is overwritten by the next call.
simdjson_unused static char * format_input_text(const simd8x64<uint8_t>& in, uint64_t mask) {
  static char buf[sizeof(simd8x64<uint8_t>) + 1];
  in.store(reinterpret_cast<uint8_t*>(buf));
  for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
    if (buf[i] <= ' ') { buf[i] = '_'; }
    // Build the probe bit as a 64-bit value: the mask is 64 bits wide, and shifting a
    // narrower size_t by 32 or more would be undefined.
    if (!(mask & (uint64_t(1) << i))) { buf[i] = ' '; }
  }
  buf[sizeof(simd8x64<uint8_t>)] = '\0';
  return buf;
}
|
|
|
|
// Debug helper: renders a 64-bit mask as 64 characters, 'X' for a set bit and ' ' for a
// clear one, starting with bit 0. The result lives in a static buffer that is overwritten
// by the next call.
simdjson_unused static char * format_mask(uint64_t mask) {
  static char buf[sizeof(simd8x64<uint8_t>) + 1];
  for (size_t i=0; i<64; i++) {
    // Build the probe bit as a 64-bit value: the mask is 64 bits wide, and shifting a
    // narrower size_t by 32 or more would be undefined.
    buf[i] = (mask & (uint64_t(1) << i)) ? 'X' : ' ';
  }
  buf[64] = '\0';
  return buf;
}
|
|
|
|
template<size_t STEP_SIZE>
|
|
simdjson_inline buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {}
|
|
|
|
template<size_t STEP_SIZE>
|
|
simdjson_inline size_t buf_block_reader<STEP_SIZE>::block_index() { return idx; }
|
|
|
|
template<size_t STEP_SIZE>
|
|
simdjson_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const {
|
|
return idx < lenminusstep;
|
|
}
|
|
|
|
template<size_t STEP_SIZE>
|
|
simdjson_inline const uint8_t *buf_block_reader<STEP_SIZE>::full_block() const {
|
|
return &buf[idx];
|
|
}
|
|
|
|
template<size_t STEP_SIZE>
|
|
simdjson_inline size_t buf_block_reader<STEP_SIZE>::get_remainder(uint8_t *dst) const {
|
|
if(len == idx) { return 0; } // memcpy(dst, null, 0) will trigger an error with some sanitizers
|
|
std::memset(dst, 0x20, STEP_SIZE); // std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once.
|
|
std::memcpy(dst, buf + idx, len - idx);
|
|
return len - idx;
|
|
}
|
|
|
|
template<size_t STEP_SIZE>
|
|
simdjson_inline void buf_block_reader<STEP_SIZE>::advance() {
|
|
idx += STEP_SIZE;
|
|
}
|
|
|
|
} // namespace stage1
|
|
} // unnamed namespace
|
|
} // namespace lsx
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE1_BUF_BLOCK_READER_H
|
|
/* end file generic/stage1/buf_block_reader.h for lsx */
|
|
/* including generic/stage1/json_escape_scanner.h for lsx: #include <generic/stage1/json_escape_scanner.h> */
|
|
/* begin file generic/stage1/json_escape_scanner.h for lsx */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_JSON_ESCAPE_SCANNER_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_JSON_ESCAPE_SCANNER_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/buf_block_reader.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace lsx {
|
|
namespace {
|
|
namespace stage1 {
|
|
|
|
/**
|
|
* Scans for escape characters in JSON, taking care with multiple backslashes (\\n vs. \n).
|
|
*/
|
|
struct json_escape_scanner {
|
|
/** The actual escape characters (the backslashes themselves). */
|
|
uint64_t next_is_escaped = 0ULL;
|
|
|
|
struct escaped_and_escape {
|
|
/**
|
|
* Mask of escaped characters.
|
|
*
|
|
* ```
|
|
* \n \\n \\\n \\\\n \
|
|
* 0100100010100101000
|
|
* n \ \ n \ \
|
|
* ```
|
|
*/
|
|
uint64_t escaped;
|
|
/**
|
|
* Mask of escape characters.
|
|
*
|
|
* ```
|
|
* \n \\n \\\n \\\\n \
|
|
* 1001000101001010001
|
|
* \ \ \ \ \ \ \
|
|
* ```
|
|
*/
|
|
uint64_t escape;
|
|
};
|
|
|
|
/**
|
|
* Get a mask of both escape and escaped characters (the characters following a backslash).
|
|
*
|
|
* @param potential_escape A mask of the character that can escape others (but could be
|
|
* escaped itself). e.g. block.eq('\\')
|
|
*/
|
|
simdjson_really_inline escaped_and_escape next(uint64_t backslash) noexcept {
|
|
|
|
#if !SIMDJSON_SKIP_BACKSLASH_SHORT_CIRCUIT
|
|
if (!backslash) { return {next_escaped_without_backslashes(), 0}; }
|
|
#endif
|
|
|
|
// | | Mask (shows characters instead of 1's) | Depth | Instructions |
|
|
// |--------------------------------|----------------------------------------|-------|---------------------|
|
|
// | string | `\\n_\\\n___\\\n___\\\\___\\\\__\\\` | | |
|
|
// | | ` even odd even odd odd` | | |
|
|
// | potential_escape | ` \ \\\ \\\ \\\\ \\\\ \\\` | 1 | 1 (backslash & ~first_is_escaped)
|
|
// | escape_and_terminal_code | ` \n \ \n \ \n \ \ \ \ \ \` | 5 | 5 (next_escape_and_terminal_code())
|
|
// | escaped | `\ \ n \ n \ \ \ \ \ ` X | 6 | 7 (escape_and_terminal_code ^ (potential_escape | first_is_escaped))
|
|
// | escape | ` \ \ \ \ \ \ \ \ \ \` | 6 | 8 (escape_and_terminal_code & backslash)
|
|
// | first_is_escaped | `\ ` | 7 (*) | 9 (escape >> 63) ()
|
|
// (*) this is not needed until the next iteration
|
|
uint64_t escape_and_terminal_code = next_escape_and_terminal_code(backslash & ~this->next_is_escaped);
|
|
uint64_t escaped = escape_and_terminal_code ^ (backslash | this->next_is_escaped);
|
|
uint64_t escape = escape_and_terminal_code & backslash;
|
|
this->next_is_escaped = escape >> 63;
|
|
return {escaped, escape};
|
|
}
|
|
|
|
private:
|
|
static constexpr const uint64_t ODD_BITS = 0xAAAAAAAAAAAAAAAAULL;
|
|
|
|
simdjson_really_inline uint64_t next_escaped_without_backslashes() noexcept {
|
|
uint64_t escaped = this->next_is_escaped;
|
|
this->next_is_escaped = 0;
|
|
return escaped;
|
|
}
|
|
|
|
/**
|
|
* Returns a mask of the next escape characters (masking out escaped backslashes), along with
|
|
* any non-backslash escape codes.
|
|
*
|
|
* \n \\n \\\n \\\\n returns:
|
|
* \n \ \ \n \ \
|
|
* 11 100 1011 10100
|
|
*
|
|
* You are expected to mask out the first bit yourself if the previous block had a trailing
|
|
* escape.
|
|
*
|
|
* & the result with potential_escape to get just the escape characters.
|
|
* ^ the result with (potential_escape | first_is_escaped) to get escaped characters.
|
|
*/
|
|
static simdjson_really_inline uint64_t next_escape_and_terminal_code(uint64_t potential_escape) noexcept {
|
|
// If we were to just shift and mask out any odd bits, we'd actually get a *half* right answer:
|
|
// any even-aligned backslash runs would be correct! Odd-aligned backslash runs would be
|
|
// inverted (\\\ would be 010 instead of 101).
|
|
//
|
|
// ```
|
|
// string: | ____\\\\_\\\\_____ |
|
|
// maybe_escaped | ODD | \ \ \ \ |
|
|
// even-aligned ^^^ ^^^^ odd-aligned
|
|
// ```
|
|
//
|
|
// Taking that into account, our basic strategy is:
|
|
//
|
|
// 1. Use subtraction to produce a mask with 1's for even-aligned runs and 0's for
|
|
// odd-aligned runs.
|
|
// 2. XOR all odd bits, which masks out the odd bits in even-aligned runs, and brings IN the
|
|
// odd bits in odd-aligned runs.
|
|
// 3. & with backslash to clean up any stray bits.
|
|
// runs are set to 0, and then XORing with "odd":
|
|
//
|
|
// | | Mask (shows characters instead of 1's) | Instructions |
|
|
// |--------------------------------|----------------------------------------|---------------------|
|
|
// | string | `\\n_\\\n___\\\n___\\\\___\\\\__\\\` |
|
|
// | | ` even odd even odd odd` |
|
|
// | maybe_escaped | ` n \\n \\n \\\_ \\\_ \\` X | 1 (potential_escape << 1)
|
|
// | maybe_escaped_and_odd | ` \n_ \\n _ \\\n_ _ \\\__ _\\\_ \\\` | 1 (maybe_escaped | odd)
|
|
// | even_series_codes_and_odd | ` n_\\\ _ n_ _\\\\ _ _ ` | 1 (maybe_escaped_and_odd - potential_escape)
|
|
// | escape_and_terminal_code | ` \n \ \n \ \n \ \ \ \ \ \` | 1 (^ odd)
|
|
//
|
|
|
|
// Escaped characters are characters following an escape.
|
|
uint64_t maybe_escaped = potential_escape << 1;
|
|
|
|
// To distinguish odd from even escape sequences, therefore, we turn on any *starting*
|
|
// escapes that are on an odd byte. (We actually bring in all odd bits, for speed.)
|
|
// - Odd runs of backslashes are 0000, and the code at the end ("n" in \n or \\n) is 1.
|
|
// - Odd runs of backslashes are 1111, and the code at the end ("n" in \n or \\n) is 0.
|
|
// - All other odd bytes are 1, and even bytes are 0.
|
|
uint64_t maybe_escaped_and_odd_bits = maybe_escaped | ODD_BITS;
|
|
uint64_t even_series_codes_and_odd_bits = maybe_escaped_and_odd_bits - potential_escape;
|
|
|
|
// Now we flip all odd bytes back with xor. This:
|
|
// - Makes odd runs of backslashes go from 0000 to 1010
|
|
// - Makes even runs of backslashes go from 1111 to 1010
|
|
// - Sets actually-escaped codes to 1 (the n in \n and \\n: \n = 11, \\n = 100)
|
|
// - Resets all other bytes to 0
|
|
return even_series_codes_and_odd_bits ^ ODD_BITS;
|
|
}
|
|
};
|
|
|
|
} // namespace stage1
|
|
} // unnamed namespace
|
|
} // namespace lsx
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE1_JSON_ESCAPE_SCANNER_H
|
|
/* end file generic/stage1/json_escape_scanner.h for lsx */
|
|
/* including generic/stage1/json_string_scanner.h for lsx: #include <generic/stage1/json_string_scanner.h> */
|
|
/* begin file generic/stage1/json_string_scanner.h for lsx */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_JSON_STRING_SCANNER_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_JSON_STRING_SCANNER_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/json_escape_scanner.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace lsx {
|
|
namespace {
|
|
namespace stage1 {
|
|
|
|
struct json_string_block {
|
|
// We spell out the constructors in the hope of resolving inlining issues with Visual Studio 2017
|
|
simdjson_really_inline json_string_block(uint64_t escaped, uint64_t quote, uint64_t in_string) :
|
|
_escaped(escaped), _quote(quote), _in_string(in_string) {}
|
|
|
|
// Escaped characters (characters following an escape() character)
|
|
simdjson_really_inline uint64_t escaped() const { return _escaped; }
|
|
// Real (non-backslashed) quotes
|
|
simdjson_really_inline uint64_t quote() const { return _quote; }
|
|
// Only characters inside the string (not including the quotes)
|
|
simdjson_really_inline uint64_t string_content() const { return _in_string & ~_quote; }
|
|
// Return a mask of whether the given characters are inside a string (only works on non-quotes)
|
|
simdjson_really_inline uint64_t non_quote_inside_string(uint64_t mask) const { return mask & _in_string; }
|
|
// Return a mask of whether the given characters are inside a string (only works on non-quotes)
|
|
simdjson_really_inline uint64_t non_quote_outside_string(uint64_t mask) const { return mask & ~_in_string; }
|
|
// Tail of string (everything except the start quote)
|
|
simdjson_really_inline uint64_t string_tail() const { return _in_string ^ _quote; }
|
|
|
|
// escaped characters (backslashed--does not include the hex characters after \u)
|
|
uint64_t _escaped;
|
|
// real quotes (non-escaped ones)
|
|
uint64_t _quote;
|
|
// string characters (includes start quote but not end quote)
|
|
uint64_t _in_string;
|
|
};
|
|
|
|
// Scans blocks for string characters, storing the state necessary to do so
|
|
class json_string_scanner {
|
|
public:
|
|
simdjson_really_inline json_string_block next(const simd::simd8x64<uint8_t>& in);
|
|
// Returns either UNCLOSED_STRING or SUCCESS
|
|
simdjson_really_inline error_code finish();
|
|
|
|
private:
|
|
// Scans for escape characters
|
|
json_escape_scanner escape_scanner{};
|
|
// Whether the last iteration was still inside a string (all 1's = true, all 0's = false).
|
|
uint64_t prev_in_string = 0ULL;
|
|
};
|
|
|
|
//
|
|
// Return a mask of all string characters plus end quotes.
|
|
//
|
|
// prev_escaped is overflow saying whether the next character is escaped.
|
|
// prev_in_string is overflow saying whether we're still in a string.
|
|
//
|
|
// Backslash sequences outside of quotes will be detected in stage 2.
|
|
//
|
|
simdjson_really_inline json_string_block json_string_scanner::next(const simd::simd8x64<uint8_t>& in) {
|
|
const uint64_t backslash = in.eq('\\');
|
|
const uint64_t escaped = escape_scanner.next(backslash).escaped;
|
|
const uint64_t quote = in.eq('"') & ~escaped;
|
|
|
|
//
|
|
// prefix_xor flips on bits inside the string (and flips off the end quote).
|
|
//
|
|
// Then we xor with prev_in_string: if we were in a string already, its effect is flipped
|
|
// (characters inside strings are outside, and characters outside strings are inside).
|
|
//
|
|
const uint64_t in_string = prefix_xor(quote) ^ prev_in_string;
|
|
|
|
//
|
|
// Check if we're still in a string at the end of the box so the next block will know
|
|
//
|
|
prev_in_string = uint64_t(static_cast<int64_t>(in_string) >> 63);
|
|
|
|
// Use ^ to turn the beginning quote off, and the end quote on.
|
|
|
|
// We are returning a function-local object so either we get a move constructor
|
|
// or we get copy elision.
|
|
return json_string_block(escaped, quote, in_string);
|
|
}
|
|
|
|
// Called once the whole input has been scanned: a string that is still open at that point
// was never closed.
simdjson_really_inline error_code json_string_scanner::finish() {
  return prev_in_string ? UNCLOSED_STRING : SUCCESS;
}
|
|
|
|
} // namespace stage1
|
|
} // unnamed namespace
|
|
} // namespace lsx
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE1_JSON_STRING_SCANNER_H
|
|
/* end file generic/stage1/json_string_scanner.h for lsx */
|
|
/* including generic/stage1/utf8_lookup4_algorithm.h for lsx: #include <generic/stage1/utf8_lookup4_algorithm.h> */
|
|
/* begin file generic/stage1/utf8_lookup4_algorithm.h for lsx */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_UTF8_LOOKUP4_ALGORITHM_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_UTF8_LOOKUP4_ALGORITHM_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/dom_parser_implementation.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace lsx {
|
|
namespace {
|
|
namespace utf8_validation {
|
|
|
|
using namespace simd;
|
|
|
|
// Classifies every (prev1, input) byte pair with three 16-entry nibble lookups. Each error
// class owns one bit; a bit survives in the result only if all three lookups set it.
simdjson_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
  // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
  //         11______ 0_______
  //         11______ 11______
  constexpr uint8_t TOO_SHORT = 1<<0;
  // Bit 1 = Too Long (ASCII followed by continuation)
  //         0_______ 10______
  constexpr uint8_t TOO_LONG = 1<<1;
  // Bit 2 = Overlong 3-byte
  //         11100000 100_____
  constexpr uint8_t OVERLONG_3 = 1<<2;
  // Bit 3 = Too Large
  //         11110100 1001____
  //         11110100 101_____
  //         11110101 1001____
  //         11110101 101_____
  //         1111011_ 1001____
  //         1111011_ 101_____
  //         11111___ 1001____
  //         11111___ 101_____
  constexpr uint8_t TOO_LARGE = 1<<3;
  // Bit 4 = Surrogate
  //         11101101 101_____
  constexpr uint8_t SURROGATE = 1<<4;
  // Bit 5 = Overlong 2-byte
  //         1100000_ 10______
  constexpr uint8_t OVERLONG_2 = 1<<5;
  // Bit 6 is shared by two classes:
  //   Too Large with a 1000____ second byte
  //         11110101 1000____
  //         1111011_ 1000____
  //         11111___ 1000____
  constexpr uint8_t TOO_LARGE_1000 = 1<<6;
  //   Overlong 4-byte
  //         11110000 1000____
  constexpr uint8_t OVERLONG_4 = 1<<6;
  // Bit 7 = Two Continuations
  //         10______ 10______
  constexpr uint8_t TWO_CONTS = 1<<7;

  // These all have ____ in byte 1, so the low-nibble table must let them through everywhere.
  constexpr uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS;
  // Low nibble of byte 1 is 0101 or above.
  constexpr uint8_t CARRY_LARGE = CARRY | TOO_LARGE | TOO_LARGE_1000;
  // Errors common to every continuation byte (10______) in byte 2.
  constexpr uint8_t CONTINUATION = TOO_LONG | OVERLONG_2 | TWO_CONTS;

  // High nibble of byte 1.
  const simd8<uint8_t> lead_high = prev1.shr<4>().lookup_16<uint8_t>(
    // 0_______ ________ <ASCII in byte 1>
    TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
    TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
    // 10______ ________ <continuation in byte 1>
    TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
    // 1100____ ________ <two byte lead in byte 1>
    TOO_SHORT | OVERLONG_2,
    // 1101____ ________ <two byte lead in byte 1>
    TOO_SHORT,
    // 1110____ ________ <three byte lead in byte 1>
    TOO_SHORT | OVERLONG_3 | SURROGATE,
    // 1111____ ________ <four+ byte lead in byte 1>
    TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
  );

  // Low nibble of byte 1.
  const simd8<uint8_t> lead_low = (prev1 & 0x0F).lookup_16<uint8_t>(
    // ____0000 ________
    CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
    // ____0001 ________
    CARRY | OVERLONG_2,
    // ____001_ ________
    CARRY,
    CARRY,

    // ____0100 ________
    CARRY | TOO_LARGE,
    // ____0101 ________
    CARRY_LARGE,
    // ____011_ ________
    CARRY_LARGE,
    CARRY_LARGE,

    // ____1___ ________
    CARRY_LARGE,
    CARRY_LARGE,
    CARRY_LARGE,
    CARRY_LARGE,
    CARRY_LARGE,
    // ____1101 ________
    CARRY_LARGE | SURROGATE,
    CARRY_LARGE,
    CARRY_LARGE
  );

  // High nibble of byte 2.
  const simd8<uint8_t> next_high = input.shr<4>().lookup_16<uint8_t>(
    // ________ 0_______ <ASCII in byte 2>
    TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
    TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,

    // ________ 1000____
    CONTINUATION | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
    // ________ 1001____
    CONTINUATION | OVERLONG_3 | TOO_LARGE,
    // ________ 101_____
    CONTINUATION | SURROGATE | TOO_LARGE,
    CONTINUATION | SURROGATE | TOO_LARGE,

    // ________ 11______
    TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
  );

  return (lead_high & lead_low & next_high);
}
|
|
// Combines the special-case bits (sc) with the positions where the bytes two and three
// places back require a continuation byte: the top bit of the "must be a continuation"
// mask is XORed into sc.
simdjson_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
    const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
  // Bytes located two and three positions before each input byte.
  const simd8<uint8_t> two_back = input.prev<2>(prev_input);
  const simd8<uint8_t> three_back = input.prev<3>(prev_input);
  // The result type is spelled out so that whatever must_be_2_3_continuation returns is
  // converted to a byte vector before masking.
  const simd8<uint8_t> needs_continuation = must_be_2_3_continuation(two_back, three_back);
  const simd8<uint8_t> needs_continuation_top_bit = needs_continuation & uint8_t(0x80);
  return needs_continuation_top_bit ^ sc;
}
|
|
|
|
//
|
|
// Return nonzero if there are incomplete multibyte characters at the end of the block:
|
|
// e.g. if there is a 4-byte character, but it's 3 bytes from the end.
|
|
//
|
|
simdjson_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input) {
|
|
// If the previous input's last 3 bytes match this, they're too short (they ended at EOF):
|
|
// ... 1111____ 111_____ 11______
|
|
#if SIMDJSON_IMPLEMENTATION_ICELAKE
|
|
static const uint8_t max_array[64] = {
|
|
255, 255, 255, 255, 255, 255, 255, 255,
|
|
255, 255, 255, 255, 255, 255, 255, 255,
|
|
255, 255, 255, 255, 255, 255, 255, 255,
|
|
255, 255, 255, 255, 255, 255, 255, 255,
|
|
255, 255, 255, 255, 255, 255, 255, 255,
|
|
255, 255, 255, 255, 255, 255, 255, 255,
|
|
255, 255, 255, 255, 255, 255, 255, 255,
|
|
255, 255, 255, 255, 255, 0xf0u-1, 0xe0u-1, 0xc0u-1
|
|
};
|
|
#else
|
|
static const uint8_t max_array[32] = {
|
|
255, 255, 255, 255, 255, 255, 255, 255,
|
|
255, 255, 255, 255, 255, 255, 255, 255,
|
|
255, 255, 255, 255, 255, 255, 255, 255,
|
|
255, 255, 255, 255, 255, 0xf0u-1, 0xe0u-1, 0xc0u-1
|
|
};
|
|
#endif
|
|
const simd8<uint8_t> max_value(&max_array[sizeof(max_array)-sizeof(simd8<uint8_t>)]);
|
|
return input.gt_bits(max_value);
|
|
}
|
|
|
|
struct utf8_checker {
|
|
// If this is nonzero, there has been a UTF-8 error.
|
|
simd8<uint8_t> error;
|
|
// The last input we received
|
|
simd8<uint8_t> prev_input_block;
|
|
// Whether the last input we received was incomplete (used for ASCII fast path)
|
|
simd8<uint8_t> prev_incomplete;
|
|
|
|
//
|
|
// Check whether the current bytes are valid UTF-8.
|
|
//
|
|
simdjson_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
|
|
// Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
|
|
// (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
|
|
simd8<uint8_t> prev1 = input.prev<1>(prev_input);
|
|
simd8<uint8_t> sc = check_special_cases(input, prev1);
|
|
this->error |= check_multibyte_lengths(input, prev_input, sc);
|
|
}
|
|
|
|
// The only problem that can happen at EOF is that a multibyte character is too short
|
|
// or a byte value too large in the last bytes: check_special_cases only checks for bytes
|
|
// too large in the first of two bytes.
|
|
simdjson_inline void check_eof() {
|
|
// If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
|
|
// possibly finish them.
|
|
this->error |= this->prev_incomplete;
|
|
}
|
|
|
|
simdjson_inline void check_next_input(const simd8x64<uint8_t>& input) {
|
|
if(simdjson_likely(is_ascii(input))) {
|
|
this->error |= this->prev_incomplete;
|
|
} else {
|
|
// you might think that a for-loop would work, but under Visual Studio, it is not good enough.
|
|
static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 1)
|
|
||(simd8x64<uint8_t>::NUM_CHUNKS == 2)
|
|
|| (simd8x64<uint8_t>::NUM_CHUNKS == 4),
|
|
"We support one, two or four chunks per 64-byte block.");
|
|
SIMDJSON_IF_CONSTEXPR (simd8x64<uint8_t>::NUM_CHUNKS == 1) {
|
|
this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
|
|
} else SIMDJSON_IF_CONSTEXPR (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
|
|
this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
|
|
this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
|
|
} else SIMDJSON_IF_CONSTEXPR (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
|
|
this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
|
|
this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
|
|
this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
|
|
this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
|
|
}
|
|
this->prev_incomplete = is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1]);
|
|
this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1];
|
|
}
|
|
}
|
|
// do not forget to call check_eof!
|
|
simdjson_inline error_code errors() {
|
|
return this->error.any_bits_set_anywhere() ? error_code::UTF8_ERROR : error_code::SUCCESS;
|
|
}
|
|
|
|
}; // struct utf8_checker
|
|
} // namespace utf8_validation
|
|
|
|
} // unnamed namespace
|
|
} // namespace lsx
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE1_UTF8_LOOKUP4_ALGORITHM_H
|
|
/* end file generic/stage1/utf8_lookup4_algorithm.h for lsx */
|
|
/* including generic/stage1/json_scanner.h for lsx: #include <generic/stage1/json_scanner.h> */
|
|
/* begin file generic/stage1/json_scanner.h for lsx */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_JSON_SCANNER_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_JSON_SCANNER_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/json_character_block.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/json_string_scanner.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace lsx {
|
|
namespace {
|
|
namespace stage1 {
|
|
|
|
/**
|
|
* A block of scanned json, with information on operators and scalars.
|
|
*
|
|
* We seek to identify pseudo-structural characters. Anything that is inside
|
|
* a string must be omitted (hence & ~_string.string_tail()).
|
|
* Otherwise, pseudo-structural characters come in two forms.
|
|
* 1. We have the structural characters ([,],{,},:, comma). The
|
|
* term 'structural character' is from the JSON RFC.
|
|
* 2. We have the 'scalar pseudo-structural characters'.
|
|
* Scalars are quotes, and any character except structural characters and white space.
|
|
*
|
|
* To identify the scalar pseudo-structural characters, we must look at what comes
|
|
* before them: it must be a space, a quote or a structural character.
|
|
* Starting with simdjson v0.3, we identify them by
|
|
* negation: we identify everything that is followed by a non-quote scalar,
|
|
* and we negate that. Whatever remains must be a 'scalar pseudo-structural character'.
|
|
*/
|
|
struct json_block {
|
|
public:
|
|
// We spell out the constructors in the hope of resolving inlining issues with Visual Studio 2017
|
|
simdjson_inline json_block(json_string_block&& string, json_character_block characters, uint64_t follows_potential_nonquote_scalar) :
|
|
_string(std::move(string)), _characters(characters), _follows_potential_nonquote_scalar(follows_potential_nonquote_scalar) {}
|
|
simdjson_inline json_block(json_string_block string, json_character_block characters, uint64_t follows_potential_nonquote_scalar) :
|
|
_string(string), _characters(characters), _follows_potential_nonquote_scalar(follows_potential_nonquote_scalar) {}
|
|
|
|
/**
|
|
* The start of structurals.
|
|
* In simdjson prior to v0.3, these were called the pseudo-structural characters.
|
|
**/
|
|
simdjson_inline uint64_t structural_start() const noexcept { return potential_structural_start() & ~_string.string_tail(); }
|
|
/** All JSON whitespace (i.e. not in a string) */
|
|
simdjson_inline uint64_t whitespace() const noexcept { return non_quote_outside_string(_characters.whitespace()); }
|
|
|
|
// Helpers
|
|
|
|
/** Whether the given characters are inside a string (only works on non-quotes) */
|
|
simdjson_inline uint64_t non_quote_inside_string(uint64_t mask) const noexcept { return _string.non_quote_inside_string(mask); }
|
|
/** Whether the given characters are outside a string (only works on non-quotes) */
|
|
simdjson_inline uint64_t non_quote_outside_string(uint64_t mask) const noexcept { return _string.non_quote_outside_string(mask); }
|
|
|
|
// string and escape characters
|
|
json_string_block _string;
|
|
// whitespace, structural characters ('operators'), scalars
|
|
json_character_block _characters;
|
|
// whether the previous character was a scalar
|
|
uint64_t _follows_potential_nonquote_scalar;
|
|
private:
|
|
// Potential structurals (i.e. disregarding strings)
|
|
|
|
/**
|
|
* structural elements ([,],{,},:, comma) plus scalar starts like 123, true and "abc".
|
|
* They may reside inside a string.
|
|
**/
|
|
simdjson_inline uint64_t potential_structural_start() const noexcept { return _characters.op() | potential_scalar_start(); }
|
|
/**
|
|
* The start of non-operator runs, like 123, true and "abc".
|
|
* It main reside inside a string.
|
|
**/
|
|
simdjson_inline uint64_t potential_scalar_start() const noexcept {
|
|
// The term "scalar" refers to anything except structural characters and white space
|
|
// (so letters, numbers, quotes).
|
|
// Whenever it is preceded by something that is not a structural element ({,},[,],:, ") nor a white-space
|
|
// then we know that it is irrelevant structurally.
|
|
return _characters.scalar() & ~follows_potential_scalar();
|
|
}
|
|
/**
|
|
* Whether the given character is immediately after a non-operator like 123, true.
|
|
* The characters following a quote are not included.
|
|
*/
|
|
simdjson_inline uint64_t follows_potential_scalar() const noexcept {
|
|
// _follows_potential_nonquote_scalar: is defined as marking any character that follows a character
|
|
// that is not a structural element ({,},[,],:, comma) nor a quote (") and that is not a
|
|
// white space.
|
|
// It is understood that within quoted region, anything at all could be marked (irrelevant).
|
|
return _follows_potential_nonquote_scalar;
|
|
}
|
|
};
|
|
|
|
/**
|
|
* Scans JSON for important bits: structural characters or 'operators', strings, and scalars.
|
|
*
|
|
* The scanner starts by calculating two distinct things:
|
|
* - string characters (taking \" into account)
|
|
* - structural characters or 'operators' ([]{},:, comma)
|
|
* and scalars (runs of non-operators like 123, true and "abc")
|
|
*
|
|
* To minimize data dependency (a key component of the scanner's speed), it finds these in parallel:
|
|
* in particular, the operator/scalar bit will find plenty of things that are actually part of
|
|
* strings. When we're done, json_block will fuse the two together by masking out tokens that are
|
|
* part of a string.
|
|
*/
|
|
class json_scanner {
|
|
public:
|
|
json_scanner() = default;
|
|
simdjson_inline json_block next(const simd::simd8x64<uint8_t>& in);
|
|
// Returns either UNCLOSED_STRING or SUCCESS
|
|
simdjson_inline error_code finish();
|
|
|
|
private:
|
|
// Whether the last character of the previous iteration is part of a scalar token
|
|
// (anything except whitespace or a structural character/'operator').
|
|
uint64_t prev_scalar = 0ULL;
|
|
json_string_scanner string_scanner{};
|
|
};
|
|
|
|
|
|
//
|
|
// Check if the current character immediately follows a matching character.
|
|
//
|
|
// For example, this checks for quotes with backslashes in front of them:
|
|
//
|
|
// const uint64_t backslashed_quote = in.eq('"') & immediately_follows(in.eq('\'), prev_backslash);
|
|
//
|
|
simdjson_inline uint64_t follows(const uint64_t match, uint64_t &overflow) {
|
|
const uint64_t result = match << 1 | overflow;
|
|
overflow = match >> 63;
|
|
return result;
|
|
}
|
|
|
|
// Scans one block: string information, character classes, and the mask of characters that
// follow a non-quote scalar.
simdjson_inline json_block json_scanner::next(const simd::simd8x64<uint8_t>& in) {
  json_string_block string_info = string_scanner.next(in);
  // identifies the white-space and the structural characters
  json_character_block char_classes = json_character_block::classify(in);

  // The term "scalar" refers to anything except structural characters and white space
  // (so letters, numbers, quotes). The mask computed below marks whatever follows a
  // non-quote scalar (so letters and numbers).
  //
  // A terminal quote should either be followed by a structural character (comma, brace,
  // bracket, colon) or nothing. However, we still want ' "a string"true ' to mark the 't' of
  // 'true' as a potential pseudo-structural character just like we would if we had
  // ' "a string" true '; otherwise we may need to add an extra check when parsing strings.
  // That is why quotes are removed from the scalar mask before shifting.
  //
  // Performance: there are many ways to skin this cat.
  const uint64_t scalar_without_quotes = char_classes.scalar() & ~string_info.quote();
  uint64_t after_nonquote_scalar = follows(scalar_without_quotes, prev_scalar);

  // string_info is a function-local object so either it moves or the copy is elided.
  return json_block(string_info, char_classes, after_nonquote_scalar);
}
|
|
|
|
// End-of-input check; the result comes straight from the string scanner.
simdjson_inline error_code json_scanner::finish() {
  const error_code string_status = string_scanner.finish();
  return string_status;
}
|
|
|
|
} // namespace stage1
|
|
} // unnamed namespace
|
|
} // namespace lsx
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE1_JSON_SCANNER_H
|
|
/* end file generic/stage1/json_scanner.h for lsx */
|
|
|
|
// All other declarations
|
|
/* including generic/stage1/find_next_document_index.h for lsx: #include <generic/stage1/find_next_document_index.h> */
|
|
/* begin file generic/stage1/find_next_document_index.h for lsx */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_FIND_NEXT_DOCUMENT_INDEX_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_FIND_NEXT_DOCUMENT_INDEX_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/generic/dom_parser_implementation.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace lsx {
|
|
namespace {
|
|
namespace stage1 {
|
|
|
|
/**
|
|
* This algorithm is used to quickly identify the last structural position that
|
|
* makes up a complete document.
|
|
*
|
|
* It does this by going backwards and finding the last *document boundary* (a
|
|
* place where one value follows another without a comma between them). If the
|
|
* last document (the characters after the boundary) has an equal number of
|
|
* start and end brackets, it is considered complete.
|
|
*
|
|
* Simply put, we iterate over the structural characters, starting from
|
|
* the end. We consider that we found the end of a JSON document when the
|
|
* first element of the pair is NOT one of these characters: '{' '[' ':' ','
|
|
* and when the second element is NOT one of these characters: '}' ']' ':' ','.
|
|
*
|
|
* This simple comparison works most of the time, but it does not cover cases
|
|
* where the batch's structural indexes contain a perfect amount of documents.
|
|
* In such a case, we do not have access to the structural index which follows
|
|
* the last document, therefore, we do not have access to the second element in
|
|
* the pair, and that means we cannot identify the last document. To fix this
|
|
* issue, we keep a count of the open and closed curly/square braces we found
|
|
* while searching for the pair. When we find a pair AND the count of open and
|
|
* closed curly/square braces is the same, we know that we just passed a
|
|
* complete document, therefore the last json buffer location is the end of the
|
|
* batch.
|
|
*/
|
|
// Walks the structural indexes backwards looking for the last document boundary.
// Returns 0 when there are no structurals or no complete document was found,
// n_structural_indexes when everything up to the end forms complete documents, and
// otherwise the structural position at which the trailing, incomplete document starts.
simdjson_inline uint32_t find_next_document_index(dom_parser_implementation &parser) {
  // Variant: do not count separately, just figure out depth
  const auto total = parser.n_structural_indexes;
  if (total == 0) { return 0; }

  // Net bracket/brace balance seen so far (openers add one, closers subtract one).
  int array_balance = 0;
  int object_balance = 0;

  for (auto pos = total - 1; pos > 0; pos--) {
    const auto current = parser.buf[parser.structural_indexes[pos]];
    // ':' ',' '}' and ']' can never be the first structural of a document.
    if (current == ':' || current == ',') { continue; }
    if (current == '}') { object_balance--; continue; }
    if (current == ']') { array_balance--; continue; }
    if (current == '{') {
      object_balance++;
    } else if (current == '[') {
      array_balance++;
    }

    // A document boundary also requires that the previous structural does not expect a
    // value to follow it.
    const auto previous = parser.buf[parser.structural_indexes[pos - 1]];
    if (previous == '{' || previous == '[' || previous == ':' || previous == ',') { continue; }

    // Last document is complete, so the next document will appear after!
    if (array_balance == 0 && object_balance == 0) {
      return total;
    }
    // Last document is incomplete; it starts at this position.
    return pos;
  }

  // If we made it to the end, we want to finish counting to see if we have a full document.
  const auto first = parser.buf[parser.structural_indexes[0]];
  if (first == '}') {
    object_balance--;
  } else if (first == ']') {
    array_balance--;
  } else if (first == '{') {
    object_balance++;
  } else if (first == '[') {
    array_balance++;
  }

  // Balanced counts mean we have a complete document.
  return (array_balance == 0 && object_balance == 0) ? total : 0;
}
|
|
|
|
} // namespace stage1
|
|
} // unnamed namespace
|
|
} // namespace lsx
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE1_FIND_NEXT_DOCUMENT_INDEX_H
|
|
/* end file generic/stage1/find_next_document_index.h for lsx */
|
|
/* including generic/stage1/json_minifier.h for lsx: #include <generic/stage1/json_minifier.h> */
|
|
/* begin file generic/stage1/json_minifier.h for lsx */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_JSON_MINIFIER_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_JSON_MINIFIER_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/json_scanner.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/buf_block_reader.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
// This file contains the common code every implementation uses in stage1
|
|
// It is intended to be included multiple times and compiled multiple times
|
|
// We assume the file in which it is included already includes
|
|
// "simdjson/stage1.h" (this simplifies amalgamation)
|
|
|
|
namespace simdjson {
|
|
namespace lsx {
|
|
namespace {
|
|
namespace stage1 {
|
|
|
|
class json_minifier {
|
|
public:
|
|
template<size_t STEP_SIZE>
|
|
static error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) noexcept;
|
|
|
|
private:
|
|
simdjson_inline json_minifier(uint8_t *_dst)
|
|
: dst{_dst}
|
|
{}
|
|
template<size_t STEP_SIZE>
|
|
simdjson_inline void step(const uint8_t *block_buf, buf_block_reader<STEP_SIZE> &reader) noexcept;
|
|
simdjson_inline void next(const simd::simd8x64<uint8_t>& in, const json_block& block);
|
|
simdjson_inline error_code finish(uint8_t *dst_start, size_t &dst_len);
|
|
json_scanner scanner{};
|
|
uint8_t *dst;
|
|
};
|
|
|
|
// Compresses the block against its whitespace mask into dst and moves dst forward by the
// value compress() returns.
simdjson_inline void json_minifier::next(const simd::simd8x64<uint8_t>& in, const json_block& block) {
  const uint64_t whitespace_mask = block.whitespace();
  const auto advance_by = in.compress(whitespace_mask, dst);
  dst += advance_by;
}
|
|
|
|
// Finalizes minification. If the scanner reports an error, dst_len is set to 0 and the error
// is returned; otherwise dst_len receives the distance from dst_start to the write position.
simdjson_inline error_code json_minifier::finish(uint8_t *dst_start, size_t &dst_len) {
  const error_code scan_status = scanner.finish();
  if (scan_status) {
    dst_len = 0;
    return scan_status;
  }
  dst_len = dst - dst_start;
  return SUCCESS;
}
|
|
|
|
template<>
|
|
simdjson_inline void json_minifier::step<128>(const uint8_t *block_buf, buf_block_reader<128> &reader) noexcept {
|
|
simd::simd8x64<uint8_t> in_1(block_buf);
|
|
simd::simd8x64<uint8_t> in_2(block_buf+64);
|
|
json_block block_1 = scanner.next(in_1);
|
|
json_block block_2 = scanner.next(in_2);
|
|
this->next(in_1, block_1);
|
|
this->next(in_2, block_2);
|
|
reader.advance();
|
|
}
|
|
|
|
template<>
|
|
simdjson_inline void json_minifier::step<64>(const uint8_t *block_buf, buf_block_reader<64> &reader) noexcept {
|
|
simd::simd8x64<uint8_t> in_1(block_buf);
|
|
json_block block_1 = scanner.next(in_1);
|
|
this->next(block_buf, block_1);
|
|
reader.advance();
|
|
}
|
|
|
|
template<size_t STEP_SIZE>
|
|
error_code json_minifier::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) noexcept {
|
|
buf_block_reader<STEP_SIZE> reader(buf, len);
|
|
json_minifier minifier(dst);
|
|
|
|
// Index the first n-1 blocks
|
|
while (reader.has_full_block()) {
|
|
minifier.step<STEP_SIZE>(reader.full_block(), reader);
|
|
}
|
|
|
|
// Index the last (remainder) block, padded with spaces
|
|
uint8_t block[STEP_SIZE];
|
|
size_t remaining_bytes = reader.get_remainder(block);
|
|
if (remaining_bytes > 0) {
|
|
// We do not want to write directly to the output stream. Rather, we write
|
|
// to a local buffer (for safety).
|
|
uint8_t out_block[STEP_SIZE];
|
|
uint8_t * const guarded_dst{minifier.dst};
|
|
minifier.dst = out_block;
|
|
minifier.step<STEP_SIZE>(block, reader);
|
|
size_t to_write = minifier.dst - out_block;
|
|
// In some cases, we could be enticed to consider the padded spaces
|
|
// as part of the string. This is fine as long as we do not write more
|
|
// than we consumed.
|
|
if(to_write > remaining_bytes) { to_write = remaining_bytes; }
|
|
memcpy(guarded_dst, out_block, to_write);
|
|
minifier.dst = guarded_dst + to_write;
|
|
}
|
|
return minifier.finish(dst, dst_len);
|
|
}
|
|
|
|
} // namespace stage1
|
|
} // unnamed namespace
|
|
} // namespace lsx
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE1_JSON_MINIFIER_H
|
|
/* end file generic/stage1/json_minifier.h for lsx */
|
|
/* including generic/stage1/json_structural_indexer.h for lsx: #include <generic/stage1/json_structural_indexer.h> */
|
|
/* begin file generic/stage1/json_structural_indexer.h for lsx */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_JSON_STRUCTURAL_INDEXER_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_JSON_STRUCTURAL_INDEXER_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/utf8_lookup4_algorithm.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/buf_block_reader.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/json_string_scanner.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/json_scanner.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/json_minifier.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/find_next_document_index.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
// This file contains the common code every implementation uses in stage1
|
|
// It is intended to be included multiple times and compiled multiple times
|
|
// We assume the file in which it is included already includes
|
|
// "simdjson/stage1.h" (this simplifies amalgamation)
|
|
|
|
namespace simdjson {
|
|
namespace lsx {
|
|
namespace {
|
|
namespace stage1 {
|
|
|
|
class bit_indexer {
|
|
public:
|
|
uint32_t *tail;
|
|
|
|
simdjson_inline bit_indexer(uint32_t *index_buf) : tail(index_buf) {}
|
|
|
|
#if SIMDJSON_PREFER_REVERSE_BITS
|
|
/**
|
|
* ARM lacks a fast trailing zero instruction, but it has a fast
|
|
* bit reversal instruction and a fast leading zero instruction.
|
|
* Thus it may be profitable to reverse the bits (once) and then
|
|
* to rely on a sequence of instructions that call the leading
|
|
* zero instruction.
|
|
*
|
|
* Performance notes:
|
|
* The chosen routine is not optimal in terms of data dependency
|
|
* since zero_leading_bit might require two instructions. However,
|
|
* it tends to minimize the total number of instructions which is
|
|
* beneficial.
|
|
*/
|
|
simdjson_inline void write_index(uint32_t idx, uint64_t& rev_bits, int i) {
|
|
int lz = leading_zeroes(rev_bits);
|
|
this->tail[i] = static_cast<uint32_t>(idx) + lz;
|
|
rev_bits = zero_leading_bit(rev_bits, lz);
|
|
}
|
|
#else
|
|
/**
|
|
* Under recent x64 systems, we often have both a fast trailing zero
|
|
* instruction and a fast 'clear-lower-bit' instruction so the following
|
|
* algorithm can be competitive.
|
|
*/
|
|
|
|
simdjson_inline void write_index(uint32_t idx, uint64_t& bits, int i) {
|
|
this->tail[i] = idx + trailing_zeroes(bits);
|
|
bits = clear_lowest_bit(bits);
|
|
}
|
|
#endif // SIMDJSON_PREFER_REVERSE_BITS
|
|
|
|
template <int START, int N>
|
|
simdjson_inline int write_indexes(uint32_t idx, uint64_t& bits) {
|
|
write_index(idx, bits, START);
|
|
SIMDJSON_IF_CONSTEXPR (N > 1) {
|
|
write_indexes<(N-1>0?START+1:START), (N-1>=0?N-1:1)>(idx, bits);
|
|
}
|
|
return START+N;
|
|
}
|
|
|
|
template <int START, int END, int STEP>
|
|
simdjson_inline int write_indexes_stepped(uint32_t idx, uint64_t& bits, int cnt) {
|
|
write_indexes<START, STEP>(idx, bits);
|
|
SIMDJSON_IF_CONSTEXPR ((START+STEP) < END) {
|
|
if (simdjson_unlikely((START+STEP) < cnt)) {
|
|
write_indexes_stepped<(START+STEP<END?START+STEP:END), END, STEP>(idx, bits, cnt);
|
|
}
|
|
}
|
|
return ((END-START) % STEP) == 0 ? END : (END-START) - ((END-START) % STEP) + STEP;
|
|
}
|
|
|
|
// flatten out values in 'bits' assuming that they are are to have values of idx
|
|
// plus their position in the bitvector, and store these indexes at
|
|
// base_ptr[base] incrementing base as we go
|
|
// will potentially store extra values beyond end of valid bits, so base_ptr
|
|
// needs to be large enough to handle this
|
|
//
|
|
// If the kernel sets SIMDJSON_GENERIC_JSON_STRUCTURAL_INDEXER_CUSTOM_BIT_INDEXER, then it
|
|
// will provide its own version of the code.
|
|
#ifdef SIMDJSON_GENERIC_JSON_STRUCTURAL_INDEXER_CUSTOM_BIT_INDEXER
|
|
simdjson_inline void write(uint32_t idx, uint64_t bits);
|
|
#else
|
|
simdjson_inline void write(uint32_t idx, uint64_t bits) {
|
|
// In some instances, the next branch is expensive because it is mispredicted.
|
|
// Unfortunately, in other cases,
|
|
// it helps tremendously.
|
|
if (bits == 0)
|
|
return;
|
|
|
|
int cnt = static_cast<int>(count_ones(bits));
|
|
|
|
#if SIMDJSON_PREFER_REVERSE_BITS
|
|
bits = reverse_bits(bits);
|
|
#endif
|
|
#ifdef SIMDJSON_STRUCTURAL_INDEXER_STEP
|
|
static constexpr const int STEP = SIMDJSON_STRUCTURAL_INDEXER_STEP;
|
|
#else
|
|
static constexpr const int STEP = 4;
|
|
#endif
|
|
static constexpr const int STEP_UNTIL = 24;
|
|
|
|
write_indexes_stepped<0, STEP_UNTIL, STEP>(idx, bits, cnt);
|
|
SIMDJSON_IF_CONSTEXPR (STEP_UNTIL < 64) {
|
|
if (simdjson_unlikely(STEP_UNTIL < cnt)) {
|
|
for (int i=STEP_UNTIL; i<cnt; i++) {
|
|
write_index(idx, bits, i);
|
|
}
|
|
}
|
|
}
|
|
|
|
this->tail += cnt;
|
|
}
|
|
#endif // SIMDJSON_GENERIC_JSON_STRUCTURAL_INDEXER_CUSTOM_BIT_INDEXER
|
|
|
|
};
|
|
|
|
class json_structural_indexer {
|
|
public:
|
|
/**
|
|
* Find the important bits of JSON in a 128-byte chunk, and add them to structural_indexes.
|
|
*
|
|
* @param partial Setting the partial parameter to true allows the find_structural_bits to
|
|
* tolerate unclosed strings. The caller should still ensure that the input is valid UTF-8. If
|
|
* you are processing substrings, you may want to call on a function like trimmed_length_safe_utf8.
|
|
*/
|
|
template<size_t STEP_SIZE>
|
|
static error_code index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, stage1_mode partial) noexcept;
|
|
|
|
private:
|
|
simdjson_inline json_structural_indexer(uint32_t *structural_indexes);
|
|
template<size_t STEP_SIZE>
|
|
simdjson_inline void step(const uint8_t *block, buf_block_reader<STEP_SIZE> &reader) noexcept;
|
|
simdjson_inline void next(const simd::simd8x64<uint8_t>& in, const json_block& block, size_t idx);
|
|
simdjson_inline error_code finish(dom_parser_implementation &parser, size_t idx, size_t len, stage1_mode partial);
|
|
|
|
json_scanner scanner{};
|
|
utf8_checker checker{};
|
|
bit_indexer indexer;
|
|
uint64_t prev_structurals = 0;
|
|
uint64_t unescaped_chars_error = 0;
|
|
};
|
|
|
|
/** Point the bit indexer at the start of the structural index buffer. */
simdjson_inline json_structural_indexer::json_structural_indexer(uint32_t *structural_indexes)
  : indexer(structural_indexes) {}
|
|
|
|
// Skip the last character if it is partial
|
|
simdjson_inline size_t trim_partial_utf8(const uint8_t *buf, size_t len) {
|
|
if (simdjson_unlikely(len < 3)) {
|
|
switch (len) {
|
|
case 2:
|
|
if (buf[len-1] >= 0xc0) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left
|
|
if (buf[len-2] >= 0xe0) { return len-2; } // 3- and 4-byte characters with only 2 bytes left
|
|
return len;
|
|
case 1:
|
|
if (buf[len-1] >= 0xc0) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left
|
|
return len;
|
|
case 0:
|
|
return len;
|
|
}
|
|
}
|
|
if (buf[len-1] >= 0xc0) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left
|
|
if (buf[len-2] >= 0xe0) { return len-2; } // 3- and 4-byte characters with only 1 byte left
|
|
if (buf[len-3] >= 0xf0) { return len-3; } // 4-byte characters with only 3 bytes left
|
|
return len;
|
|
}
|
|
|
|
//
|
|
// PERF NOTES:
|
|
// We pipe 2 inputs through these stages:
|
|
// 1. Load JSON into registers. This takes a long time and is highly parallelizable, so we load
|
|
// 2 inputs' worth at once so that by the time step 2 is looking for the input, it's available.
|
|
// 2. Scan the JSON for critical data: strings, scalars and operators. This is the critical path.
|
|
// The output of step 1 depends entirely on this information. These functions don't quite use
|
|
// up enough CPU: the second half of the functions is highly serial, only using 1 execution core
|
|
// at a time. The second input's scans has some dependency on the first ones finishing it, but
|
|
// they can make a lot of progress before they need that information.
|
|
// 3. Step 1 does not use enough capacity, so we run some extra stuff while we're waiting for that
|
|
// to finish: utf-8 checks and generating the output from the last iteration.
|
|
//
|
|
// The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all
|
|
// available capacity with just one input. Running 2 at a time seems to give the CPU a good enough
|
|
// workout.
|
|
//
|
|
template<size_t STEP_SIZE>
|
|
error_code json_structural_indexer::index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, stage1_mode partial) noexcept {
|
|
if (simdjson_unlikely(len > parser.capacity())) { return CAPACITY; }
|
|
// We guard the rest of the code so that we can assume that len > 0 throughout.
|
|
if (len == 0) { return EMPTY; }
|
|
if (is_streaming(partial)) {
|
|
len = trim_partial_utf8(buf, len);
|
|
// If you end up with an empty window after trimming
|
|
// the partial UTF-8 bytes, then chances are good that you
|
|
// have an UTF-8 formatting error.
|
|
if(len == 0) { return UTF8_ERROR; }
|
|
}
|
|
buf_block_reader<STEP_SIZE> reader(buf, len);
|
|
json_structural_indexer indexer(parser.structural_indexes.get());
|
|
|
|
// Read all but the last block
|
|
while (reader.has_full_block()) {
|
|
indexer.step<STEP_SIZE>(reader.full_block(), reader);
|
|
}
|
|
// Take care of the last block (will always be there unless file is empty which is
|
|
// not supposed to happen.)
|
|
uint8_t block[STEP_SIZE];
|
|
if (simdjson_unlikely(reader.get_remainder(block) == 0)) { return UNEXPECTED_ERROR; }
|
|
indexer.step<STEP_SIZE>(block, reader);
|
|
return indexer.finish(parser, reader.block_index(), len, partial);
|
|
}
|
|
|
|
template<>
|
|
simdjson_inline void json_structural_indexer::step<128>(const uint8_t *block, buf_block_reader<128> &reader) noexcept {
|
|
simd::simd8x64<uint8_t> in_1(block);
|
|
simd::simd8x64<uint8_t> in_2(block+64);
|
|
json_block block_1 = scanner.next(in_1);
|
|
json_block block_2 = scanner.next(in_2);
|
|
this->next(in_1, block_1, reader.block_index());
|
|
this->next(in_2, block_2, reader.block_index()+64);
|
|
reader.advance();
|
|
}
|
|
|
|
template<>
|
|
simdjson_inline void json_structural_indexer::step<64>(const uint8_t *block, buf_block_reader<64> &reader) noexcept {
|
|
simd::simd8x64<uint8_t> in_1(block);
|
|
json_block block_1 = scanner.next(in_1);
|
|
this->next(in_1, block_1, reader.block_index());
|
|
reader.advance();
|
|
}
|
|
|
|
/**
 * Account for one scanned 64-byte chunk that starts at input offset `idx`.
 *
 * Index output lags one chunk behind: this call writes the structurals saved
 * by the previous call (hence `idx-64`) and saves the current ones for later.
 */
simdjson_inline void json_structural_indexer::next(const simd::simd8x64<uint8_t>& in, const json_block& block, size_t idx) {
  const uint64_t control_chars = in.lteq(0x1F);
#if SIMDJSON_UTF8VALIDATION
  checker.check_next_input(in);
#endif
  // On the very first chunk prev_structurals is 0, so write() returns before
  // using the wrapped-around value of idx-64.
  indexer.write(uint32_t(idx-64), prev_structurals);
  prev_structurals = block.structural_start();
  unescaped_chars_error |= block.non_quote_inside_string(control_chars);
}
|
|
|
|
/**
 * Complete stage 1: flush the pending structurals, check the scanner and
 * character errors, and set up the parser's structural index bookkeeping.
 *
 * @param parser  parser whose structural_indexes buffer was being filled
 * @param idx     input offset just past the last processed chunk
 * @param len     length of the input
 * @param partial stage 1 mode; the streaming modes tolerate an unclosed string
 *                and truncate the indexes to the last complete document
 * @return the first error encountered, otherwise the UTF-8 checker's verdict
 */
simdjson_inline error_code json_structural_indexer::finish(dom_parser_implementation &parser, size_t idx, size_t len, stage1_mode partial) {
  // Write out the structurals saved by the final call to next().
  indexer.write(uint32_t(idx-64), prev_structurals);

  const error_code scan_error = scanner.finish();
  const bool streaming = is_streaming(partial);
  const bool have_unclosed_string = (scan_error == UNCLOSED_STRING);
  // Outside of streaming, anything but SUCCESS is fatal. When streaming, an
  // UNCLOSED_STRING is tolerated as well.
  const bool must_exit = (scan_error != SUCCESS) && !(streaming && have_unclosed_string);
  if (simdjson_unlikely(must_exit)) { return scan_error; }

  if (unescaped_chars_error) { return UNESCAPED_CHARS; }

  uint32_t * const indexes = parser.structural_indexes.get();
  parser.n_structural_indexes = uint32_t(indexer.tail - indexes);

  /***
   * The On Demand API requires special padding after the last index.
   *
   * This is related to https://github.com/simdjson/simdjson/issues/906
   * The goal is that parsing which runs past the last (valid) structural
   * character stops quickly. The index `len` is stored twice, so such parsing
   * sees the padding character at 'len' repeated. Only [, ] and } can be
   * repeated without triggering a JSON error. A document cannot start with
   * ] or }, nor continue that way after a comma, a colon or a simple value, so
   * the repeated character would have to be [ and the document would have to
   * start with [. Such a document should end with ]; enforcing that rule
   * yields ][[ which is invalid.
   *
   * This is illustrated with the test array_iterate_unclosed_error() on the
   * following input:
   * R"({ "a": [,,)"
   **/
  indexes[parser.n_structural_indexes] = uint32_t(len); // used later in partial == stage1_mode::streaming_final
  indexes[parser.n_structural_indexes + 1] = uint32_t(len);
  indexes[parser.n_structural_indexes + 2] = 0;
  parser.next_structural_index = 0;

  // A valid JSON file cannot have zero structural indexes.
  if (simdjson_unlikely(parser.n_structural_indexes == 0u)) { return EMPTY; }
  // The last index must not point past the end of the input.
  if (simdjson_unlikely(indexes[parser.n_structural_indexes - 1] > len)) { return UNEXPECTED_ERROR; }

  if (partial == stage1_mode::streaming_partial) {
    // With an unclosed string, the last structural is its opening quote; omit it.
    if (have_unclosed_string) {
      parser.n_structural_indexes--;
      // Nothing left once the quote is dropped.
      if (simdjson_unlikely(parser.n_structural_indexes == 0u)) { return CAPACITY; }
    }
    // Truncate to the end of the last complete document (or zero).
    auto new_structural_indexes = find_next_document_index(parser);
    if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) {
      if (indexes[0] == 0) {
        // The buffer is partial, the document starts at index 0 and it is still
        // incomplete: it is too big to parse.
        return CAPACITY;
      }
      // The document might be parseable; the window just had a lot of white space.
      parser.n_structural_indexes = 0;
      return EMPTY;
    }
    parser.n_structural_indexes = new_structural_indexes;
  } else if (partial == stage1_mode::streaming_final) {
    if (have_unclosed_string) { parser.n_structural_indexes--; }
    // Truncate to the end of the last complete document (or zero). In this mode
    // trailing garbage is deliberately ignored: people who process streams of
    // JSON documents often truncate them (decompressing a file, receiving
    // chunks from the network) without knowing where the last document ends.
    // document_stream instances let users find out which documents were parsed
    // (see the iterator.source() method).
    parser.n_structural_indexes = find_next_document_index(parser);
    // Keep the pre-overwrite value of the slot at n_structural_indexes one slot
    // further, so that the client can see whether truncation took place: without
    // truncation that value is len, otherwise it is some prior index.
    indexes[parser.n_structural_indexes + 1] = indexes[parser.n_structural_indexes];
    // This next line is critical, do not change it unless you understand what
    // you are doing.
    indexes[parser.n_structural_indexes] = uint32_t(len);
    if (simdjson_unlikely(parser.n_structural_indexes == 0u)) {
      // An unclosed string at the very end of the stream is tolerated: users
      // often load data in bulk and want trailing garbage to be ignored.
      return EMPTY;
    }
  }

  checker.check_eof();
  return checker.errors();
}
|
|
|
|
} // namespace stage1
|
|
} // unnamed namespace
|
|
} // namespace lsx
|
|
} // namespace simdjson
|
|
|
|
// Clear CUSTOM_BIT_INDEXER so other implementations can set it if they need to.
|
|
#undef SIMDJSON_GENERIC_JSON_STRUCTURAL_INDEXER_CUSTOM_BIT_INDEXER
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE1_JSON_STRUCTURAL_INDEXER_H
|
|
/* end file generic/stage1/json_structural_indexer.h for lsx */
|
|
/* including generic/stage1/utf8_validator.h for lsx: #include <generic/stage1/utf8_validator.h> */
|
|
/* begin file generic/stage1/utf8_validator.h for lsx */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_UTF8_VALIDATOR_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_UTF8_VALIDATOR_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/buf_block_reader.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/utf8_lookup4_algorithm.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace lsx {
|
|
namespace {
|
|
namespace stage1 {
|
|
|
|
/**
|
|
* Validates that the string is actual UTF-8.
|
|
*/
|
|
template<class checker>
|
|
bool generic_validate_utf8(const uint8_t * input, size_t length) {
|
|
checker c{};
|
|
buf_block_reader<64> reader(input, length);
|
|
while (reader.has_full_block()) {
|
|
simd::simd8x64<uint8_t> in(reader.full_block());
|
|
c.check_next_input(in);
|
|
reader.advance();
|
|
}
|
|
uint8_t block[64]{};
|
|
reader.get_remainder(block);
|
|
simd::simd8x64<uint8_t> in(block);
|
|
c.check_next_input(in);
|
|
reader.advance();
|
|
c.check_eof();
|
|
return c.errors() == error_code::SUCCESS;
|
|
}
|
|
|
|
bool generic_validate_utf8(const char * input, size_t length) {
|
|
return generic_validate_utf8<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
|
|
}
|
|
|
|
} // namespace stage1
|
|
} // unnamed namespace
|
|
} // namespace lsx
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE1_UTF8_VALIDATOR_H
|
|
/* end file generic/stage1/utf8_validator.h for lsx */
|
|
/* end file generic/stage1/amalgamated.h for lsx */
|
|
/* including generic/stage2/amalgamated.h for lsx: #include <generic/stage2/amalgamated.h> */
|
|
/* begin file generic/stage2/amalgamated.h for lsx */
|
|
// Stuff other things depend on
|
|
/* including generic/stage2/base.h for lsx: #include <generic/stage2/base.h> */
|
|
/* begin file generic/stage2/base.h for lsx */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE2_BASE_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE2_BASE_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/base.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace lsx {
|
|
namespace {
|
|
namespace stage2 {
|
|
|
|
// Forward declarations of the stage 2 types, so that the stage 2 headers can
// refer to each other regardless of inclusion order.
class json_iterator;
class structural_iterator;
struct tape_builder;
struct tape_writer;
|
|
|
|
} // namespace stage2
|
|
} // unnamed namespace
|
|
} // namespace lsx
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE2_BASE_H
|
|
/* end file generic/stage2/base.h for lsx */
|
|
/* including generic/stage2/tape_writer.h for lsx: #include <generic/stage2/tape_writer.h> */
|
|
/* begin file generic/stage2/tape_writer.h for lsx */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE2_TAPE_WRITER_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE2_TAPE_WRITER_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/internal/tape_type.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
#include <cstring>
|
|
|
|
namespace simdjson {
|
|
namespace lsx {
|
|
namespace {
|
|
namespace stage2 {
|
|
|
|
struct tape_writer {
|
|
/** The next place to write to tape */
|
|
uint64_t *next_tape_loc;
|
|
|
|
/** Write a signed 64-bit value to tape. */
|
|
simdjson_inline void append_s64(int64_t value) noexcept;
|
|
|
|
/** Write an unsigned 64-bit value to tape. */
|
|
simdjson_inline void append_u64(uint64_t value) noexcept;
|
|
|
|
/** Write a double value to tape. */
|
|
simdjson_inline void append_double(double value) noexcept;
|
|
|
|
/**
|
|
* Append a tape entry (an 8-bit type,and 56 bits worth of value).
|
|
*/
|
|
simdjson_inline void append(uint64_t val, internal::tape_type t) noexcept;
|
|
|
|
/**
|
|
* Skip the current tape entry without writing.
|
|
*
|
|
* Used to skip the start of the container, since we'll come back later to fill it in when the
|
|
* container ends.
|
|
*/
|
|
simdjson_inline void skip() noexcept;
|
|
|
|
/**
|
|
* Skip the number of tape entries necessary to write a large u64 or i64.
|
|
*/
|
|
simdjson_inline void skip_large_integer() noexcept;
|
|
|
|
/**
|
|
* Skip the number of tape entries necessary to write a double.
|
|
*/
|
|
simdjson_inline void skip_double() noexcept;
|
|
|
|
/**
|
|
* Write a value to a known location on tape.
|
|
*
|
|
* Used to go back and write out the start of a container after the container ends.
|
|
*/
|
|
simdjson_inline static void write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept;
|
|
|
|
private:
|
|
/**
|
|
* Append both the tape entry, and a supplementary value following it. Used for types that need
|
|
* all 64 bits, such as double and uint64_t.
|
|
*/
|
|
template<typename T>
|
|
simdjson_inline void append2(uint64_t val, T val2, internal::tape_type t) noexcept;
|
|
}; // struct tape_writer
|
|
|
|
/** Write a signed 64-bit value to tape: an INT64 entry, then the raw value. */
simdjson_inline void tape_writer::append_s64(int64_t value) noexcept {
  const uint64_t no_payload = 0; // the entry itself carries no payload
  append2(no_payload, value, internal::tape_type::INT64);
}
|
|
|
|
/** Write an unsigned 64-bit value to tape: a UINT64 entry, then the raw value. */
simdjson_inline void tape_writer::append_u64(uint64_t value) noexcept {
  // Same two-slot layout as append_s64 and append_double.
  append2(0, value, internal::tape_type::UINT64);
}
|
|
|
|
/** Write a double value to tape. */
|
|
simdjson_inline void tape_writer::append_double(double value) noexcept {
|
|
append2(0, value, internal::tape_type::DOUBLE);
|
|
}
|
|
|
|
/** Leave one tape slot unwritten and move past it. */
simdjson_inline void tape_writer::skip() noexcept {
  ++next_tape_loc;
}
|
|
|
|
/** Move past the two tape slots a large integer occupies (entry plus value). */
simdjson_inline void tape_writer::skip_large_integer() noexcept {
  next_tape_loc = next_tape_loc + 2;
}
|
|
|
|
/** Move past the two tape slots a double occupies (entry plus value). */
simdjson_inline void tape_writer::skip_double() noexcept {
  next_tape_loc = next_tape_loc + 2;
}
|
|
|
|
/**
 * Append one tape entry: the type tag goes into the top 8 bits and is OR-ed
 * with `val`; the write position then moves to the next slot.
 */
simdjson_inline void tape_writer::append(uint64_t val, internal::tape_type t) noexcept {
  const uint64_t type_tag = (uint64_t(char(t))) << 56;
  *next_tape_loc = val | type_tag;
  ++next_tape_loc;
}
|
|
|
|
template<typename T>
|
|
simdjson_inline void tape_writer::append2(uint64_t val, T val2, internal::tape_type t) noexcept {
|
|
append(val, t);
|
|
static_assert(sizeof(val2) == sizeof(*next_tape_loc), "Type is not 64 bits!");
|
|
memcpy(next_tape_loc, &val2, sizeof(val2));
|
|
next_tape_loc++;
|
|
}
|
|
|
|
/**
 * Store an entry at a given tape location: the type tag goes into the top
 * 8 bits and is OR-ed with `val`. The write position is not touched.
 */
simdjson_inline void tape_writer::write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept {
  const uint64_t type_tag = (uint64_t(char(t))) << 56;
  tape_loc = val | type_tag;
}
|
|
|
|
} // namespace stage2
|
|
} // unnamed namespace
|
|
} // namespace lsx
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE2_TAPE_WRITER_H
|
|
/* end file generic/stage2/tape_writer.h for lsx */
|
|
/* including generic/stage2/logger.h for lsx: #include <generic/stage2/logger.h> */
|
|
/* begin file generic/stage2/logger.h for lsx */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE2_LOGGER_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE2_LOGGER_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/base.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
#include <cstring>
|
|
|
|
|
|
// This is for an internal-only stage 2 specific logger.
|
|
// Set LOG_ENABLED = true to log what stage 2 is doing!
|
|
namespace simdjson {
|
|
namespace lsx {
|
|
namespace {
|
|
namespace logger {
|
|
|
|
static constexpr const char * DASHES = "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------";
|
|
|
|
// Logging is compiled in only when SIMDJSON_VERBOSE_LOGGING is set.
#if SIMDJSON_VERBOSE_LOGGING
static constexpr const bool LOG_ENABLED = true;
#else
static constexpr const bool LOG_ENABLED = false;
#endif

// Column widths of the log table.
static constexpr const int LOG_EVENT_LEN = 20;        // "Event" column
static constexpr const int LOG_BUFFER_LEN = 30;       // "Buffer" column
static constexpr const int LOG_SMALL_BUFFER_LEN = 10; // "Next" column
static constexpr const int LOG_INDEX_LEN = 5;         // structural index column

// Current nesting depth, used to indent event titles.
static int log_depth; // Not threadsafe. Log only.
|
|
|
|
// Helper to turn unprintable or newline characters into spaces
|
|
static simdjson_inline char printable_char(char c) {
|
|
if (c >= 0x20) {
|
|
return c;
|
|
} else {
|
|
return ' ';
|
|
}
|
|
}
|
|
|
|
// Print the header and set up log_start
|
|
static simdjson_inline void log_start() {
|
|
if (LOG_ENABLED) {
|
|
log_depth = 0;
|
|
printf("\n");
|
|
printf("| %-*s | %-*s | %-*s | %-*s | Detail |\n", LOG_EVENT_LEN, "Event", LOG_BUFFER_LEN, "Buffer", LOG_SMALL_BUFFER_LEN, "Next", 5, "Next#");
|
|
printf("|%.*s|%.*s|%.*s|%.*s|--------|\n", LOG_EVENT_LEN+2, DASHES, LOG_BUFFER_LEN+2, DASHES, LOG_SMALL_BUFFER_LEN+2, DASHES, 5+2, DASHES);
|
|
}
|
|
}
|
|
|
|
// Print a free-form message on its own line. Does nothing unless LOG_ENABLED
// is set.
simdjson_unused static simdjson_inline void log_string(const char *message) {
  if (LOG_ENABLED) { printf("%s\n", message); }
}
|
|
|
|
// Logs a single line from the stage 2 DOM parser
|
|
template<typename S>
|
|
static simdjson_inline void log_line(S &structurals, const char *title_prefix, const char *title, const char *detail) {
|
|
if (LOG_ENABLED) {
|
|
printf("| %*s%s%-*s ", log_depth*2, "", title_prefix, LOG_EVENT_LEN - log_depth*2 - int(strlen(title_prefix)), title);
|
|
auto current_index = structurals.at_beginning() ? nullptr : structurals.next_structural-1;
|
|
auto next_index = structurals.next_structural;
|
|
auto current = current_index ? &structurals.buf[*current_index] : reinterpret_cast<const uint8_t*>(" ");
|
|
auto next = &structurals.buf[*next_index];
|
|
{
|
|
// Print the next N characters in the buffer.
|
|
printf("| ");
|
|
// Otherwise, print the characters starting from the buffer position.
|
|
// Print spaces for unprintable or newline characters.
|
|
for (int i=0;i<LOG_BUFFER_LEN;i++) {
|
|
printf("%c", printable_char(current[i]));
|
|
}
|
|
printf(" ");
|
|
// Print the next N characters in the buffer.
|
|
printf("| ");
|
|
// Otherwise, print the characters starting from the buffer position.
|
|
// Print spaces for unprintable or newline characters.
|
|
for (int i=0;i<LOG_SMALL_BUFFER_LEN;i++) {
|
|
printf("%c", printable_char(next[i]));
|
|
}
|
|
printf(" ");
|
|
}
|
|
if (current_index) {
|
|
printf("| %*u ", LOG_INDEX_LEN, *current_index);
|
|
} else {
|
|
printf("| %-*s ", LOG_INDEX_LEN, "");
|
|
}
|
|
// printf("| %*u ", LOG_INDEX_LEN, structurals.next_tape_index());
|
|
printf("| %-s ", detail);
|
|
printf("|\n");
|
|
}
|
|
}
|
|
|
|
} // namespace logger
|
|
} // unnamed namespace
|
|
} // namespace lsx
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE2_LOGGER_H
|
|
/* end file generic/stage2/logger.h for lsx */
|
|
|
|
// All other declarations
|
|
/* including generic/stage2/json_iterator.h for lsx: #include <generic/stage2/json_iterator.h> */
|
|
/* begin file generic/stage2/json_iterator.h for lsx */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE2_JSON_ITERATOR_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE2_JSON_ITERATOR_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/logger.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/generic/dom_parser_implementation.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace lsx {
|
|
namespace {
|
|
namespace stage2 {
|
|
|
|
class json_iterator {
|
|
public:
|
|
const uint8_t* const buf;
|
|
uint32_t *next_structural;
|
|
dom_parser_implementation &dom_parser;
|
|
uint32_t depth{0};
|
|
|
|
/**
|
|
* Walk the JSON document.
|
|
*
|
|
* The visitor receives callbacks when values are encountered. All callbacks pass the iterator as
|
|
* the first parameter; some callbacks have other parameters as well:
|
|
*
|
|
* - visit_document_start() - at the beginning.
|
|
* - visit_document_end() - at the end (if things were successful).
|
|
*
|
|
* - visit_array_start() - at the start `[` of a non-empty array.
|
|
* - visit_array_end() - at the end `]` of a non-empty array.
|
|
* - visit_empty_array() - when an empty array is encountered.
|
|
*
|
|
* - visit_object_end() - at the start `]` of a non-empty object.
|
|
* - visit_object_start() - at the end `]` of a non-empty object.
|
|
* - visit_empty_object() - when an empty object is encountered.
|
|
* - visit_key(const uint8_t *key) - when a key in an object field is encountered. key is
|
|
* guaranteed to point at the first quote of the string (`"key"`).
|
|
* - visit_primitive(const uint8_t *value) - when a value is a string, number, boolean or null.
|
|
* - visit_root_primitive(iter, uint8_t *value) - when the top-level value is a string, number, boolean or null.
|
|
*
|
|
* - increment_count(iter) - each time a value is found in an array or object.
|
|
*/
|
|
template<bool STREAMING, typename V>
|
|
simdjson_warn_unused simdjson_inline error_code walk_document(V &visitor) noexcept;
|
|
|
|
/**
|
|
* Create an iterator capable of walking a JSON document.
|
|
*
|
|
* The document must have already passed through stage 1.
|
|
*/
|
|
simdjson_inline json_iterator(dom_parser_implementation &_dom_parser, size_t start_structural_index);
|
|
|
|
/**
|
|
* Look at the next token.
|
|
*
|
|
* Tokens can be strings, numbers, booleans, null, or operators (`[{]},:`)).
|
|
*
|
|
* They may include invalid JSON as well (such as `1.2.3` or `ture`).
|
|
*/
|
|
simdjson_inline const uint8_t *peek() const noexcept;
|
|
/**
|
|
* Advance to the next token.
|
|
*
|
|
* Tokens can be strings, numbers, booleans, null, or operators (`[{]},:`)).
|
|
*
|
|
* They may include invalid JSON as well (such as `1.2.3` or `ture`).
|
|
*/
|
|
simdjson_inline const uint8_t *advance() noexcept;
|
|
/**
|
|
* Get the remaining length of the document, from the start of the current token.
|
|
*/
|
|
simdjson_inline size_t remaining_len() const noexcept;
|
|
/**
|
|
* Check if we are at the end of the document.
|
|
*
|
|
* If this is true, there are no more tokens.
|
|
*/
|
|
simdjson_inline bool at_eof() const noexcept;
|
|
/**
|
|
* Check if we are at the beginning of the document.
|
|
*/
|
|
simdjson_inline bool at_beginning() const noexcept;
|
|
simdjson_inline uint8_t last_structural() const noexcept;
|
|
|
|
/**
|
|
* Log that a value has been found.
|
|
*
|
|
* Set LOG_ENABLED=true in logger.h to see logging.
|
|
*/
|
|
simdjson_inline void log_value(const char *type) const noexcept;
|
|
/**
|
|
* Log the start of a multipart value.
|
|
*
|
|
* Set LOG_ENABLED=true in logger.h to see logging.
|
|
*/
|
|
simdjson_inline void log_start_value(const char *type) const noexcept;
|
|
/**
|
|
* Log the end of a multipart value.
|
|
*
|
|
* Set LOG_ENABLED=true in logger.h to see logging.
|
|
*/
|
|
simdjson_inline void log_end_value(const char *type) const noexcept;
|
|
/**
|
|
* Log an error.
|
|
*
|
|
* Set LOG_ENABLED=true in logger.h to see logging.
|
|
*/
|
|
simdjson_inline void log_error(const char *error) const noexcept;
|
|
|
|
template<typename V>
|
|
simdjson_warn_unused simdjson_inline error_code visit_root_primitive(V &visitor, const uint8_t *value) noexcept;
|
|
template<typename V>
|
|
simdjson_warn_unused simdjson_inline error_code visit_primitive(V &visitor, const uint8_t *value) noexcept;
|
|
};
|
|
|
|
template<bool STREAMING, typename V>
|
|
simdjson_warn_unused simdjson_inline error_code json_iterator::walk_document(V &visitor) noexcept {
|
|
logger::log_start();
|
|
|
|
//
|
|
// Start the document
|
|
//
|
|
if (at_eof()) { return EMPTY; }
|
|
log_start_value("document");
|
|
SIMDJSON_TRY( visitor.visit_document_start(*this) );
|
|
|
|
//
|
|
// Read first value
|
|
//
|
|
{
|
|
auto value = advance();
|
|
|
|
// Make sure the outer object or array is closed before continuing; otherwise, there are ways we
|
|
// could get into memory corruption. See https://github.com/simdjson/simdjson/issues/906
|
|
if (!STREAMING) {
|
|
switch (*value) {
|
|
case '{': if (last_structural() != '}') { log_value("starting brace unmatched"); return TAPE_ERROR; }; break;
|
|
case '[': if (last_structural() != ']') { log_value("starting bracket unmatched"); return TAPE_ERROR; }; break;
|
|
}
|
|
}
|
|
|
|
switch (*value) {
|
|
case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin;
|
|
case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin;
|
|
default: SIMDJSON_TRY( visitor.visit_root_primitive(*this, value) ); break;
|
|
}
|
|
}
|
|
goto document_end;
|
|
|
|
//
|
|
// Object parser states
|
|
//
|
|
object_begin:
|
|
log_start_value("object");
|
|
depth++;
|
|
if (depth >= dom_parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; }
|
|
dom_parser.is_array[depth] = false;
|
|
SIMDJSON_TRY( visitor.visit_object_start(*this) );
|
|
|
|
{
|
|
auto key = advance();
|
|
if (*key != '"') { log_error("Object does not start with a key"); return TAPE_ERROR; }
|
|
SIMDJSON_TRY( visitor.increment_count(*this) );
|
|
SIMDJSON_TRY( visitor.visit_key(*this, key) );
|
|
}
|
|
|
|
object_field:
|
|
if (simdjson_unlikely( *advance() != ':' )) { log_error("Missing colon after key in object"); return TAPE_ERROR; }
|
|
{
|
|
auto value = advance();
|
|
switch (*value) {
|
|
case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin;
|
|
case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin;
|
|
default: SIMDJSON_TRY( visitor.visit_primitive(*this, value) ); break;
|
|
}
|
|
}
|
|
|
|
object_continue:
|
|
switch (*advance()) {
|
|
case ',':
|
|
SIMDJSON_TRY( visitor.increment_count(*this) );
|
|
{
|
|
auto key = advance();
|
|
if (simdjson_unlikely( *key != '"' )) { log_error("Key string missing at beginning of field in object"); return TAPE_ERROR; }
|
|
SIMDJSON_TRY( visitor.visit_key(*this, key) );
|
|
}
|
|
goto object_field;
|
|
case '}': log_end_value("object"); SIMDJSON_TRY( visitor.visit_object_end(*this) ); goto scope_end;
|
|
default: log_error("No comma between object fields"); return TAPE_ERROR;
|
|
}
|
|
|
|
scope_end:
|
|
depth--;
|
|
if (depth == 0) { goto document_end; }
|
|
if (dom_parser.is_array[depth]) { goto array_continue; }
|
|
goto object_continue;
|
|
|
|
//
|
|
// Array parser states
|
|
//
|
|
array_begin:
|
|
log_start_value("array");
|
|
depth++;
|
|
if (depth >= dom_parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; }
|
|
dom_parser.is_array[depth] = true;
|
|
SIMDJSON_TRY( visitor.visit_array_start(*this) );
|
|
SIMDJSON_TRY( visitor.increment_count(*this) );
|
|
|
|
array_value:
|
|
{
|
|
auto value = advance();
|
|
switch (*value) {
|
|
case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin;
|
|
case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin;
|
|
default: SIMDJSON_TRY( visitor.visit_primitive(*this, value) ); break;
|
|
}
|
|
}
|
|
|
|
array_continue:
|
|
switch (*advance()) {
|
|
case ',': SIMDJSON_TRY( visitor.increment_count(*this) ); goto array_value;
|
|
case ']': log_end_value("array"); SIMDJSON_TRY( visitor.visit_array_end(*this) ); goto scope_end;
|
|
default: log_error("Missing comma between array values"); return TAPE_ERROR;
|
|
}
|
|
|
|
document_end:
|
|
log_end_value("document");
|
|
SIMDJSON_TRY( visitor.visit_document_end(*this) );
|
|
|
|
dom_parser.next_structural_index = uint32_t(next_structural - &dom_parser.structural_indexes[0]);
|
|
|
|
// If we didn't make it to the end, it's an error
|
|
if ( !STREAMING && dom_parser.next_structural_index != dom_parser.n_structural_indexes ) {
|
|
log_error("More than one JSON value at the root of the document, or extra characters at the end of the JSON!");
|
|
return TAPE_ERROR;
|
|
}
|
|
|
|
return SUCCESS;
|
|
|
|
} // walk_document()
|
|
|
|
simdjson_inline json_iterator::json_iterator(dom_parser_implementation &_dom_parser, size_t start_structural_index)
|
|
: buf{_dom_parser.buf},
|
|
next_structural{&_dom_parser.structural_indexes[start_structural_index]},
|
|
dom_parser{_dom_parser} {
|
|
}
|
|
|
|
/** Pointer to the first byte of the next token; the cursor is not moved. */
simdjson_inline const uint8_t *json_iterator::peek() const noexcept {
  const uint32_t next_offset = *next_structural;
  return buf + next_offset;
}
|
|
/** Pointer to the first byte of the next token; the cursor moves past that token. */
simdjson_inline const uint8_t *json_iterator::advance() noexcept {
  const uint32_t token_offset = *next_structural;
  ++next_structural;
  return buf + token_offset;
}
|
|
/**
 * Number of bytes between the most recently consumed token (the structural just
 * before the cursor) and dom_parser.len.
 */
simdjson_inline size_t json_iterator::remaining_len() const noexcept {
  const uint32_t current_offset = next_structural[-1];
  return dom_parser.len - current_offset;
}
|
|
|
|
/** True when the cursor sits one past the last structural index (no tokens left). */
simdjson_inline bool json_iterator::at_eof() const noexcept {
  const uint32_t *past_last = dom_parser.structural_indexes.get() + dom_parser.n_structural_indexes;
  return next_structural == past_last;
}
|
|
/** True when the cursor sits on the very first structural index. */
simdjson_inline bool json_iterator::at_beginning() const noexcept {
  const uint32_t *first = &dom_parser.structural_indexes[0];
  return next_structural == first;
}
|
|
/** Byte of the input located at the last recorded structural index. */
simdjson_inline uint8_t json_iterator::last_structural() const noexcept {
  const auto last_slot = dom_parser.n_structural_indexes - 1;
  const uint32_t last_offset = dom_parser.structural_indexes[last_slot];
  return buf[last_offset];
}
|
|
|
|
/** Emit a log line for a value of the given type (no prefix, no detail text). */
simdjson_inline void json_iterator::log_value(const char *type) const noexcept {
  const char *const no_prefix = "";
  const char *const no_detail = "";
  logger::log_line(*this, no_prefix, type, no_detail);
}
|
|
|
|
/**
 * Emit a "+" log line for the start of a multipart value, then increase the
 * logger depth when logging is enabled.
 */
simdjson_inline void json_iterator::log_start_value(const char *type) const noexcept {
  logger::log_line(*this, "+", type, "");
  if (!logger::LOG_ENABLED) { return; }
  logger::log_depth++;
}
|
|
|
|
/**
 * Decrease the logger depth when logging is enabled, then emit a "-" log line
 * for the end of a multipart value.
 */
simdjson_inline void json_iterator::log_end_value(const char *type) const noexcept {
  if (logger::LOG_ENABLED) {
    logger::log_depth--;
  }
  const char *const closing_prefix = "-";
  logger::log_line(*this, closing_prefix, type, "");
}
|
|
|
|
/** Emit a log line titled "ERROR" carrying the given message as detail. */
simdjson_inline void json_iterator::log_error(const char *error) const noexcept {
  const char *const title = "ERROR";
  logger::log_line(*this, "", title, error);
}
|
|
|
|
template<typename V>
|
|
simdjson_warn_unused simdjson_inline error_code json_iterator::visit_root_primitive(V &visitor, const uint8_t *value) noexcept {
|
|
switch (*value) {
|
|
case '"': return visitor.visit_root_string(*this, value);
|
|
case 't': return visitor.visit_root_true_atom(*this, value);
|
|
case 'f': return visitor.visit_root_false_atom(*this, value);
|
|
case 'n': return visitor.visit_root_null_atom(*this, value);
|
|
case '-':
|
|
case '0': case '1': case '2': case '3': case '4':
|
|
case '5': case '6': case '7': case '8': case '9':
|
|
return visitor.visit_root_number(*this, value);
|
|
default:
|
|
log_error("Document starts with a non-value character");
|
|
return TAPE_ERROR;
|
|
}
|
|
}
|
|
template<typename V>
|
|
simdjson_warn_unused simdjson_inline error_code json_iterator::visit_primitive(V &visitor, const uint8_t *value) noexcept {
|
|
// Use the fact that most scalars are going to be either strings or numbers.
|
|
if(*value == '"') {
|
|
return visitor.visit_string(*this, value);
|
|
} else if (((*value - '0') < 10) || (*value == '-')) {
|
|
return visitor.visit_number(*this, value);
|
|
}
|
|
// true, false, null are uncommon.
|
|
switch (*value) {
|
|
case 't': return visitor.visit_true_atom(*this, value);
|
|
case 'f': return visitor.visit_false_atom(*this, value);
|
|
case 'n': return visitor.visit_null_atom(*this, value);
|
|
default:
|
|
log_error("Non-value found when value was expected!");
|
|
return TAPE_ERROR;
|
|
}
|
|
}
|
|
|
|
} // namespace stage2
|
|
} // unnamed namespace
|
|
} // namespace lsx
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE2_JSON_ITERATOR_H
|
|
/* end file generic/stage2/json_iterator.h for lsx */
|
|
/* including generic/stage2/stringparsing.h for lsx: #include <generic/stage2/stringparsing.h> */
|
|
/* begin file generic/stage2/stringparsing.h for lsx */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE2_STRINGPARSING_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE2_STRINGPARSING_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/generic/jsoncharutils.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
// This file contains the common code every implementation uses
|
|
// It is intended to be included multiple times and compiled multiple times
|
|
|
|
namespace simdjson {
|
|
namespace lsx {
|
|
namespace {
|
|
/// @private
|
|
namespace stringparsing {
|
|
|
|
// begin copypasta
|
|
// Lookup table for single-character escapes, indexed by the byte that follows
// the backslash. A zero entry means "not a valid simple escape".
//
//   \" -> 0x22   \/ -> 0x2f   \\ -> 0x5c
//   \b -> 0x08 (backspace)    \f -> 0x0c (form feed)   \n -> 0x0a (newline)
//   \r -> 0x0d (carriage return)                       \t -> 0x09 (horizontal tab)
//
// 'u' is deliberately absent (entry is 0): \uXXXX escapes need dedicated handling.
static const uint8_t escape_map[256] = {
    /* 0x00 */ 0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0,
    /* 0x10 */ 0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0,
    /* 0x20 */ 0, 0, 0x22, 0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0x2f,
    /* 0x30 */ 0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0,

    /* 0x40 */ 0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0,
    /* 0x50 */ 0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0x5c, 0, 0,    0,
    /* 0x60 */ 0, 0, 0x08, 0, 0,    0, 0x0c, 0, 0, 0, 0, 0, 0,    0, 0x0a, 0,
    /* 0x70 */ 0, 0, 0x0d, 0, 0x09, 0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0,

    /* 0x80 */ 0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0,
    /* 0x90 */ 0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0,
    /* 0xa0 */ 0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0,
    /* 0xb0 */ 0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0,

    /* 0xc0 */ 0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0,
    /* 0xd0 */ 0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0,
    /* 0xe0 */ 0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0,
    /* 0xf0 */ 0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0,
};
|
|
|
|
// handle a unicode codepoint
|
|
// write appropriate values into dest
|
|
// src will advance 6 bytes or 12 bytes
|
|
// dest will advance a variable amount (return via pointer)
|
|
// return true if the unicode codepoint was valid
|
|
// We work in little-endian then swap at write time
|
|
simdjson_warn_unused
|
|
simdjson_inline bool handle_unicode_codepoint(const uint8_t **src_ptr,
|
|
uint8_t **dst_ptr, bool allow_replacement) {
|
|
// Use the default Unicode Character 'REPLACEMENT CHARACTER' (U+FFFD)
|
|
constexpr uint32_t substitution_code_point = 0xfffd;
|
|
// jsoncharutils::hex_to_u32_nocheck fills high 16 bits of the return value with 1s if the
|
|
// conversion is not valid; we defer the check for this to inside the
|
|
// multilingual plane check.
|
|
uint32_t code_point = jsoncharutils::hex_to_u32_nocheck(*src_ptr + 2);
|
|
*src_ptr += 6;
|
|
|
|
// If we found a high surrogate, we must
|
|
// check for low surrogate for characters
|
|
// outside the Basic
|
|
// Multilingual Plane.
|
|
if (code_point >= 0xd800 && code_point < 0xdc00) {
|
|
const uint8_t *src_data = *src_ptr;
|
|
/* Compiler optimizations convert this to a single 16-bit load and compare on most platforms */
|
|
if (((src_data[0] << 8) | src_data[1]) != ((static_cast<uint8_t> ('\\') << 8) | static_cast<uint8_t> ('u'))) {
|
|
if(!allow_replacement) { return false; }
|
|
code_point = substitution_code_point;
|
|
} else {
|
|
uint32_t code_point_2 = jsoncharutils::hex_to_u32_nocheck(src_data + 2);
|
|
|
|
// We have already checked that the high surrogate is valid and
|
|
// (code_point - 0xd800) < 1024.
|
|
//
|
|
// Check that code_point_2 is in the range 0xdc00..0xdfff
|
|
// and that code_point_2 was parsed from valid hex.
|
|
uint32_t low_bit = code_point_2 - 0xdc00;
|
|
if (low_bit >> 10) {
|
|
if(!allow_replacement) { return false; }
|
|
code_point = substitution_code_point;
|
|
} else {
|
|
code_point = (((code_point - 0xd800) << 10) | low_bit) + 0x10000;
|
|
*src_ptr += 6;
|
|
}
|
|
|
|
}
|
|
} else if (code_point >= 0xdc00 && code_point <= 0xdfff) {
|
|
// If we encounter a low surrogate (not preceded by a high surrogate)
|
|
// then we have an error.
|
|
if(!allow_replacement) { return false; }
|
|
code_point = substitution_code_point;
|
|
}
|
|
size_t offset = jsoncharutils::codepoint_to_utf8(code_point, *dst_ptr);
|
|
*dst_ptr += offset;
|
|
return offset > 0;
|
|
}
|
|
|
|
|
|
// handle a unicode codepoint using the wobbly convention
|
|
// https://simonsapin.github.io/wtf-8/
|
|
// write appropriate values into dest
|
|
// src will advance 6 bytes or 12 bytes
|
|
// dest will advance a variable amount (return via pointer)
|
|
// return true if the unicode codepoint was valid
|
|
// We work in little-endian then swap at write time
|
|
simdjson_warn_unused
|
|
simdjson_inline bool handle_unicode_codepoint_wobbly(const uint8_t **src_ptr,
|
|
uint8_t **dst_ptr) {
|
|
// It is not ideal that this function is nearly identical to handle_unicode_codepoint.
|
|
//
|
|
// jsoncharutils::hex_to_u32_nocheck fills high 16 bits of the return value with 1s if the
|
|
// conversion is not valid; we defer the check for this to inside the
|
|
// multilingual plane check.
|
|
uint32_t code_point = jsoncharutils::hex_to_u32_nocheck(*src_ptr + 2);
|
|
*src_ptr += 6;
|
|
// If we found a high surrogate, we must
|
|
// check for low surrogate for characters
|
|
// outside the Basic
|
|
// Multilingual Plane.
|
|
if (code_point >= 0xd800 && code_point < 0xdc00) {
|
|
const uint8_t *src_data = *src_ptr;
|
|
/* Compiler optimizations convert this to a single 16-bit load and compare on most platforms */
|
|
if (((src_data[0] << 8) | src_data[1]) == ((static_cast<uint8_t> ('\\') << 8) | static_cast<uint8_t> ('u'))) {
|
|
uint32_t code_point_2 = jsoncharutils::hex_to_u32_nocheck(src_data + 2);
|
|
uint32_t low_bit = code_point_2 - 0xdc00;
|
|
if ((low_bit >> 10) == 0) {
|
|
code_point =
|
|
(((code_point - 0xd800) << 10) | low_bit) + 0x10000;
|
|
*src_ptr += 6;
|
|
}
|
|
}
|
|
}
|
|
|
|
size_t offset = jsoncharutils::codepoint_to_utf8(code_point, *dst_ptr);
|
|
*dst_ptr += offset;
|
|
return offset > 0;
|
|
}
|
|
|
|
|
|
/**
|
|
* Unescape a valid UTF-8 string from src to dst, stopping at a final unescaped quote. There
|
|
* must be an unescaped quote terminating the string. It returns the final output
|
|
* position as pointer. In case of error (e.g., the string has bad escaped codes),
|
|
* then null_nullptrptr is returned. It is assumed that the output buffer is large
|
|
* enough. E.g., if src points at 'joe"', then dst needs to have four free bytes +
|
|
* SIMDJSON_PADDING bytes.
|
|
*/
|
|
simdjson_warn_unused simdjson_inline uint8_t *parse_string(const uint8_t *src, uint8_t *dst, bool allow_replacement) {
|
|
while (1) {
|
|
// Copy the next n bytes, and find the backslash and quote in them.
|
|
auto bs_quote = backslash_and_quote::copy_and_find(src, dst);
|
|
// If the next thing is the end quote, copy and return
|
|
if (bs_quote.has_quote_first()) {
|
|
// we encountered quotes first. Move dst to point to quotes and exit
|
|
return dst + bs_quote.quote_index();
|
|
}
|
|
if (bs_quote.has_backslash()) {
|
|
/* find out where the backspace is */
|
|
auto bs_dist = bs_quote.backslash_index();
|
|
uint8_t escape_char = src[bs_dist + 1];
|
|
/* we encountered backslash first. Handle backslash */
|
|
if (escape_char == 'u') {
|
|
/* move src/dst up to the start; they will be further adjusted
|
|
within the unicode codepoint handling code. */
|
|
src += bs_dist;
|
|
dst += bs_dist;
|
|
if (!handle_unicode_codepoint(&src, &dst, allow_replacement)) {
|
|
return nullptr;
|
|
}
|
|
} else {
|
|
/* simple 1:1 conversion. Will eat bs_dist+2 characters in input and
|
|
* write bs_dist+1 characters to output
|
|
* note this may reach beyond the part of the buffer we've actually
|
|
* seen. I think this is ok */
|
|
uint8_t escape_result = escape_map[escape_char];
|
|
if (escape_result == 0u) {
|
|
return nullptr; /* bogus escape value is an error */
|
|
}
|
|
dst[bs_dist] = escape_result;
|
|
src += bs_dist + 2;
|
|
dst += bs_dist + 1;
|
|
}
|
|
} else {
|
|
/* they are the same. Since they can't co-occur, it means we
|
|
* encountered neither. */
|
|
src += backslash_and_quote::BYTES_PROCESSED;
|
|
dst += backslash_and_quote::BYTES_PROCESSED;
|
|
}
|
|
}
|
|
}
|
|
|
|
/**
 * Same as parse_string, except that \uXXXX escapes are decoded with
 * handle_unicode_codepoint_wobbly.
 *
 * @param src first byte of the string content.
 * @param dst where the unescaped bytes are written.
 * @return the final output position, or nullptr in case of error.
 */
simdjson_warn_unused simdjson_inline uint8_t *parse_wobbly_string(const uint8_t *src, uint8_t *dst) {
  for (;;) {
    // Copy the next chunk and learn where the first backslash and quote are in it.
    auto chunk = backslash_and_quote::copy_and_find(src, dst);

    // Closing quote comes first: the string is done, report where it ends in dst.
    if (chunk.has_quote_first()) {
      return dst + chunk.quote_index();
    }

    // Neither a quote nor a backslash in this chunk: it was copied as is, move on.
    if (!chunk.has_backslash()) {
      src += backslash_and_quote::BYTES_PROCESSED;
      dst += backslash_and_quote::BYTES_PROCESSED;
      continue;
    }

    /* A backslash comes first: locate it and look at the escaped character. */
    auto backslash_at = chunk.backslash_index();
    const uint8_t escape_char = src[backslash_at + 1];

    if (escape_char == 'u') {
      /* Move src/dst up to the backslash; the unicode handling code adjusts them further. */
      src += backslash_at;
      dst += backslash_at;
      if (!handle_unicode_codepoint_wobbly(&src, &dst)) {
        return nullptr;
      }
      continue;
    }

    /* Simple 1:1 escape: consumes backslash_at+2 input bytes and produces
     * backslash_at+1 output bytes. Note this may reach beyond the part of the
     * buffer examined so far. */
    const uint8_t unescaped = escape_map[escape_char];
    if (unescaped == 0u) {
      return nullptr; /* bogus escape value is an error */
    }
    dst[backslash_at] = unescaped;
    src += backslash_at + 2;
    dst += backslash_at + 1;
  }
}
|
|
|
|
} // namespace stringparsing
|
|
} // unnamed namespace
|
|
} // namespace lsx
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE2_STRINGPARSING_H
|
|
/* end file generic/stage2/stringparsing.h for lsx */
|
|
/* including generic/stage2/structural_iterator.h for lsx: #include <generic/stage2/structural_iterator.h> */
|
|
/* begin file generic/stage2/structural_iterator.h for lsx */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE2_STRUCTURAL_ITERATOR_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE2_STRUCTURAL_ITERATOR_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/generic/dom_parser_implementation.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace lsx {
|
|
namespace {
|
|
namespace stage2 {
|
|
|
|
class structural_iterator {
|
|
public:
|
|
const uint8_t* const buf;
|
|
uint32_t *next_structural;
|
|
dom_parser_implementation &dom_parser;
|
|
|
|
// Start a structural
|
|
simdjson_inline structural_iterator(dom_parser_implementation &_dom_parser, size_t start_structural_index)
|
|
: buf{_dom_parser.buf},
|
|
next_structural{&_dom_parser.structural_indexes[start_structural_index]},
|
|
dom_parser{_dom_parser} {
|
|
}
|
|
// Get the buffer position of the current structural character
|
|
simdjson_inline const uint8_t* current() {
|
|
return &buf[*(next_structural-1)];
|
|
}
|
|
// Get the current structural character
|
|
simdjson_inline char current_char() {
|
|
return buf[*(next_structural-1)];
|
|
}
|
|
// Get the next structural character without advancing
|
|
simdjson_inline char peek_next_char() {
|
|
return buf[*next_structural];
|
|
}
|
|
simdjson_inline const uint8_t* peek() {
|
|
return &buf[*next_structural];
|
|
}
|
|
simdjson_inline const uint8_t* advance() {
|
|
return &buf[*(next_structural++)];
|
|
}
|
|
simdjson_inline char advance_char() {
|
|
return buf[*(next_structural++)];
|
|
}
|
|
simdjson_inline size_t remaining_len() {
|
|
return dom_parser.len - *(next_structural-1);
|
|
}
|
|
|
|
simdjson_inline bool at_end() {
|
|
return next_structural == &dom_parser.structural_indexes[dom_parser.n_structural_indexes];
|
|
}
|
|
simdjson_inline bool at_beginning() {
|
|
return next_structural == dom_parser.structural_indexes.get();
|
|
}
|
|
};
|
|
|
|
} // namespace stage2
|
|
} // unnamed namespace
|
|
} // namespace lsx
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE2_STRUCTURAL_ITERATOR_H
|
|
/* end file generic/stage2/structural_iterator.h for lsx */
|
|
/* including generic/stage2/tape_builder.h for lsx: #include <generic/stage2/tape_builder.h> */
|
|
/* begin file generic/stage2/tape_builder.h for lsx */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE2_TAPE_BUILDER_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE2_TAPE_BUILDER_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/json_iterator.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/stringparsing.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/tape_writer.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/dom/document.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/generic/atomparsing.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/generic/dom_parser_implementation.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/generic/numberparsing.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
|
|
namespace simdjson {
|
|
namespace lsx {
|
|
namespace {
|
|
namespace stage2 {
|
|
|
|
struct tape_builder {
|
|
template<bool STREAMING>
|
|
simdjson_warn_unused static simdjson_inline error_code parse_document(
|
|
dom_parser_implementation &dom_parser,
|
|
dom::document &doc) noexcept;
|
|
|
|
/** Called when a non-empty document starts. */
|
|
simdjson_warn_unused simdjson_inline error_code visit_document_start(json_iterator &iter) noexcept;
|
|
/** Called when a non-empty document ends without error. */
|
|
simdjson_warn_unused simdjson_inline error_code visit_document_end(json_iterator &iter) noexcept;
|
|
|
|
/** Called when a non-empty array starts. */
|
|
simdjson_warn_unused simdjson_inline error_code visit_array_start(json_iterator &iter) noexcept;
|
|
/** Called when a non-empty array ends. */
|
|
simdjson_warn_unused simdjson_inline error_code visit_array_end(json_iterator &iter) noexcept;
|
|
/** Called when an empty array is found. */
|
|
simdjson_warn_unused simdjson_inline error_code visit_empty_array(json_iterator &iter) noexcept;
|
|
|
|
/** Called when a non-empty object starts. */
|
|
simdjson_warn_unused simdjson_inline error_code visit_object_start(json_iterator &iter) noexcept;
|
|
/**
|
|
* Called when a key in a field is encountered.
|
|
*
|
|
* primitive, visit_object_start, visit_empty_object, visit_array_start, or visit_empty_array
|
|
* will be called after this with the field value.
|
|
*/
|
|
simdjson_warn_unused simdjson_inline error_code visit_key(json_iterator &iter, const uint8_t *key) noexcept;
|
|
/** Called when a non-empty object ends. */
|
|
simdjson_warn_unused simdjson_inline error_code visit_object_end(json_iterator &iter) noexcept;
|
|
/** Called when an empty object is found. */
|
|
simdjson_warn_unused simdjson_inline error_code visit_empty_object(json_iterator &iter) noexcept;
|
|
|
|
/**
|
|
* Called when a string, number, boolean or null is found.
|
|
*/
|
|
simdjson_warn_unused simdjson_inline error_code visit_primitive(json_iterator &iter, const uint8_t *value) noexcept;
|
|
/**
|
|
* Called when a string, number, boolean or null is found at the top level of a document (i.e.
|
|
* when there is no array or object and the entire document is a single string, number, boolean or
|
|
* null.
|
|
*
|
|
* This is separate from primitive() because simdjson's normal primitive parsing routines assume
|
|
* there is at least one more token after the value, which is only true in an array or object.
|
|
*/
|
|
simdjson_warn_unused simdjson_inline error_code visit_root_primitive(json_iterator &iter, const uint8_t *value) noexcept;
|
|
|
|
simdjson_warn_unused simdjson_inline error_code visit_string(json_iterator &iter, const uint8_t *value, bool key = false) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code visit_number(json_iterator &iter, const uint8_t *value) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code visit_true_atom(json_iterator &iter, const uint8_t *value) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code visit_false_atom(json_iterator &iter, const uint8_t *value) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code visit_null_atom(json_iterator &iter, const uint8_t *value) noexcept;
|
|
|
|
simdjson_warn_unused simdjson_inline error_code visit_root_string(json_iterator &iter, const uint8_t *value) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code visit_root_number(json_iterator &iter, const uint8_t *value) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code visit_root_true_atom(json_iterator &iter, const uint8_t *value) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code visit_root_false_atom(json_iterator &iter, const uint8_t *value) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code visit_root_null_atom(json_iterator &iter, const uint8_t *value) noexcept;
|
|
|
|
/** Called each time a new field or element in an array or object is found. */
|
|
simdjson_warn_unused simdjson_inline error_code increment_count(json_iterator &iter) noexcept;
|
|
|
|
/** Next location to write to tape */
|
|
tape_writer tape;
|
|
private:
|
|
/** Next write location in the string buf for stage 2 parsing */
|
|
uint8_t *current_string_buf_loc;
|
|
|
|
simdjson_inline tape_builder(dom::document &doc) noexcept;
|
|
|
|
simdjson_inline uint32_t next_tape_index(json_iterator &iter) const noexcept;
|
|
simdjson_inline void start_container(json_iterator &iter) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code end_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code empty_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept;
|
|
simdjson_inline uint8_t *on_start_string(json_iterator &iter) noexcept;
|
|
simdjson_inline void on_end_string(uint8_t *dst) noexcept;
|
|
}; // struct tape_builder
|
|
|
|
template<bool STREAMING>
|
|
simdjson_warn_unused simdjson_inline error_code tape_builder::parse_document(
|
|
dom_parser_implementation &dom_parser,
|
|
dom::document &doc) noexcept {
|
|
dom_parser.doc = &doc;
|
|
json_iterator iter(dom_parser, STREAMING ? dom_parser.next_structural_index : 0);
|
|
tape_builder builder(doc);
|
|
return iter.walk_document<STREAMING>(builder);
|
|
}
|
|
|
|
/** Forwards a root-level primitive to the iterator, with this builder as the visitor. */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_primitive(json_iterator &iter, const uint8_t *value) noexcept {
  const error_code outcome = iter.visit_root_primitive(*this, value);
  return outcome;
}
|
|
/** Forwards a nested primitive to the iterator, with this builder as the visitor. */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_primitive(json_iterator &iter, const uint8_t *value) noexcept {
  const error_code outcome = iter.visit_primitive(*this, value);
  return outcome;
}
|
|
/** Emits the START_OBJECT / END_OBJECT tape pair for an object with no members. */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_empty_object(json_iterator &iter) noexcept {
  constexpr internal::tape_type opening = internal::tape_type::START_OBJECT;
  constexpr internal::tape_type closing = internal::tape_type::END_OBJECT;
  return empty_container(iter, opening, closing);
}
|
|
/** Emits the START_ARRAY / END_ARRAY tape pair for an array with no elements. */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_empty_array(json_iterator &iter) noexcept {
  constexpr internal::tape_type opening = internal::tape_type::START_ARRAY;
  constexpr internal::tape_type closing = internal::tape_type::END_ARRAY;
  return empty_container(iter, opening, closing);
}
|
|
|
|
/** Opens the document scope via start_container(); always reports SUCCESS. */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_document_start(json_iterator &iter) noexcept {
  // The root is tracked like any other container: its tape slot is reserved now
  // and filled in by visit_document_end().
  start_container(iter);
  return SUCCESS;
}
|
|
/** Opens an object scope via start_container(); always reports SUCCESS. */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_object_start(json_iterator &iter) noexcept {
  // Reserve the object's start slot; end_container() writes it later.
  start_container(iter);
  return SUCCESS;
}
|
|
/** Opens an array scope via start_container(); always reports SUCCESS. */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_array_start(json_iterator &iter) noexcept {
  // Reserve the array's start slot; end_container() writes it later.
  start_container(iter);
  return SUCCESS;
}
|
|
|
|
/** Closes the current object: delegates to end_container() with the object tape types. */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_object_end(json_iterator &iter) noexcept {
  constexpr internal::tape_type opening = internal::tape_type::START_OBJECT;
  constexpr internal::tape_type closing = internal::tape_type::END_OBJECT;
  return end_container(iter, opening, closing);
}
|
|
/** Closes the current array: delegates to end_container() with the array tape types. */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_array_end(json_iterator &iter) noexcept {
  constexpr internal::tape_type opening = internal::tape_type::START_ARRAY;
  constexpr internal::tape_type closing = internal::tape_type::END_ARRAY;
  return end_container(iter, opening, closing);
}
|
|
/**
 * Finishes the document: appends the closing ROOT element (pointing at tape
 * index 0) and then overwrites tape slot 0 with a ROOT element that carries
 * the tape index following the closing element.
 */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_document_end(json_iterator &iter) noexcept {
  constexpr uint32_t root_index = 0;
  // Closing ROOT first, so that next_tape_index() below already accounts for it.
  tape.append(root_index, internal::tape_type::ROOT);
  const uint32_t index_after_root = next_tape_index(iter);
  tape_writer::write(iter.dom_parser.doc->tape[root_index], index_after_root, internal::tape_type::ROOT);
  return SUCCESS;
}
|
|
/** Handles an object key by routing it through visit_string() with the key flag set. */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_key(json_iterator &iter, const uint8_t *key) noexcept {
  constexpr bool is_key = true;
  return visit_string(iter, key, is_key);
}
|
|
|
|
/**
 * Bumps the element/field counter of the container open at iter.depth.
 * Always reports SUCCESS.
 */
simdjson_warn_unused simdjson_inline error_code tape_builder::increment_count(json_iterator &iter) noexcept {
  auto &open_scope = iter.dom_parser.open_containers[iter.depth];
  ++open_scope.count;
  return SUCCESS;
}
|
|
|
|
/**
 * Starts a builder that writes at the beginning of the document's tape and
 * at the beginning of its string buffer.
 */
simdjson_inline tape_builder::tape_builder(dom::document &doc) noexcept
  : tape{doc.tape.get()},
    current_string_buf_loc{doc.string_buf.get()}
{}
|
|
|
|
/**
 * Unescapes a JSON string into the document's string buffer and records it on the tape.
 *
 * @param iter  iterator used for logging and for locating the string buffer.
 * @param value pointer to the opening quote; parsing starts at value + 1.
 * @param key   only affects the log label ("key" vs "string").
 * @return STRING_ERROR when stringparsing::parse_string() returns nullptr, else SUCCESS.
 */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_string(json_iterator &iter, const uint8_t *value, bool key) noexcept {
  iter.log_value(key ? "key" : "string");
  uint8_t *const payload_begin = on_start_string(iter);
  // Replacement of invalid escape sequences is not allowed here (last argument false).
  uint8_t *const payload_end = stringparsing::parse_string(value + 1, payload_begin, false);
  if (payload_end != nullptr) {
    on_end_string(payload_end);
    return SUCCESS;
  }
  iter.log_error("Invalid escape in string");
  return STRING_ERROR;
}
|
|
|
|
/** A root-level string is handled exactly like a nested (non-key) string. */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_string(json_iterator &iter, const uint8_t *value) noexcept {
  const error_code outcome = visit_string(iter, value);
  return outcome;
}
|
|
|
|
/** Logs the value and lets numberparsing::parse_number() parse it and write to the tape. */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_number(json_iterator &iter, const uint8_t *value) noexcept {
  iter.log_value("number");
  const error_code outcome = numberparsing::parse_number(value, tape);
  return outcome;
}
|
|
|
|
/**
 * Parses a number that is the whole document.
 *
 * The remaining input is copied into a scratch buffer whose trailing
 * SIMDJSON_PADDING bytes are spaces, and the copy is what gets parsed.
 *
 * Why copy: the input is expected to be padded already, but this code has no
 * control over how. A C string, for example, leaves a NUL right after the
 * number. "9\0" as an entire document should be accepted, while 9\0 inside a
 * larger document must not be, so the single-atom case gets its own
 * space-terminated copy. The copy is comparatively expensive but only happens
 * for documents that consist of a single number.
 *
 * @return MEMALLOC if the scratch buffer cannot be allocated, otherwise the
 *         result of visit_number() on the copy.
 */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_number(json_iterator &iter, const uint8_t *value) noexcept {
  std::unique_ptr<uint8_t[]> padded(new (std::nothrow) uint8_t[iter.remaining_len() + SIMDJSON_PADDING]);
  if (!padded) { return MEMALLOC; }
  std::memcpy(padded.get(), value, iter.remaining_len());
  std::memset(padded.get() + iter.remaining_len(), ' ', SIMDJSON_PADDING);
  return visit_number(iter, padded.get());
}
|
|
|
|
/** Validates a nested `true` atom and appends TRUE_VALUE; T_ATOM_ERROR if invalid. */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_true_atom(json_iterator &iter, const uint8_t *value) noexcept {
  iter.log_value("true");
  if (atomparsing::is_valid_true_atom(value)) {
    tape.append(0, internal::tape_type::TRUE_VALUE);
    return SUCCESS;
  }
  return T_ATOM_ERROR;
}
|
|
|
|
/**
 * Root-level `true`: same as visit_true_atom(), but validation is also given
 * the remaining input length.
 */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_true_atom(json_iterator &iter, const uint8_t *value) noexcept {
  iter.log_value("true");
  if (atomparsing::is_valid_true_atom(value, iter.remaining_len())) {
    tape.append(0, internal::tape_type::TRUE_VALUE);
    return SUCCESS;
  }
  return T_ATOM_ERROR;
}
|
|
|
|
/** Validates a nested `false` atom and appends FALSE_VALUE; F_ATOM_ERROR if invalid. */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_false_atom(json_iterator &iter, const uint8_t *value) noexcept {
  iter.log_value("false");
  if (atomparsing::is_valid_false_atom(value)) {
    tape.append(0, internal::tape_type::FALSE_VALUE);
    return SUCCESS;
  }
  return F_ATOM_ERROR;
}
|
|
|
|
/**
 * Root-level `false`: same as visit_false_atom(), but validation is also given
 * the remaining input length.
 */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_false_atom(json_iterator &iter, const uint8_t *value) noexcept {
  iter.log_value("false");
  if (atomparsing::is_valid_false_atom(value, iter.remaining_len())) {
    tape.append(0, internal::tape_type::FALSE_VALUE);
    return SUCCESS;
  }
  return F_ATOM_ERROR;
}
|
|
|
|
/** Validates a nested `null` atom and appends NULL_VALUE; N_ATOM_ERROR if invalid. */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_null_atom(json_iterator &iter, const uint8_t *value) noexcept {
  iter.log_value("null");
  if (atomparsing::is_valid_null_atom(value)) {
    tape.append(0, internal::tape_type::NULL_VALUE);
    return SUCCESS;
  }
  return N_ATOM_ERROR;
}
|
|
|
|
/**
 * Root-level `null`: same as visit_null_atom(), but validation is also given
 * the remaining input length.
 */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_null_atom(json_iterator &iter, const uint8_t *value) noexcept {
  iter.log_value("null");
  if (atomparsing::is_valid_null_atom(value, iter.remaining_len())) {
    tape.append(0, internal::tape_type::NULL_VALUE);
    return SUCCESS;
  }
  return N_ATOM_ERROR;
}
|
|
|
|
// private:
|
|
|
|
/**
 * Index of the next tape slot to be written: the distance from the start of
 * the document's tape to the writer's current position, narrowed to 32 bits.
 */
simdjson_inline uint32_t tape_builder::next_tape_index(json_iterator &iter) const noexcept {
  const auto slots_written = tape.next_tape_loc - iter.dom_parser.doc->tape.get();
  return uint32_t(slots_written);
}
|
|
|
|
/**
 * Writes the two tape elements of an empty container back to back.
 * The start element stores the index just past the pair (start + 2) and the
 * end element stores the index of the start element. Always reports SUCCESS.
 */
simdjson_warn_unused simdjson_inline error_code tape_builder::empty_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept {
  const uint32_t first_slot = next_tape_index(iter);
  const uint32_t slot_after_pair = first_slot + 2;
  tape.append(slot_after_pair, start);
  tape.append(first_slot, end);
  return SUCCESS;
}
|
|
|
|
/**
 * Begins a container at iter.depth: remembers where its start element belongs,
 * resets its element count, and leaves that tape slot unwritten for now.
 */
simdjson_inline void tape_builder::start_container(json_iterator &iter) noexcept {
  auto &open_scope = iter.dom_parser.open_containers[iter.depth];
  open_scope.tape_index = next_tape_index(iter);
  open_scope.count = 0;
  // The start element is only written by end_container(); skip over its slot.
  tape.skip();
}
|
|
|
|
/**
 * Closes the container open at iter.depth.
 *
 * Appends the end element (holding the index of the start slot), then fills
 * in the start slot reserved by start_container() with the index following
 * the end element in the low 32 bits and the element count shifted left by 32.
 *
 * The count is saturated at 0xFFFFFF (24 bits), so a stored value of 0xFFFFFF
 * means "0xFFFFFF or more". Always reports SUCCESS.
 */
simdjson_warn_unused simdjson_inline error_code tape_builder::end_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept {
  const auto &open_scope = iter.dom_parser.open_containers[iter.depth];
  const uint32_t start_tape_index = open_scope.tape_index;
  tape.append(start_tape_index, end);

  uint32_t saturated_count = open_scope.count;
  if (saturated_count > 0xFFFFFF) { saturated_count = 0xFFFFFF; }

  const uint64_t start_payload = next_tape_index(iter) | (uint64_t(saturated_count) << 32);
  tape_writer::write(iter.dom_parser.doc->tape[start_tape_index], start_payload, start);
  return SUCCESS;
}
|
|
|
|
/**
 * Records a STRING tape element whose payload is the current offset into the
 * document's string buffer, and returns where the string bytes should go.
 *
 * The returned pointer is sizeof(uint32_t) past the current location: those
 * leading bytes are later filled with the string length by on_end_string().
 */
simdjson_inline uint8_t *tape_builder::on_start_string(json_iterator &iter) noexcept {
  const auto buffer_offset = current_string_buf_loc - iter.dom_parser.doc->string_buf.get();
  tape.append(buffer_offset, internal::tape_type::STRING);
  return current_string_buf_loc + sizeof(uint32_t);
}
|
|
|
|
/**
 * Finalizes the string whose bytes end at `dst`.
 *
 * Stores the 32-bit length in the slot reserved by on_start_string(), writes a
 * terminating zero byte at `dst`, and moves the write cursor just past it.
 *
 * TODO: the length is narrowed to 32 bits without an overflow check. That is
 * currently unneeded because documents of 4GB or more are refused; add the
 * check only if that limit is ever lifted.
 */
simdjson_inline void tape_builder::on_end_string(uint8_t *dst) noexcept {
  uint8_t *const payload_begin = current_string_buf_loc + sizeof(uint32_t);
  const uint32_t str_length = uint32_t(dst - payload_begin);
  memcpy(current_string_buf_loc, &str_length, sizeof(uint32_t));
  // Zero-terminate for callers that want C-style strings; costs one byte per string.
  *dst = 0;
  current_string_buf_loc = dst + 1;
}
|
|
|
|
} // namespace stage2
|
|
} // unnamed namespace
|
|
} // namespace lsx
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE2_TAPE_BUILDER_H
|
|
/* end file generic/stage2/tape_builder.h for lsx */
|
|
/* end file generic/stage2/amalgamated.h for lsx */
|
|
|
|
//
|
|
// Stage 1
|
|
//
|
|
namespace simdjson {
|
|
namespace lsx {
|
|
|
|
/**
 * Allocates an LSX dom_parser_implementation into `dst` and sizes it.
 *
 * @param capacity  forwarded to set_capacity().
 * @param max_depth forwarded to set_max_depth().
 * @param dst       receives the new parser; reset even when sizing fails.
 * @return MEMALLOC if allocation fails, the first non-zero error from
 *         set_capacity()/set_max_depth(), otherwise SUCCESS.
 */
simdjson_warn_unused error_code implementation::create_dom_parser_implementation(
    size_t capacity,
    size_t max_depth,
    std::unique_ptr<internal::dom_parser_implementation>& dst
) const noexcept {
  dst.reset( new (std::nothrow) dom_parser_implementation() );
  if (!dst) { return MEMALLOC; }

  auto status = dst->set_capacity(capacity);
  if (status) { return status; }

  status = dst->set_max_depth(max_depth);
  if (status) { return status; }

  return SUCCESS;
}
|
|
|
|
namespace {
|
|
|
|
using namespace simd;
|
|
|
|
/**
 * Classifies 64 input bytes into a whitespace bitmask and an operator bitmask.
 *
 * Approach (inspired by the haswell kernel): each byte selects a candidate
 * character from a small table, and the byte is flagged when it equals that
 * candidate.
 *
 * - Whitespace uses one 16-entry table via lookup_16().
 * - Operators use a 32-entry table (two 16-byte halves fed to __lsx_vshuf_b),
 *   indexed by the byte shifted right by 2, i.e. by bits [6:2] of the input.
 *
 * (char, hex, bits [6:2]) for the characters involved:
 *   (' ', 20, 00000) ('\t', 09, 01001) ('\n', 0A, 01010) ('\r', 0D, 01101)
 *   (',', 2C, 01011) (':', 3A, 01110) ('[', 5B, 10110) ('{', 7B, 11110)
 *   (']', 5D, 10111) ('}', 7D, 11111)
 *
 * NOTE(review): entry 0 of the low operator table is 1 rather than 0,
 * presumably so that a NUL input byte (index 0) does not equal its own table
 * entry — confirm against upstream.
 */
simdjson_inline json_character_block json_character_block::classify(const simd::simd8x64<uint8_t>& in) {
  const simd8<uint8_t> whitespace_lut = simd8<uint8_t>::repeat_16(
    ' ', 0, 0, 0, 0, 0, 0, 0, 0, '\t', '\n', 0, 0, '\r', 0, 0
  );
  const simd8<uint8_t> operator_lut_low = simd8<uint8_t>::repeat_16(
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ',', 0, 0, ':', 0
  );
  const simd8<uint8_t> operator_lut_high = simd8<uint8_t>::repeat_16(
    0, 0, 0, 0, 0, 0, '[', ']', 0, 0, 0, 0, 0, 0, '{', '}'
  );

  const uint64_t whitespace_mask = in.eq({
    in.chunks[0].lookup_16(whitespace_lut),
    in.chunks[1].lookup_16(whitespace_lut),
    in.chunks[2].lookup_16(whitespace_lut),
    in.chunks[3].lookup_16(whitespace_lut)
  });

  const uint64_t operator_mask = in.eq({
    __lsx_vshuf_b(operator_lut_high, operator_lut_low, in.chunks[0].shr<2>()),
    __lsx_vshuf_b(operator_lut_high, operator_lut_low, in.chunks[1].shr<2>()),
    __lsx_vshuf_b(operator_lut_high, operator_lut_low, in.chunks[2].shr<2>()),
    __lsx_vshuf_b(operator_lut_high, operator_lut_low, in.chunks[3].shr<2>())
  });

  return { whitespace_mask, operator_mask };
}
|
|
|
|
/** True when the OR-reduction of the 64-byte block reports itself as ASCII. */
simdjson_inline bool is_ascii(const simd8x64<uint8_t>& input) {
  const auto combined = input.reduce_or();
  return combined.is_ascii();
}
|
|
|
|
/**
 * Marks positions that must hold the 3rd or 4th byte of a UTF-8 sequence.
 *
 * @param prev2 the input shifted so each lane holds the byte two positions back.
 * @param prev3 the input shifted so each lane holds the byte three positions back.
 * @return a vector whose lanes are >= 0x80 where such a continuation is required.
 */
simdjson_inline simd8<uint8_t> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
  // After the saturating subtraction only lead bytes 111_____ stay >= 0x80.
  const simd8<uint8_t> third_byte_expected = prev2.saturating_sub(0xe0u-0x80);
  // Likewise only lead bytes 1111____ stay >= 0x80.
  const simd8<uint8_t> fourth_byte_expected = prev3.saturating_sub(0xf0u-0x80);
  return third_byte_expected | fourth_byte_expected;
}
|
|
|
|
} // unnamed namespace
|
|
} // namespace lsx
|
|
} // namespace simdjson
|
|
|
|
//
|
|
// Stage 2
|
|
//
|
|
|
|
//
|
|
// Implementation-specific overrides
|
|
//
|
|
namespace simdjson {
|
|
namespace lsx {
|
|
|
|
/** Minifies `buf` into `dst` using the LSX stage-1 minifier with a 64-byte step. */
simdjson_warn_unused error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept {
  const error_code outcome = lsx::stage1::json_minifier::minify<64>(buf, len, dst, dst_len);
  return outcome;
}
|
|
|
|
/**
 * Stage 1: remembers the input buffer and its length on the parser, then runs
 * the LSX structural indexer over it with a 64-byte step.
 */
simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, stage1_mode streaming) noexcept {
  this->buf = _buf;
  this->len = _len;
  const error_code outcome =
      lsx::stage1::json_structural_indexer::index<64>(this->buf, this->len, *this, streaming);
  return outcome;
}
|
|
|
|
/** Validates `len` bytes at `buf` as UTF-8 using the LSX generic validator. */
simdjson_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
  const bool is_valid = lsx::stage1::generic_validate_utf8(buf,len);
  return is_valid;
}
|
|
|
|
/** Stage 2 for a regular (non-streaming) parse: builds the tape into `_doc`. */
simdjson_warn_unused error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept {
  constexpr bool streaming = false;
  return stage2::tape_builder::parse_document<streaming>(*this, _doc);
}
|
|
|
|
/** Stage 2 in streaming mode: builds the tape for the next document into `_doc`. */
simdjson_warn_unused error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept {
  constexpr bool streaming = true;
  return stage2::tape_builder::parse_document<streaming>(*this, _doc);
}
|
|
|
|
/** Thin wrapper over lsx::stringparsing::parse_string(); returns its result unchanged. */
simdjson_warn_unused uint8_t *dom_parser_implementation::parse_string(const uint8_t *src, uint8_t *dst, bool allow_replacement) const noexcept {
  uint8_t *const written_end = lsx::stringparsing::parse_string(src, dst, allow_replacement);
  return written_end;
}
|
|
|
|
/** Thin wrapper over lsx::stringparsing::parse_wobbly_string(); returns its result unchanged. */
simdjson_warn_unused uint8_t *dom_parser_implementation::parse_wobbly_string(const uint8_t *src, uint8_t *dst) const noexcept {
  uint8_t *const written_end = lsx::stringparsing::parse_wobbly_string(src, dst);
  return written_end;
}
|
|
|
|
/**
 * Full parse: runs stage 1 in regular mode and, only if it reports no error,
 * runs stage 2 into `_doc`. A stage 1 error is returned as-is.
 */
simdjson_warn_unused error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept {
  const error_code stage1_status = stage1(_buf, _len, stage1_mode::regular);
  if (!stage1_status) {
    return stage2(_doc);
  }
  return stage1_status;
}
|
|
|
|
} // namespace lsx
|
|
} // namespace simdjson
|
|
|
|
/* including simdjson/lsx/end.h: #include <simdjson/lsx/end.h> */
|
|
/* begin file simdjson/lsx/end.h */
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/lsx/base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
#undef SIMDJSON_SKIP_BACKSLASH_SHORT_CIRCUIT
|
|
/* undefining SIMDJSON_IMPLEMENTATION from "lsx" */
|
|
#undef SIMDJSON_IMPLEMENTATION
|
|
/* end file simdjson/lsx/end.h */
|
|
|
|
#endif // SIMDJSON_SRC_LSX_CPP
|
|
/* end file lsx.cpp */
|
|
#endif
|
|
#if SIMDJSON_IMPLEMENTATION_LASX
|
|
/* including lasx.cpp: #include <lasx.cpp> */
|
|
/* begin file lasx.cpp */
|
|
#ifndef SIMDJSON_SRC_LASX_CPP
|
|
#define SIMDJSON_SRC_LASX_CPP
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include <base.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
/* including simdjson/lasx.h: #include <simdjson/lasx.h> */
|
|
/* begin file simdjson/lasx.h */
|
|
#ifndef SIMDJSON_LASX_H
|
|
#define SIMDJSON_LASX_H
|
|
|
|
/* including simdjson/lasx/begin.h: #include "simdjson/lasx/begin.h" */
|
|
/* begin file simdjson/lasx/begin.h */
|
|
/* defining SIMDJSON_IMPLEMENTATION to "lasx" */
|
|
#define SIMDJSON_IMPLEMENTATION lasx
|
|
/* including simdjson/lasx/base.h: #include "simdjson/lasx/base.h" */
|
|
/* begin file simdjson/lasx/base.h */
|
|
#ifndef SIMDJSON_LASX_BASE_H
|
|
#define SIMDJSON_LASX_BASE_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
/**
|
|
* Implementation for LASX.
|
|
*/
|
|
namespace lasx {
|
|
|
|
class implementation;
|
|
|
|
namespace {
|
|
namespace simd {
|
|
template <typename T> struct simd8;
|
|
template <typename T> struct simd8x64;
|
|
} // namespace simd
|
|
} // unnamed namespace
|
|
|
|
} // namespace lasx
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_LASX_BASE_H
|
|
/* end file simdjson/lasx/base.h */
|
|
/* including simdjson/lasx/intrinsics.h: #include "simdjson/lasx/intrinsics.h" */
|
|
/* begin file simdjson/lasx/intrinsics.h */
|
|
#ifndef SIMDJSON_LASX_INTRINSICS_H
|
|
#define SIMDJSON_LASX_INTRINSICS_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/lasx/base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
// This should be the correct header whether
|
|
// you use visual studio or other compilers.
|
|
#include <lasxintrin.h>
|
|
|
|
static_assert(sizeof(__m256i) <= simdjson::SIMDJSON_PADDING, "insufficient padding for LoongArch ASX");
|
|
|
|
#endif // SIMDJSON_LASX_INTRINSICS_H
|
|
/* end file simdjson/lasx/intrinsics.h */
|
|
/* including simdjson/lasx/bitmanipulation.h: #include "simdjson/lasx/bitmanipulation.h" */
|
|
/* begin file simdjson/lasx/bitmanipulation.h */
|
|
#ifndef SIMDJSON_LASX_BITMANIPULATION_H
|
|
#define SIMDJSON_LASX_BITMANIPULATION_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/lasx/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/lasx/intrinsics.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/lasx/bitmask.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace lasx {
|
|
namespace {
|
|
|
|
// We sometimes call trailing_zero on inputs that are zero,
|
|
// but the algorithms do not end up using the returned value.
|
|
// Sadly, sanitizers are not smart enough to figure it out.
|
|
// Callers sometimes pass zero but then ignore the result; sanitizers cannot
// tell, so undefined-behavior checks are disabled for this function.
SIMDJSON_NO_SANITIZE_UNDEFINED
// Also usable when not every input byte has been initialized,
// see https://github.com/simdjson/simdjson/issues/1965
SIMDJSON_NO_SANITIZE_MEMORY
/** Number of trailing zero bits in input_num, via __builtin_ctzll. */
simdjson_inline int trailing_zeroes(uint64_t input_num) {
  const int zero_bits = __builtin_ctzll(input_num);
  return zero_bits;
}
|
|
|
|
/* result might be undefined when input_num is zero */
|
|
simdjson_inline uint64_t clear_lowest_bit(uint64_t input_num) {
|
|
return input_num & (input_num-1);
|
|
}
|
|
|
|
/* result might be undefined when input_num is zero */
|
|
simdjson_inline int leading_zeroes(uint64_t input_num) {
|
|
return __builtin_clzll(input_num);
|
|
}
|
|
|
|
/* result might be undefined when input_num is zero */
|
|
simdjson_inline int count_ones(uint64_t input_num) {
|
|
return __lasx_xvpickve2gr_w(__lasx_xvpcnt_d(__m256i(v4u64{input_num, 0, 0, 0})), 0);
|
|
}
|
|
|
|
/**
 * Adds value1 and value2, storing the (possibly wrapped) sum in *result.
 * @return the overflow flag reported by __builtin_uaddll_overflow.
 */
simdjson_inline bool add_overflow(uint64_t value1, uint64_t value2, uint64_t *result) {
  // The builtin expects unsigned long long*, hence the pointer cast.
  auto *const sum_out = reinterpret_cast<unsigned long long *>(result);
  return __builtin_uaddll_overflow(value1, value2, sum_out);
}
|
|
|
|
} // unnamed namespace
|
|
} // namespace lasx
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_LASX_BITMANIPULATION_H
|
|
/* end file simdjson/lasx/bitmanipulation.h */
|
|
/* including simdjson/lasx/bitmask.h: #include "simdjson/lasx/bitmask.h" */
|
|
/* begin file simdjson/lasx/bitmask.h */
|
|
#ifndef SIMDJSON_LASX_BITMASK_H
|
|
#define SIMDJSON_LASX_BITMASK_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/lasx/base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace lasx {
|
|
namespace {
|
|
|
|
//
|
|
// Perform a "cumulative bitwise xor," flipping bits each time a 1 is encountered.
|
|
//
|
|
// For example, prefix_xor(00100100) == 00011100
|
|
//
|
|
/**
 * Cumulative XOR over the bits of `bitmask`, computed by XOR-ing in shifted
 * copies with doubling shift amounts (1, 2, 4, 8, 16, 32).
 */
simdjson_inline uint64_t prefix_xor(uint64_t bitmask) {
  for (unsigned shift = 1; shift < 64; shift *= 2) {
    bitmask ^= bitmask << shift;
  }
  return bitmask;
}
|
|
|
|
} // unnamed namespace
|
|
} // namespace lasx
|
|
} // namespace simdjson
|
|
|
|
#endif
|
|
/* end file simdjson/lasx/bitmask.h */
|
|
/* including simdjson/lasx/numberparsing_defs.h: #include "simdjson/lasx/numberparsing_defs.h" */
|
|
/* begin file simdjson/lasx/numberparsing_defs.h */
|
|
#ifndef SIMDJSON_LASX_NUMBERPARSING_DEFS_H
|
|
#define SIMDJSON_LASX_NUMBERPARSING_DEFS_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/lasx/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/lasx/intrinsics.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/numberparsing_tables.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
#include <cstring>
|
|
|
|
namespace simdjson {
|
|
namespace lasx {
|
|
namespace numberparsing {
|
|
|
|
// we don't have appropriate instructions, so let us use a scalar function
|
|
// credit: https://johnnylee-sde.github.io/Fast-numeric-string-to-int/
|
|
/** @private */
|
|
/**
 * Converts eight ASCII digits at `chars` into their numeric value with three
 * multiply/shift steps on a single 64-bit word.
 *
 * Each step merges neighbouring groups: 2561 = 10*2^8 + 1 joins digit pairs,
 * 6553601 = 100*2^16 + 1 joins pairs into groups of four, and
 * 42949672960001 = 10000*2^32 + 1 joins those into the final eight-digit value.
 * NOTE(review): relies on the first character landing in the low byte of the
 * loaded word (little-endian) — confirm for the target.
 */
static simdjson_inline uint32_t parse_eight_digits_unrolled(const uint8_t *chars) {
  uint64_t packed;
  std::memcpy(&packed, chars, sizeof(uint64_t));
  const uint64_t pairs = ((packed & 0x0F0F0F0F0F0F0F0F) * 2561) >> 8;
  const uint64_t quads = ((pairs & 0x00FF00FF00FF00FF) * 6553601) >> 16;
  const uint64_t all_eight = ((quads & 0x0000FFFF0000FFFF) * 42949672960001) >> 32;
  return uint32_t(all_eight);
}
|
|
|
|
/**
 * Full 64x64 -> 128-bit multiplication using __uint128_t.
 * @return the product split into its low and high 64-bit halves.
 */
simdjson_inline internal::value128 full_multiplication(uint64_t value1, uint64_t value2) {
  const __uint128_t product = static_cast<__uint128_t>(value1) * value2;
  internal::value128 halves;
  halves.low = uint64_t(product);
  halves.high = uint64_t(product >> 64);
  return halves;
}
|
|
|
|
} // namespace numberparsing
|
|
} // namespace lasx
|
|
} // namespace simdjson
|
|
|
|
#define SIMDJSON_SWAR_NUMBER_PARSING 1
|
|
|
|
#endif // SIMDJSON_LASX_NUMBERPARSING_DEFS_H
|
|
/* end file simdjson/lasx/numberparsing_defs.h */
|
|
/* including simdjson/lasx/simd.h: #include "simdjson/lasx/simd.h" */
|
|
/* begin file simdjson/lasx/simd.h */
|
|
#ifndef SIMDJSON_LASX_SIMD_H
|
|
#define SIMDJSON_LASX_SIMD_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/lasx/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/lasx/bitmanipulation.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/simdprune_tables.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace lasx {
|
|
namespace {
|
|
namespace simd {
|
|
|
|
// Forward-declared so they can be used by splat and friends.
|
|
template<typename Child>
|
|
struct base {
|
|
__m256i value;
|
|
|
|
// Zero constructor
|
|
simdjson_inline base() : value{__m256i()} {}
|
|
|
|
// Conversion from SIMD register
|
|
simdjson_inline base(const __m256i _value) : value(_value) {}
|
|
|
|
// Conversion to SIMD register
|
|
simdjson_inline operator const __m256i&() const { return this->value; }
|
|
simdjson_inline operator __m256i&() { return this->value; }
|
|
simdjson_inline operator const v32i8&() const { return (v32i8&)this->value; }
|
|
simdjson_inline operator v32i8&() { return (v32i8&)this->value; }
|
|
|
|
// Bit operations
|
|
simdjson_inline Child operator|(const Child other) const { return __lasx_xvor_v(*this, other); }
|
|
simdjson_inline Child operator&(const Child other) const { return __lasx_xvand_v(*this, other); }
|
|
simdjson_inline Child operator^(const Child other) const { return __lasx_xvxor_v(*this, other); }
|
|
simdjson_inline Child bit_andnot(const Child other) const { return __lasx_xvandn_v(other, *this); }
|
|
simdjson_inline Child& operator|=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast | other; return *this_cast; }
|
|
simdjson_inline Child& operator&=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast & other; return *this_cast; }
|
|
simdjson_inline Child& operator^=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast ^ other; return *this_cast; }
|
|
};
|
|
|
|
// Forward-declared so they can be used by splat and friends.
|
|
template<typename T>
|
|
struct simd8;
|
|
|
|
template<typename T, typename Mask=simd8<bool>>
|
|
struct base8: base<simd8<T>> {
|
|
simdjson_inline base8() : base<simd8<T>>() {}
|
|
simdjson_inline base8(const __m256i _value) : base<simd8<T>>(_value) {}
|
|
|
|
friend simdjson_really_inline Mask operator==(const simd8<T> lhs, const simd8<T> rhs) { return __lasx_xvseq_b(lhs, rhs); }
|
|
|
|
static const int SIZE = sizeof(base<simd8<T>>::value);
|
|
|
|
template<int N=1>
|
|
simdjson_inline simd8<T> prev(const simd8<T> prev_chunk) const {
|
|
__m256i hi = __lasx_xvbsll_v(*this, N);
|
|
__m256i lo = __lasx_xvbsrl_v(*this, 16 - N);
|
|
__m256i tmp = __lasx_xvbsrl_v(prev_chunk, 16 - N);
|
|
lo = __lasx_xvpermi_q(lo, tmp, 0x21);
|
|
return __lasx_xvor_v(hi, lo);
|
|
}
|
|
};
|
|
|
|
// SIMD byte mask type (returned by things like eq and gt)
|
|
template<>
|
|
struct simd8<bool>: base8<bool> {
|
|
static simdjson_inline simd8<bool> splat(bool _value) { return __lasx_xvreplgr2vr_b(uint8_t(-(!!_value))); }
|
|
|
|
simdjson_inline simd8() : base8() {}
|
|
simdjson_inline simd8(const __m256i _value) : base8<bool>(_value) {}
|
|
// Splat constructor
|
|
simdjson_inline simd8(bool _value) : base8<bool>(splat(_value)) {}
|
|
|
|
simdjson_inline int to_bitmask() const {
|
|
__m256i mask = __lasx_xvmskltz_b(*this);
|
|
return (__lasx_xvpickve2gr_w(mask, 4) << 16) | (__lasx_xvpickve2gr_w(mask, 0));
|
|
}
|
|
simdjson_inline bool any() const {
|
|
__m256i v = __lasx_xvmsknz_b(*this);
|
|
return (0 == __lasx_xvpickve2gr_w(v, 0)) && (0 == __lasx_xvpickve2gr_w(v, 4));
|
|
}
|
|
simdjson_inline simd8<bool> operator~() const { return *this ^ true; }
|
|
};
|
|
|
|
template<typename T>
|
|
struct base8_numeric: base8<T> {
|
|
static simdjson_inline simd8<T> splat(T _value) {
|
|
return __lasx_xvreplgr2vr_b(_value);
|
|
}
|
|
static simdjson_inline simd8<T> zero() { return __lasx_xvldi(0); }
|
|
static simdjson_inline simd8<T> load(const T values[32]) {
|
|
return __lasx_xvld(reinterpret_cast<const __m256i *>(values), 0);
|
|
}
|
|
// Repeat 16 values as many times as necessary (usually for lookup tables)
|
|
static simdjson_inline simd8<T> repeat_16(
|
|
T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7,
|
|
T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15
|
|
) {
|
|
return simd8<T>(
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15,
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15
|
|
);
|
|
}
|
|
|
|
simdjson_inline base8_numeric() : base8<T>() {}
|
|
simdjson_inline base8_numeric(const __m256i _value) : base8<T>(_value) {}
|
|
|
|
// Store to array
|
|
simdjson_inline void store(T dst[32]) const {
|
|
return __lasx_xvst(*this, reinterpret_cast<__m256i *>(dst), 0);
|
|
}
|
|
|
|
// Addition/subtraction are the same for signed and unsigned
|
|
simdjson_inline simd8<T> operator+(const simd8<T> other) const { return __lasx_xvadd_b(*this, other); }
|
|
simdjson_inline simd8<T> operator-(const simd8<T> other) const { return __lasx_xvsub_b(*this, other); }
|
|
simdjson_inline simd8<T>& operator+=(const simd8<T> other) { *this = *this + other; return *static_cast<simd8<T>*>(this); }
|
|
simdjson_inline simd8<T>& operator-=(const simd8<T> other) { *this = *this - other; return *static_cast<simd8<T>*>(this); }
|
|
|
|
// Override to distinguish from bool version
|
|
simdjson_inline simd8<T> operator~() const { return *this ^ 0xFFu; }
|
|
|
|
// Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values)
|
|
template<typename L>
|
|
simdjson_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
|
|
return __lasx_xvshuf_b(lookup_table, lookup_table, *this);
|
|
}
|
|
|
|
// Copies to 'output" all bytes corresponding to a 0 in the mask (interpreted as a bitset).
|
|
// Passing a 0 value for mask would be equivalent to writing out every byte to output.
|
|
// Only the first 16 - count_ones(mask) bytes of the result are significant but 16 bytes
|
|
// get written.
|
|
template<typename L>
|
|
simdjson_inline void compress(uint32_t mask, L * output) const {
|
|
using internal::thintable_epi8;
|
|
using internal::BitsSetTable256mul2;
|
|
using internal::pshufb_combine_table;
|
|
// this particular implementation was inspired by haswell
|
|
// lasx do it in 4 steps, first 8 bytes and then second 8 bytes...
|
|
uint8_t mask1 = uint8_t(mask); // least significant 8 bits
|
|
uint8_t mask2 = uint8_t(mask >> 8); // second significant 8 bits
|
|
uint8_t mask3 = uint8_t(mask >> 16); // ...
|
|
uint8_t mask4 = uint8_t(mask >> 24); // ...
|
|
// next line just loads the 64-bit values thintable_epi8[mask{1,2,3,4}]
|
|
// into a 256-bit register.
|
|
__m256i shufmask = {int64_t(thintable_epi8[mask1]), int64_t(thintable_epi8[mask2]) + 0x0808080808080808, int64_t(thintable_epi8[mask3]), int64_t(thintable_epi8[mask4]) + 0x0808080808080808};
|
|
// this is the version "nearly pruned"
|
|
__m256i pruned = __lasx_xvshuf_b(*this, *this, shufmask);
|
|
// we still need to put the pieces back together.
|
|
// we compute the popcount of the first words:
|
|
int pop1 = BitsSetTable256mul2[mask1];
|
|
int pop2 = BitsSetTable256mul2[mask2];
|
|
int pop3 = BitsSetTable256mul2[mask3];
|
|
|
|
// then load the corresponding mask
|
|
__m256i masklo = __lasx_xvldx(reinterpret_cast<void*>(reinterpret_cast<unsigned long>(pshufb_combine_table)), pop1 * 8);
|
|
__m256i maskhi = __lasx_xvldx(reinterpret_cast<void*>(reinterpret_cast<unsigned long>(pshufb_combine_table)), pop3 * 8);
|
|
__m256i compactmask = __lasx_xvpermi_q(maskhi, masklo, 0x20);
|
|
__m256i answer = __lasx_xvshuf_b(pruned, pruned, compactmask);
|
|
__lasx_xvst(answer, reinterpret_cast<uint8_t*>(output), 0);
|
|
uint64_t value3 = __lasx_xvpickve2gr_du(answer, 2);
|
|
uint64_t value4 = __lasx_xvpickve2gr_du(answer, 3);
|
|
uint64_t *pos = reinterpret_cast<uint64_t*>(reinterpret_cast<uint8_t*>(output) + 16 - (pop1 + pop2) / 2);
|
|
pos[0] = value3;
|
|
pos[1] = value4;
|
|
}
|
|
|
|
template<typename L>
|
|
simdjson_inline simd8<L> lookup_16(
|
|
L replace0, L replace1, L replace2, L replace3,
|
|
L replace4, L replace5, L replace6, L replace7,
|
|
L replace8, L replace9, L replace10, L replace11,
|
|
L replace12, L replace13, L replace14, L replace15) const {
|
|
return lookup_16(simd8<L>::repeat_16(
|
|
replace0, replace1, replace2, replace3,
|
|
replace4, replace5, replace6, replace7,
|
|
replace8, replace9, replace10, replace11,
|
|
replace12, replace13, replace14, replace15
|
|
));
|
|
}
|
|
};
|
|
|
|
// Signed bytes
|
|
template<>
|
|
struct simd8<int8_t> : base8_numeric<int8_t> {
|
|
simdjson_inline simd8() : base8_numeric<int8_t>() {}
|
|
simdjson_inline simd8(const __m256i _value) : base8_numeric<int8_t>(_value) {}
|
|
// Splat constructor
|
|
simdjson_inline simd8(int8_t _value) : simd8(splat(_value)) {}
|
|
// Array constructor
|
|
simdjson_inline simd8(const int8_t values[32]) : simd8(load(values)) {}
|
|
// Member-by-member initialization
|
|
simdjson_inline simd8(
|
|
int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
|
|
int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15,
|
|
int8_t v16, int8_t v17, int8_t v18, int8_t v19, int8_t v20, int8_t v21, int8_t v22, int8_t v23,
|
|
int8_t v24, int8_t v25, int8_t v26, int8_t v27, int8_t v28, int8_t v29, int8_t v30, int8_t v31
|
|
) : simd8({
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15,
|
|
v16,v17,v18,v19,v20,v21,v22,v23,
|
|
v24,v25,v26,v27,v28,v29,v30,v31
|
|
}) {}
|
|
// Repeat 16 values as many times as necessary (usually for lookup tables)
|
|
simdjson_inline static simd8<int8_t> repeat_16(
|
|
int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
|
|
int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
|
|
) {
|
|
return simd8<int8_t>(
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15,
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15
|
|
);
|
|
}
|
|
|
|
// Order-sensitive comparisons
|
|
simdjson_inline simd8<int8_t> max_val(const simd8<int8_t> other) const { return __lasx_xvmax_b(*this, other); }
|
|
simdjson_inline simd8<int8_t> min_val(const simd8<int8_t> other) const { return __lasx_xvmin_b(*this, other); }
|
|
simdjson_inline simd8<bool> operator>(const simd8<int8_t> other) const { return __lasx_xvslt_b(other, *this); }
|
|
simdjson_inline simd8<bool> operator<(const simd8<int8_t> other) const { return __lasx_xvslt_b(*this, other); }
|
|
};
|
|
|
|
// Unsigned bytes
|
|
template<>
|
|
struct simd8<uint8_t>: base8_numeric<uint8_t> {
|
|
simdjson_inline simd8() : base8_numeric<uint8_t>() {}
|
|
simdjson_inline simd8(const __m256i _value) : base8_numeric<uint8_t>(_value) {}
|
|
// Splat constructor
|
|
simdjson_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
|
|
// Array constructor
|
|
simdjson_inline simd8(const uint8_t values[32]) : simd8(load(values)) {}
|
|
// Member-by-member initialization
|
|
simdjson_inline simd8(
|
|
uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
|
|
uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15,
|
|
uint8_t v16, uint8_t v17, uint8_t v18, uint8_t v19, uint8_t v20, uint8_t v21, uint8_t v22, uint8_t v23,
|
|
uint8_t v24, uint8_t v25, uint8_t v26, uint8_t v27, uint8_t v28, uint8_t v29, uint8_t v30, uint8_t v31
|
|
) : simd8(__m256i(v32u8{
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15,
|
|
v16,v17,v18,v19,v20,v21,v22,v23,
|
|
v24,v25,v26,v27,v28,v29,v30,v31
|
|
})) {}
|
|
// Repeat 16 values as many times as necessary (usually for lookup tables)
|
|
simdjson_inline static simd8<uint8_t> repeat_16(
|
|
uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
|
|
uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
|
|
) {
|
|
return simd8<uint8_t>(
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15,
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15
|
|
);
|
|
}
|
|
|
|
// Saturated math
|
|
simdjson_inline simd8<uint8_t> saturating_add(const simd8<uint8_t> other) const { return __lasx_xvsadd_bu(*this, other); }
|
|
simdjson_inline simd8<uint8_t> saturating_sub(const simd8<uint8_t> other) const { return __lasx_xvssub_bu(*this, other); }
|
|
|
|
// Order-specific operations
|
|
simdjson_inline simd8<uint8_t> max_val(const simd8<uint8_t> other) const { return __lasx_xvmax_bu(*this, other); }
|
|
simdjson_inline simd8<uint8_t> min_val(const simd8<uint8_t> other) const { return __lasx_xvmin_bu(other, *this); }
|
|
// Same as >, but only guarantees true is nonzero (< guarantees true = -1)
|
|
simdjson_inline simd8<uint8_t> gt_bits(const simd8<uint8_t> other) const { return this->saturating_sub(other); }
|
|
// Same as <, but only guarantees true is nonzero (< guarantees true = -1)
|
|
simdjson_inline simd8<uint8_t> lt_bits(const simd8<uint8_t> other) const { return other.saturating_sub(*this); }
|
|
simdjson_inline simd8<bool> operator<=(const simd8<uint8_t> other) const { return other.max_val(*this) == other; }
|
|
simdjson_inline simd8<bool> operator>=(const simd8<uint8_t> other) const { return other.min_val(*this) == other; }
|
|
simdjson_inline simd8<bool> operator>(const simd8<uint8_t> other) const { return this->gt_bits(other).any_bits_set(); }
|
|
simdjson_inline simd8<bool> operator<(const simd8<uint8_t> other) const { return this->lt_bits(other).any_bits_set(); }
|
|
|
|
// Bit-specific operations
|
|
simdjson_inline simd8<bool> bits_not_set() const { return *this == uint8_t(0); }
|
|
simdjson_inline simd8<bool> bits_not_set(simd8<uint8_t> bits) const { return (*this & bits).bits_not_set(); }
|
|
simdjson_inline simd8<bool> any_bits_set() const { return ~this->bits_not_set(); }
|
|
simdjson_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const { return ~this->bits_not_set(bits); }
|
|
simdjson_inline bool is_ascii() const {
|
|
__m256i mask = __lasx_xvmskltz_b(*this);
|
|
return (0 == __lasx_xvpickve2gr_w(mask, 0)) && (0 == __lasx_xvpickve2gr_w(mask, 4));
|
|
}
|
|
simdjson_inline bool bits_not_set_anywhere() const {
|
|
__m256i v = __lasx_xvmsknz_b(*this);
|
|
return (0 == __lasx_xvpickve2gr_w(v, 0)) && (0 == __lasx_xvpickve2gr_w(v, 4));
|
|
}
|
|
simdjson_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); }
|
|
simdjson_inline bool bits_not_set_anywhere(simd8<uint8_t> bits) const {
|
|
__m256i v = __lasx_xvmsknz_b(__lasx_xvand_v(*this, bits));
|
|
return (0 == __lasx_xvpickve2gr_w(v, 0)) && (0 == __lasx_xvpickve2gr_w(v, 4));
|
|
}
|
|
simdjson_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const { return !bits_not_set_anywhere(bits); }
|
|
template<int N>
|
|
simdjson_inline simd8<uint8_t> shr() const { return simd8<uint8_t>(__lasx_xvsrli_b(*this, N)); }
|
|
template<int N>
|
|
simdjson_inline simd8<uint8_t> shl() const { return simd8<uint8_t>(__lasx_xvslli_b(*this, N)); }
|
|
};
|
|
|
|
template<typename T>
|
|
struct simd8x64 {
|
|
static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
|
|
static_assert(NUM_CHUNKS == 2, "LASX kernel should use two registers per 64-byte block.");
|
|
const simd8<T> chunks[NUM_CHUNKS];
|
|
|
|
simd8x64(const simd8x64<T>& o) = delete; // no copy allowed
|
|
simd8x64<T>& operator=(const simd8<T>& other) = delete; // no assignment allowed
|
|
simd8x64() = delete; // no default constructor allowed
|
|
|
|
simdjson_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1) : chunks{chunk0, chunk1} {}
|
|
simdjson_inline simd8x64(const T ptr[64]) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+32)} {}
|
|
|
|
simdjson_inline uint64_t compress(uint64_t mask, T * output) const {
|
|
uint32_t mask1 = uint32_t(mask);
|
|
uint32_t mask2 = uint32_t(mask >> 32);
|
|
__m256i zcnt = __lasx_xvpcnt_w(__m256i(v4u64{~mask, 0, 0, 0}));
|
|
uint64_t zcnt1 = __lasx_xvpickve2gr_wu(zcnt, 0);
|
|
uint64_t zcnt2 = __lasx_xvpickve2gr_wu(zcnt, 1);
|
|
// There should be a critical value which processes in scaler is faster.
|
|
if (zcnt1)
|
|
this->chunks[0].compress(mask1, output);
|
|
if (zcnt2)
|
|
this->chunks[1].compress(mask2, output + zcnt1);
|
|
return zcnt1 + zcnt2;
|
|
}
|
|
|
|
simdjson_inline void store(T ptr[64]) const {
|
|
this->chunks[0].store(ptr+sizeof(simd8<T>)*0);
|
|
this->chunks[1].store(ptr+sizeof(simd8<T>)*1);
|
|
}
|
|
|
|
simdjson_inline uint64_t to_bitmask() const {
|
|
__m256i mask0 = __lasx_xvmskltz_b(this->chunks[0]);
|
|
__m256i mask1 = __lasx_xvmskltz_b(this->chunks[1]);
|
|
__m256i mask_tmp = __lasx_xvpickve_w(mask0, 4);
|
|
__m256i tmp = __lasx_xvpickve_w(mask1, 4);
|
|
mask0 = __lasx_xvinsve0_w(mask0, mask1, 1);
|
|
mask_tmp = __lasx_xvinsve0_w(mask_tmp, tmp, 1);
|
|
return __lasx_xvpickve2gr_du(__lasx_xvpackev_h(mask_tmp, mask0), 0);
|
|
}
|
|
|
|
simdjson_inline simd8<T> reduce_or() const {
|
|
return this->chunks[0] | this->chunks[1];
|
|
}
|
|
|
|
simdjson_inline uint64_t eq(const T m) const {
|
|
const simd8<T> mask = simd8<T>::splat(m);
|
|
return simd8x64<bool>(
|
|
this->chunks[0] == mask,
|
|
this->chunks[1] == mask
|
|
).to_bitmask();
|
|
}
|
|
|
|
simdjson_inline uint64_t eq(const simd8x64<uint8_t> &other) const {
|
|
return simd8x64<bool>(
|
|
this->chunks[0] == other.chunks[0],
|
|
this->chunks[1] == other.chunks[1]
|
|
).to_bitmask();
|
|
}
|
|
|
|
simdjson_inline uint64_t lteq(const T m) const {
|
|
const simd8<T> mask = simd8<T>::splat(m);
|
|
return simd8x64<bool>(
|
|
this->chunks[0] <= mask,
|
|
this->chunks[1] <= mask
|
|
).to_bitmask();
|
|
}
|
|
}; // struct simd8x64<T>
|
|
|
|
} // namespace simd
|
|
} // unnamed namespace
|
|
} // namespace lasx
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_LASX_SIMD_H
|
|
/* end file simdjson/lasx/simd.h */
|
|
/* including simdjson/lasx/stringparsing_defs.h: #include "simdjson/lasx/stringparsing_defs.h" */
|
|
/* begin file simdjson/lasx/stringparsing_defs.h */
|
|
#ifndef SIMDJSON_LASX_STRINGPARSING_DEFS_H
|
|
#define SIMDJSON_LASX_STRINGPARSING_DEFS_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/lasx/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/lasx/simd.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/lasx/bitmanipulation.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace lasx {
|
|
namespace {
|
|
|
|
using namespace simd;
|
|
|
|
// Holds backslashes and quotes locations.
|
|
struct backslash_and_quote {
|
|
public:
|
|
static constexpr uint32_t BYTES_PROCESSED = 32;
|
|
simdjson_inline static backslash_and_quote copy_and_find(const uint8_t *src, uint8_t *dst);
|
|
|
|
simdjson_inline bool has_quote_first() { return ((bs_bits - 1) & quote_bits) != 0; }
|
|
simdjson_inline bool has_backslash() { return bs_bits != 0; }
|
|
simdjson_inline int quote_index() { return trailing_zeroes(quote_bits); }
|
|
simdjson_inline int backslash_index() { return trailing_zeroes(bs_bits); }
|
|
|
|
uint32_t bs_bits;
|
|
uint32_t quote_bits;
|
|
}; // struct backslash_and_quote
|
|
|
|
// Loads one 32-byte window from src, writes it unchanged to dst, and returns
// the bitmasks of the bytes equal to '\\' and to '"'.
simdjson_inline backslash_and_quote backslash_and_quote::copy_and_find(const uint8_t *src, uint8_t *dst) {
  // The load may touch up to 31 bytes past the end of the logical input; the
  // SIMDJSON_PADDING bytes required after every buffer make that safe.
  static_assert(SIMDJSON_PADDING >= (BYTES_PROCESSED - 1), "backslash and quote finder must process fewer than SIMDJSON_PADDING bytes");
  const simd8<uint8_t> window(src);
  window.store(dst);
  const uint32_t backslashes = static_cast<uint32_t>((window == '\\').to_bitmask());
  const uint32_t quotes      = static_cast<uint32_t>((window == '"').to_bitmask());
  return {
    backslashes, // bs_bits
    quotes,      // quote_bits
  };
}
|
|
|
|
} // unnamed namespace
|
|
} // namespace lasx
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_LASX_STRINGPARSING_DEFS_H
|
|
/* end file simdjson/lasx/stringparsing_defs.h */
|
|
|
|
#define SIMDJSON_SKIP_BACKSLASH_SHORT_CIRCUIT 1
|
|
/* end file simdjson/lasx/begin.h */
|
|
/* including simdjson/generic/amalgamated.h for lasx: #include "simdjson/generic/amalgamated.h" */
|
|
/* begin file simdjson/generic/amalgamated.h for lasx */
|
|
#if defined(SIMDJSON_CONDITIONAL_INCLUDE) && !defined(SIMDJSON_GENERIC_DEPENDENCIES_H)
|
|
#error simdjson/generic/dependencies.h must be included before simdjson/generic/amalgamated.h!
|
|
#endif
|
|
|
|
/* including simdjson/generic/base.h for lasx: #include "simdjson/generic/base.h" */
|
|
/* begin file simdjson/generic/base.h for lasx */
|
|
#ifndef SIMDJSON_GENERIC_BASE_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_GENERIC_BASE_H */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/base.h" */
|
|
/* amalgamation skipped (editor-only): // If we haven't got an implementation yet, we're in the editor, editing a generic file! Just */
|
|
/* amalgamation skipped (editor-only): // use the most advanced one we can so the most possible stuff can be tested. */
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_IMPLEMENTATION */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/implementation_detection.h" */
|
|
/* amalgamation skipped (editor-only): #if SIMDJSON_IMPLEMENTATION_ICELAKE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/icelake/begin.h" */
|
|
/* amalgamation skipped (editor-only): #elif SIMDJSON_IMPLEMENTATION_HASWELL */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/haswell/begin.h" */
|
|
/* amalgamation skipped (editor-only): #elif SIMDJSON_IMPLEMENTATION_WESTMERE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/westmere/begin.h" */
|
|
/* amalgamation skipped (editor-only): #elif SIMDJSON_IMPLEMENTATION_ARM64 */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/arm64/begin.h" */
|
|
/* amalgamation skipped (editor-only): #elif SIMDJSON_IMPLEMENTATION_PPC64 */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/ppc64/begin.h" */
|
|
/* amalgamation skipped (editor-only): #elif SIMDJSON_IMPLEMENTATION_LSX */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/lsx/begin.h" */
|
|
/* amalgamation skipped (editor-only): #elif SIMDJSON_IMPLEMENTATION_LASX */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/lasx/begin.h" */
|
|
/* amalgamation skipped (editor-only): #elif SIMDJSON_IMPLEMENTATION_FALLBACK */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/fallback/begin.h" */
|
|
/* amalgamation skipped (editor-only): #else */
|
|
/* amalgamation skipped (editor-only): #error "All possible implementations (including fallback) have been disabled! simdjson will not run." */
|
|
/* amalgamation skipped (editor-only): #endif */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_IMPLEMENTATION */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace lasx {
|
|
|
|
struct open_container;
|
|
class dom_parser_implementation;
|
|
|
|
/**
 * Classification of a JSON number.
 */
enum class number_type {
  /// a binary64 number
  floating_point_number = 1,
  /// a signed integer that fits in a 64-bit word using two's complement
  signed_integer = 2,
  /// a positive integer larger or equal to 1<<63
  unsigned_integer = 3,
  /// a big integer that does not fit in a 64-bit word
  big_integer = 4
};
|
|
|
|
} // namespace lasx
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_GENERIC_BASE_H
|
|
/* end file simdjson/generic/base.h for lasx */
|
|
/* including simdjson/generic/jsoncharutils.h for lasx: #include "simdjson/generic/jsoncharutils.h" */
|
|
/* begin file simdjson/generic/jsoncharutils.h for lasx */
|
|
#ifndef SIMDJSON_GENERIC_JSONCHARUTILS_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_GENERIC_JSONCHARUTILS_H */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/generic/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/jsoncharutils_tables.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/numberparsing_tables.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace lasx {
|
|
namespace {
|
|
namespace jsoncharutils {
|
|
|
|
// return non-zero if not a structural or whitespace char
|
|
// zero otherwise
|
|
simdjson_inline uint32_t is_not_structural_or_whitespace(uint8_t c) {
|
|
return internal::structural_or_whitespace_negated[c];
|
|
}
|
|
|
|
// Table lookup in internal::structural_or_whitespace, indexed by the byte c.
simdjson_inline uint32_t is_structural_or_whitespace(uint8_t c) {
  const uint32_t flag = internal::structural_or_whitespace[c];
  return flag;
}
|
|
|
|
// returns a value with the high 16 bits set if not valid
|
|
// otherwise returns the conversion of the 4 hex digits at src into the bottom
|
|
// 16 bits of the 32-bit return register
|
|
//
|
|
// see
|
|
// https://lemire.me/blog/2019/04/17/parsing-short-hexadecimal-strings-efficiently/
|
|
static inline uint32_t hex_to_u32_nocheck(
|
|
const uint8_t *src) { // strictly speaking, static inline is a C-ism
|
|
uint32_t v1 = internal::digit_to_val32[630 + src[0]];
|
|
uint32_t v2 = internal::digit_to_val32[420 + src[1]];
|
|
uint32_t v3 = internal::digit_to_val32[210 + src[2]];
|
|
uint32_t v4 = internal::digit_to_val32[0 + src[3]];
|
|
return v1 | v2 | v3 | v4;
|
|
}
|
|
|
|
// Encodes the code point cp as UTF-8 into c and returns the number of bytes
// written (1 to 4). Returns 0, writing nothing, when cp is above 0x10FFFF.
//
// JSON documents contain few escaped code points, so a straightforward
// branchy encoder is used rather than a pdep/clz/table-driven one.
//
// Note: surrogates (0xD800-0xDFFF) are NOT rejected here; callers are
// expected to have dealt with them separately.
simdjson_inline size_t codepoint_to_utf8(uint32_t cp, uint8_t *c) {
  if (cp > 0x10FFFF) {
    return 0; // code point too large
  }
  // Low six bits, shared by the last byte of every multi-byte form.
  const uint8_t tail = uint8_t(0x80 | (cp & 0x3F));
  if (cp <= 0x7F) { // ascii
    c[0] = uint8_t(cp);
    return 1;
  }
  if (cp <= 0x7FF) { // two bytes: 110xxxxx 10xxxxxx
    c[0] = uint8_t(0xC0 | (cp >> 6));
    c[1] = tail;
    return 2;
  }
  if (cp <= 0xFFFF) { // three bytes: 1110xxxx 10xxxxxx 10xxxxxx
    c[0] = uint8_t(0xE0 | (cp >> 12));
    c[1] = uint8_t(0x80 | ((cp >> 6) & 0x3F));
    c[2] = tail;
    return 3;
  }
  // four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  c[0] = uint8_t(0xF0 | (cp >> 18));
  c[1] = uint8_t(0x80 | ((cp >> 12) & 0x3F));
  c[2] = uint8_t(0x80 | ((cp >> 6) & 0x3F));
  c[3] = tail;
  return 4;
}
|
|
|
|
#if SIMDJSON_IS_32BITS // _umul128 for x86, arm
|
|
// Slow emulation routines for 32-bit builds.
//
// __emulu: full 64-bit product of two 32-bit unsigned values.
static simdjson_inline uint64_t __emulu(uint32_t x, uint32_t y) {
  return static_cast<uint64_t>(x) * static_cast<uint64_t>(y);
}
// _umul128: 64x64 -> 128-bit unsigned multiplication built from four 32x32
// partial products. Writing ab = a*2^32 + b and cd = c*2^32 + d, the low
// 64 bits of ab*cd are returned and the high 64 bits are stored in *hi.
static simdjson_inline uint64_t _umul128(uint64_t ab, uint64_t cd, uint64_t *hi) {
  const uint32_t a = static_cast<uint32_t>(ab >> 32);
  const uint32_t b = static_cast<uint32_t>(ab);
  const uint32_t c = static_cast<uint32_t>(cd >> 32);
  const uint32_t d = static_cast<uint32_t>(cd);

  const uint64_t a_times_d = __emulu(a, d);
  const uint64_t b_times_d = __emulu(b, d);
  // Sum of the two middle partial products; may wrap around 2^64.
  const uint64_t cross = a_times_d + __emulu(b, c);
  const uint64_t cross_carry = (cross < a_times_d) ? 1 : 0;
  const uint64_t lo = b_times_d + (cross << 32);
  const uint64_t lo_carry = (lo < b_times_d) ? 1 : 0;

  *hi = __emulu(a, c) + (cross >> 32) + (cross_carry << 32) + lo_carry;
  return lo;
}
|
|
#endif
|
|
|
|
} // namespace jsoncharutils
|
|
} // unnamed namespace
|
|
} // namespace lasx
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_GENERIC_JSONCHARUTILS_H
|
|
/* end file simdjson/generic/jsoncharutils.h for lasx */
|
|
/* including simdjson/generic/atomparsing.h for lasx: #include "simdjson/generic/atomparsing.h" */
|
|
/* begin file simdjson/generic/atomparsing.h for lasx */
|
|
#ifndef SIMDJSON_GENERIC_ATOMPARSING_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_GENERIC_ATOMPARSING_H */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/generic/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/generic/jsoncharutils.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
#include <cstring>
|
|
|
|
namespace simdjson {
|
|
namespace lasx {
|
|
namespace {
|
|
/// @private
|
|
namespace atomparsing {
|
|
|
|
// Reads the first four bytes of str as a native-endian 32-bit value. It is
// used exclusively to turn literal strings into 32-bit constants.
// The bytes are copied with memcpy rather than read through a pointer cast,
// because the character pointer may not be suitably aligned and such a read
// would be undefined behavior. This is not expensive: decent optimizing
// compilers (GCC, clang, Visual Studio) fold string_to_uint32("false") into
// the compile-time constant 1936482662.
simdjson_inline uint32_t string_to_uint32(const char* str) {
  uint32_t packed;
  std::memcpy(&packed, str, sizeof(packed));
  return packed;
}
|
|
|
|
|
|
// Again in str4ncmp we use a memcpy to avoid undefined behavior. The memcpy may appear expensive.
|
|
// Yet all decent optimizing compilers will compile memcpy to a single instruction, just about.
|
|
simdjson_warn_unused
|
|
simdjson_inline uint32_t str4ncmp(const uint8_t *src, const char* atom) {
|
|
uint32_t srcval; // we want to avoid unaligned 32-bit loads (undefined in C/C++)
|
|
static_assert(sizeof(uint32_t) <= SIMDJSON_PADDING, "SIMDJSON_PADDING must be larger than 4 bytes");
|
|
std::memcpy(&srcval, src, sizeof(uint32_t));
|
|
return srcval ^ string_to_uint32(atom);
|
|
}
|
|
|
|
// True when src starts with "true" and the byte that follows (src[4]) is a
// structural character or whitespace. Reads five bytes from src.
simdjson_warn_unused
simdjson_inline bool is_valid_true_atom(const uint8_t *src) {
  const uint32_t mismatch = str4ncmp(src, "true");
  const uint32_t bad_terminator = jsoncharutils::is_not_structural_or_whitespace(src[4]);
  return (mismatch | bad_terminator) == 0;
}
|
|
|
|
// Length-aware variant: with exactly 4 bytes available only the literal is
// compared; with more, the terminating byte is validated as well; with
// fewer, the atom cannot match.
simdjson_warn_unused
simdjson_inline bool is_valid_true_atom(const uint8_t *src, size_t len) {
  if (len < 4) { return false; }
  if (len == 4) { return str4ncmp(src, "true") == 0; }
  return is_valid_true_atom(src);
}
|
|
|
|
// True when src[1..4] is "alse" and src[5] is a structural character or
// whitespace. The first byte (src[0]) is not examined here.
simdjson_warn_unused
simdjson_inline bool is_valid_false_atom(const uint8_t *src) {
  const uint32_t mismatch = str4ncmp(src+1, "alse");
  const uint32_t bad_terminator = jsoncharutils::is_not_structural_or_whitespace(src[5]);
  return (mismatch | bad_terminator) == 0;
}
|
|
|
|
// Length-aware variant: with exactly 5 bytes available only "alse" is
// compared (at src+1); with more, the terminating byte is validated as well;
// with fewer, the atom cannot match.
simdjson_warn_unused
simdjson_inline bool is_valid_false_atom(const uint8_t *src, size_t len) {
  if (len < 5) { return false; }
  if (len == 5) { return str4ncmp(src+1, "alse") == 0; }
  return is_valid_false_atom(src);
}
|
|
|
|
// True when src starts with "null" and the byte that follows (src[4]) is a
// structural character or whitespace. Reads five bytes from src.
simdjson_warn_unused
simdjson_inline bool is_valid_null_atom(const uint8_t *src) {
  const uint32_t mismatch = str4ncmp(src, "null");
  const uint32_t bad_terminator = jsoncharutils::is_not_structural_or_whitespace(src[4]);
  return (mismatch | bad_terminator) == 0;
}
|
|
|
|
// Length-aware variant: with exactly 4 bytes available only the literal is
// compared; with more, the terminating byte is validated as well; with
// fewer, the atom cannot match.
simdjson_warn_unused
simdjson_inline bool is_valid_null_atom(const uint8_t *src, size_t len) {
  if (len < 4) { return false; }
  if (len == 4) { return str4ncmp(src, "null") == 0; }
  return is_valid_null_atom(src);
}
|
|
|
|
} // namespace atomparsing
|
|
} // unnamed namespace
|
|
} // namespace lasx
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_GENERIC_ATOMPARSING_H
|
|
/* end file simdjson/generic/atomparsing.h for lasx */
|
|
/* including simdjson/generic/dom_parser_implementation.h for lasx: #include "simdjson/generic/dom_parser_implementation.h" */
|
|
/* begin file simdjson/generic/dom_parser_implementation.h for lasx */
|
|
#ifndef SIMDJSON_GENERIC_DOM_PARSER_IMPLEMENTATION_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_GENERIC_DOM_PARSER_IMPLEMENTATION_H */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/generic/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/dom_parser_implementation.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace lasx {
|
|
|
|
// Bookkeeping for one currently open scope ('[' or '{').
// The struct is expected to occupy exactly 8 bytes (64 bits).
struct open_container {
  // position on the tape where the scope ([,{) begins
  uint32_t tape_index;
  // number of elements seen in the scope
  uint32_t count;
}; // struct open_container

static_assert(sizeof(open_container) == 8, "Open container must be 64 bits");
|
|
|
|
class dom_parser_implementation final : public internal::dom_parser_implementation {
|
|
public:
|
|
/** Tape location of each open { or [ */
|
|
std::unique_ptr<open_container[]> open_containers{};
|
|
/** Whether each open container is a [ or { */
|
|
std::unique_ptr<bool[]> is_array{};
|
|
/** Buffer passed to stage 1 */
|
|
const uint8_t *buf{};
|
|
/** Length passed to stage 1 */
|
|
size_t len{0};
|
|
/** Document passed to stage 2 */
|
|
dom::document *doc{};
|
|
|
|
inline dom_parser_implementation() noexcept;
|
|
inline dom_parser_implementation(dom_parser_implementation &&other) noexcept;
|
|
inline dom_parser_implementation &operator=(dom_parser_implementation &&other) noexcept;
|
|
dom_parser_implementation(const dom_parser_implementation &) = delete;
|
|
dom_parser_implementation &operator=(const dom_parser_implementation &) = delete;
|
|
|
|
simdjson_warn_unused error_code parse(const uint8_t *buf, size_t len, dom::document &doc) noexcept final;
|
|
simdjson_warn_unused error_code stage1(const uint8_t *buf, size_t len, stage1_mode partial) noexcept final;
|
|
simdjson_warn_unused error_code stage2(dom::document &doc) noexcept final;
|
|
simdjson_warn_unused error_code stage2_next(dom::document &doc) noexcept final;
|
|
simdjson_warn_unused uint8_t *parse_string(const uint8_t *src, uint8_t *dst, bool allow_replacement) const noexcept final;
|
|
simdjson_warn_unused uint8_t *parse_wobbly_string(const uint8_t *src, uint8_t *dst) const noexcept final;
|
|
inline simdjson_warn_unused error_code set_capacity(size_t capacity) noexcept final;
|
|
inline simdjson_warn_unused error_code set_max_depth(size_t max_depth) noexcept final;
|
|
private:
|
|
simdjson_inline simdjson_warn_unused error_code set_capacity_stage1(size_t capacity);
|
|
|
|
};
|
|
|
|
} // namespace lasx
|
|
} // namespace simdjson
|
|
|
|
namespace simdjson {
|
|
namespace lasx {
|
|
|
|
// Default construction, move construction and move assignment are all
// compiler-generated.
inline dom_parser_implementation::dom_parser_implementation() noexcept
  = default;
inline dom_parser_implementation::dom_parser_implementation(
    dom_parser_implementation &&other) noexcept
  = default;
inline dom_parser_implementation &dom_parser_implementation::operator=(
    dom_parser_implementation &&other) noexcept
  = default;
|
|
|
|
// Leaving these here so they can be inlined if so desired
|
|
inline simdjson_warn_unused error_code dom_parser_implementation::set_capacity(size_t capacity) noexcept {
|
|
if(capacity > SIMDJSON_MAXSIZE_BYTES) { return CAPACITY; }
|
|
// Stage 1 index output
|
|
size_t max_structures = SIMDJSON_ROUNDUP_N(capacity, 64) + 2 + 7;
|
|
structural_indexes.reset( new (std::nothrow) uint32_t[max_structures] );
|
|
if (!structural_indexes) { _capacity = 0; return MEMALLOC; }
|
|
structural_indexes[0] = 0;
|
|
n_structural_indexes = 0;
|
|
|
|
_capacity = capacity;
|
|
return SUCCESS;
|
|
}
|
|
|
|
// Allocates the stage 2 stacks (open_containers and is_array) with
// `max_depth` entries each. Both buffers are replaced before the result is
// checked; if either allocation failed, the recorded maximum depth is reset
// to 0 and MEMALLOC is returned. Otherwise max_depth is recorded and SUCCESS
// is returned.
inline simdjson_warn_unused error_code dom_parser_implementation::set_max_depth(size_t max_depth) noexcept {
  // Stage 2 stacks
  open_containers.reset(new (std::nothrow) open_container[max_depth]);
  is_array.reset(new (std::nothrow) bool[max_depth]);

  const bool both_allocated = is_array && open_containers;
  if (!both_allocated) {
    _max_depth = 0;
    return MEMALLOC;
  }

  _max_depth = max_depth;
  return SUCCESS;
}
|
|
|
|
} // namespace lasx
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_GENERIC_DOM_PARSER_IMPLEMENTATION_H
|
|
/* end file simdjson/generic/dom_parser_implementation.h for lasx */
|
|
/* including simdjson/generic/implementation_simdjson_result_base.h for lasx: #include "simdjson/generic/implementation_simdjson_result_base.h" */
|
|
/* begin file simdjson/generic/implementation_simdjson_result_base.h for lasx */
|
|
#ifndef SIMDJSON_GENERIC_IMPLEMENTATION_SIMDJSON_RESULT_BASE_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_GENERIC_IMPLEMENTATION_SIMDJSON_RESULT_BASE_H */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/generic/base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace lasx {
|
|
|
|
// This is a near copy of include/error.h's implementation_simdjson_result_base, except it doesn't use std::pair
|
|
// so we can avoid inlining errors
|
|
// TODO reconcile these!
|
|
/**
|
|
* The result of a simdjson operation that could fail.
|
|
*
|
|
* Gives the option of reading error codes, or throwing an exception by casting to the desired result.
|
|
*
|
|
* This is a base class for implementations that want to add functions to the result type for
|
|
* chaining.
|
|
*
|
|
* Override like:
|
|
*
|
|
* struct simdjson_result<T> : public internal::implementation_simdjson_result_base<T> {
|
|
* simdjson_result() noexcept : internal::implementation_simdjson_result_base<T>() {}
|
|
* simdjson_result(error_code error) noexcept : internal::implementation_simdjson_result_base<T>(error) {}
|
|
* simdjson_result(T &&value) noexcept : internal::implementation_simdjson_result_base<T>(std::forward(value)) {}
|
|
* simdjson_result(T &&value, error_code error) noexcept : internal::implementation_simdjson_result_base<T>(value, error) {}
|
|
* // Your extra methods here
|
|
* }
|
|
*
|
|
* Then any method returning simdjson_result<T> will be chainable with your methods.
|
|
*/
|
|
template<typename T>
|
|
struct implementation_simdjson_result_base {
|
|
|
|
/**
|
|
* Create a new empty result with error = UNINITIALIZED.
|
|
*/
|
|
simdjson_inline implementation_simdjson_result_base() noexcept = default;
|
|
|
|
/**
|
|
* Create a new error result.
|
|
*/
|
|
simdjson_inline implementation_simdjson_result_base(error_code error) noexcept;
|
|
|
|
/**
|
|
* Create a new successful result.
|
|
*/
|
|
simdjson_inline implementation_simdjson_result_base(T &&value) noexcept;
|
|
|
|
/**
|
|
* Create a new result with both things (use if you don't want to branch when creating the result).
|
|
*/
|
|
simdjson_inline implementation_simdjson_result_base(T &&value, error_code error) noexcept;
|
|
|
|
/**
|
|
* Move the value and the error to the provided variables.
|
|
*
|
|
* @param value The variable to assign the value to. May not be set if there is an error.
|
|
* @param error The variable to assign the error to. Set to SUCCESS if there is no error.
|
|
*/
|
|
simdjson_inline void tie(T &value, error_code &error) && noexcept;
|
|
|
|
/**
|
|
* Move the value to the provided variable.
|
|
*
|
|
* @param value The variable to assign the value to. May not be set if there is an error.
|
|
*/
|
|
simdjson_inline error_code get(T &value) && noexcept;
|
|
|
|
/**
|
|
* The error.
|
|
*/
|
|
simdjson_inline error_code error() const noexcept;
|
|
|
|
#if SIMDJSON_EXCEPTIONS
|
|
|
|
/**
|
|
* Get the result value.
|
|
*
|
|
* @throw simdjson_error if there was an error.
|
|
*/
|
|
simdjson_inline T& value() & noexcept(false);
|
|
|
|
/**
|
|
* Take the result value (move it).
|
|
*
|
|
* @throw simdjson_error if there was an error.
|
|
*/
|
|
simdjson_inline T&& value() && noexcept(false);
|
|
|
|
/**
|
|
* Take the result value (move it).
|
|
*
|
|
* @throw simdjson_error if there was an error.
|
|
*/
|
|
simdjson_inline T&& take_value() && noexcept(false);
|
|
|
|
/**
|
|
* Cast to the value (will throw on error).
|
|
*
|
|
* @throw simdjson_error if there was an error.
|
|
*/
|
|
simdjson_inline operator T&&() && noexcept(false);
|
|
|
|
|
|
#endif // SIMDJSON_EXCEPTIONS
|
|
|
|
/**
|
|
* Get the result value. This function is safe if and only if
|
|
* the error() method returns a value that evaluates to false.
|
|
*/
|
|
simdjson_inline const T& value_unsafe() const& noexcept;
|
|
/**
|
|
* Get the result value. This function is safe if and only if
|
|
* the error() method returns a value that evaluates to false.
|
|
*/
|
|
simdjson_inline T& value_unsafe() & noexcept;
|
|
/**
|
|
* Take the result value (move it). This function is safe if and only if
|
|
* the error() method returns a value that evaluates to false.
|
|
*/
|
|
simdjson_inline T&& value_unsafe() && noexcept;
|
|
protected:
|
|
/** users should never directly access first and second. **/
|
|
T first{}; /** Users should never directly access 'first'. **/
|
|
error_code second{UNINITIALIZED}; /** Users should never directly access 'second'. **/
|
|
}; // struct implementation_simdjson_result_base
|
|
|
|
} // namespace lasx
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_GENERIC_IMPLEMENTATION_SIMDJSON_RESULT_BASE_H
|
|
/* end file simdjson/generic/implementation_simdjson_result_base.h for lasx */
|
|
/* including simdjson/generic/numberparsing.h for lasx: #include "simdjson/generic/numberparsing.h" */
|
|
/* begin file simdjson/generic/numberparsing.h for lasx */
|
|
#ifndef SIMDJSON_GENERIC_NUMBERPARSING_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_GENERIC_NUMBERPARSING_H */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/generic/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/generic/jsoncharutils.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/numberparsing_tables.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
#include <limits>
|
|
#include <ostream>
|
|
#include <cstring>
|
|
|
|
namespace simdjson {
|
|
namespace lasx {
|
|
namespace numberparsing {
|
|
|
|
// Reporting/writing hooks used by the number parser.
//
// In a regular build the macros simply yield the error code or forward the
// value to the writer. When JSON_TEST_NUMBERS is defined, each macro first
// invokes the matching found_*() callback with the value and/or the source
// pointer, and then does the same work as the regular variant.
#ifndef JSON_TEST_NUMBERS
#define INVALID_NUMBER(SRC) (NUMBER_ERROR)
#define WRITE_INTEGER(VALUE, SRC, WRITER) (WRITER).append_s64((VALUE))
#define WRITE_UNSIGNED(VALUE, SRC, WRITER) (WRITER).append_u64((VALUE))
#define WRITE_DOUBLE(VALUE, SRC, WRITER) (WRITER).append_double((VALUE))
#define BIGINT_NUMBER(SRC) (BIGINT_ERROR)
#else
#define INVALID_NUMBER(SRC) (found_invalid_number((SRC)), NUMBER_ERROR)
#define WRITE_INTEGER(VALUE, SRC, WRITER) (found_integer((VALUE), (SRC)), (WRITER).append_s64((VALUE)))
#define WRITE_UNSIGNED(VALUE, SRC, WRITER) (found_unsigned_integer((VALUE), (SRC)), (WRITER).append_u64((VALUE)))
#define WRITE_DOUBLE(VALUE, SRC, WRITER) (found_float((VALUE), (SRC)), (WRITER).append_double((VALUE)))
#define BIGINT_NUMBER(SRC) (found_invalid_number((SRC)), BIGINT_ERROR)
#endif
|
|
|
|
namespace {
|
|
|
|
// Convert a mantissa, an exponent and a sign bit into an ieee64 double.
|
|
// The real_exponent needs to be in [0, 2046] (technically real_exponent = 2047 would be acceptable).
|
|
// The mantissa should be in [0,1<<53). The bit at index (1ULL << 52) while be zeroed.
|
|
simdjson_inline double to_double(uint64_t mantissa, uint64_t real_exponent, bool negative) {
|
|
double d;
|
|
mantissa &= ~(1ULL << 52);
|
|
mantissa |= real_exponent << 52;
|
|
mantissa |= ((static_cast<uint64_t>(negative)) << 63);
|
|
std::memcpy(&d, &mantissa, sizeof(d));
|
|
return d;
|
|
}
|
|
|
|
// Attempts to compute i * 10^(power) exactly; and if "negative" is
|
|
// true, negate the result.
|
|
// This function will only work in some cases, when it does not work, success is
|
|
// set to false. This should work *most of the time* (like 99% of the time).
|
|
// We assume that power is in the [smallest_power,
|
|
// largest_power] interval: the caller is responsible for this check.
|
|
simdjson_inline bool compute_float_64(int64_t power, uint64_t i, bool negative, double &d) {
|
|
// we start with a fast path
|
|
// It was described in
|
|
// Clinger WD. How to read floating point numbers accurately.
|
|
// ACM SIGPLAN Notices. 1990
|
|
#ifndef FLT_EVAL_METHOD
|
|
#error "FLT_EVAL_METHOD should be defined, please include cfloat."
|
|
#endif
|
|
#if (FLT_EVAL_METHOD != 1) && (FLT_EVAL_METHOD != 0)
|
|
// We cannot be certain that x/y is rounded to nearest.
|
|
if (0 <= power && power <= 22 && i <= 9007199254740991)
|
|
#else
|
|
if (-22 <= power && power <= 22 && i <= 9007199254740991)
|
|
#endif
|
|
{
|
|
// convert the integer into a double. This is lossless since
|
|
// 0 <= i <= 2^53 - 1.
|
|
d = double(i);
|
|
//
|
|
// The general idea is as follows.
|
|
// If 0 <= s < 2^53 and if 10^0 <= p <= 10^22 then
|
|
// 1) Both s and p can be represented exactly as 64-bit floating-point
|
|
// values
|
|
// (binary64).
|
|
// 2) Because s and p can be represented exactly as floating-point values,
|
|
// then s * p
|
|
// and s / p will produce correctly rounded values.
|
|
//
|
|
if (power < 0) {
|
|
d = d / simdjson::internal::power_of_ten[-power];
|
|
} else {
|
|
d = d * simdjson::internal::power_of_ten[power];
|
|
}
|
|
if (negative) {
|
|
d = -d;
|
|
}
|
|
return true;
|
|
}
|
|
// When 22 < power && power < 22 + 16, we could
|
|
// hope for another, secondary fast path. It was
|
|
// described by David M. Gay in "Correctly rounded
|
|
// binary-decimal and decimal-binary conversions." (1990)
|
|
// If you need to compute i * 10^(22 + x) for x < 16,
|
|
// first compute i * 10^x, if you know that result is exact
|
|
// (e.g., when i * 10^x < 2^53),
|
|
// then you can still proceed and do (i * 10^x) * 10^22.
|
|
// Is this worth your time?
|
|
// You need 22 < power *and* power < 22 + 16 *and* (i * 10^(x-22) < 2^53)
|
|
// for this second fast path to work.
|
|
// If you you have 22 < power *and* power < 22 + 16, and then you
|
|
// optimistically compute "i * 10^(x-22)", there is still a chance that you
|
|
// have wasted your time if i * 10^(x-22) >= 2^53. It makes the use cases of
|
|
// this optimization maybe less common than we would like. Source:
|
|
// http://www.exploringbinary.com/fast-path-decimal-to-floating-point-conversion/
|
|
// also used in RapidJSON: https://rapidjson.org/strtod_8h_source.html
|
|
|
|
// The fast path has now failed, so we are failing back on the slower path.
|
|
|
|
// In the slow path, we need to adjust i so that it is > 1<<63 which is always
|
|
// possible, except if i == 0, so we handle i == 0 separately.
|
|
if(i == 0) {
|
|
d = negative ? -0.0 : 0.0;
|
|
return true;
|
|
}
|
|
|
|
|
|
// The exponent is 1024 + 63 + power
|
|
// + floor(log(5**power)/log(2)).
|
|
// The 1024 comes from the ieee64 standard.
|
|
// The 63 comes from the fact that we use a 64-bit word.
|
|
//
|
|
// Computing floor(log(5**power)/log(2)) could be
|
|
// slow. Instead we use a fast function.
|
|
//
|
|
// For power in (-400,350), we have that
|
|
// (((152170 + 65536) * power ) >> 16);
|
|
// is equal to
|
|
// floor(log(5**power)/log(2)) + power when power >= 0
|
|
// and it is equal to
|
|
// ceil(log(5**-power)/log(2)) + power when power < 0
|
|
//
|
|
// The 65536 is (1<<16) and corresponds to
|
|
// (65536 * power) >> 16 ---> power
|
|
//
|
|
// ((152170 * power ) >> 16) is equal to
|
|
// floor(log(5**power)/log(2))
|
|
//
|
|
// Note that this is not magic: 152170/(1<<16) is
|
|
// approximatively equal to log(5)/log(2).
|
|
// The 1<<16 value is a power of two; we could use a
|
|
// larger power of 2 if we wanted to.
|
|
//
|
|
int64_t exponent = (((152170 + 65536) * power) >> 16) + 1024 + 63;
|
|
|
|
|
|
// We want the most significant bit of i to be 1. Shift if needed.
|
|
int lz = leading_zeroes(i);
|
|
i <<= lz;
|
|
|
|
|
|
// We are going to need to do some 64-bit arithmetic to get a precise product.
|
|
// We use a table lookup approach.
|
|
// It is safe because
|
|
// power >= smallest_power
|
|
// and power <= largest_power
|
|
// We recover the mantissa of the power, it has a leading 1. It is always
|
|
// rounded down.
|
|
//
|
|
// We want the most significant 64 bits of the product. We know
|
|
// this will be non-zero because the most significant bit of i is
|
|
// 1.
|
|
const uint32_t index = 2 * uint32_t(power - simdjson::internal::smallest_power);
|
|
// Optimization: It may be that materializing the index as a variable might confuse some compilers and prevent effective complex-addressing loads. (Done for code clarity.)
|
|
//
|
|
// The full_multiplication function computes the 128-bit product of two 64-bit words
|
|
// with a returned value of type value128 with a "low component" corresponding to the
|
|
// 64-bit least significant bits of the product and with a "high component" corresponding
|
|
// to the 64-bit most significant bits of the product.
|
|
simdjson::internal::value128 firstproduct = full_multiplication(i, simdjson::internal::power_of_five_128[index]);
|
|
// Both i and power_of_five_128[index] have their most significant bit set to 1 which
|
|
// implies that the either the most or the second most significant bit of the product
|
|
// is 1. We pack values in this manner for efficiency reasons: it maximizes the use
|
|
// we make of the product. It also makes it easy to reason about the product: there
|
|
// is 0 or 1 leading zero in the product.
|
|
|
|
// Unless the least significant 9 bits of the high (64-bit) part of the full
|
|
// product are all 1s, then we know that the most significant 55 bits are
|
|
// exact and no further work is needed. Having 55 bits is necessary because
|
|
// we need 53 bits for the mantissa but we have to have one rounding bit and
|
|
// we can waste a bit if the most significant bit of the product is zero.
|
|
if((firstproduct.high & 0x1FF) == 0x1FF) {
|
|
// We want to compute i * 5^q, but only care about the top 55 bits at most.
|
|
// Consider the scenario where q>=0. Then 5^q may not fit in 64-bits. Doing
|
|
// the full computation is wasteful. So we do what is called a "truncated
|
|
// multiplication".
|
|
// We take the most significant 64-bits, and we put them in
|
|
// power_of_five_128[index]. Usually, that's good enough to approximate i * 5^q
|
|
// to the desired approximation using one multiplication. Sometimes it does not suffice.
|
|
// Then we store the next most significant 64 bits in power_of_five_128[index + 1], and
|
|
// then we get a better approximation to i * 5^q.
|
|
//
|
|
// That's for when q>=0. The logic for q<0 is somewhat similar but it is somewhat
|
|
// more complicated.
|
|
//
|
|
// There is an extra layer of complexity in that we need more than 55 bits of
|
|
// accuracy in the round-to-even scenario.
|
|
//
|
|
// The full_multiplication function computes the 128-bit product of two 64-bit words
|
|
// with a returned value of type value128 with a "low component" corresponding to the
|
|
// 64-bit least significant bits of the product and with a "high component" corresponding
|
|
// to the 64-bit most significant bits of the product.
|
|
simdjson::internal::value128 secondproduct = full_multiplication(i, simdjson::internal::power_of_five_128[index + 1]);
|
|
firstproduct.low += secondproduct.high;
|
|
if(secondproduct.high > firstproduct.low) { firstproduct.high++; }
|
|
// As it has been proven by Noble Mushtak and Daniel Lemire in "Fast Number Parsing Without
|
|
// Fallback" (https://arxiv.org/abs/2212.06644), at this point we are sure that the product
|
|
// is sufficiently accurate, and more computation is not needed.
|
|
}
|
|
uint64_t lower = firstproduct.low;
|
|
uint64_t upper = firstproduct.high;
|
|
// The final mantissa should be 53 bits with a leading 1.
|
|
// We shift it so that it occupies 54 bits with a leading 1.
|
|
///////
|
|
uint64_t upperbit = upper >> 63;
|
|
uint64_t mantissa = upper >> (upperbit + 9);
|
|
lz += int(1 ^ upperbit);
|
|
|
|
// Here we have mantissa < (1<<54).
|
|
int64_t real_exponent = exponent - lz;
|
|
if (simdjson_unlikely(real_exponent <= 0)) { // we have a subnormal?
|
|
// Here have that real_exponent <= 0 so -real_exponent >= 0
|
|
if(-real_exponent + 1 >= 64) { // if we have more than 64 bits below the minimum exponent, you have a zero for sure.
|
|
d = negative ? -0.0 : 0.0;
|
|
return true;
|
|
}
|
|
// next line is safe because -real_exponent + 1 < 0
|
|
mantissa >>= -real_exponent + 1;
|
|
// Thankfully, we can't have both "round-to-even" and subnormals because
|
|
// "round-to-even" only occurs for powers close to 0.
|
|
mantissa += (mantissa & 1); // round up
|
|
mantissa >>= 1;
|
|
// There is a weird scenario where we don't have a subnormal but just.
|
|
// Suppose we start with 2.2250738585072013e-308, we end up
|
|
// with 0x3fffffffffffff x 2^-1023-53 which is technically subnormal
|
|
// whereas 0x40000000000000 x 2^-1023-53 is normal. Now, we need to round
|
|
// up 0x3fffffffffffff x 2^-1023-53 and once we do, we are no longer
|
|
// subnormal, but we can only know this after rounding.
|
|
// So we only declare a subnormal if we are smaller than the threshold.
|
|
real_exponent = (mantissa < (uint64_t(1) << 52)) ? 0 : 1;
|
|
d = to_double(mantissa, real_exponent, negative);
|
|
return true;
|
|
}
|
|
// We have to round to even. The "to even" part
|
|
// is only a problem when we are right in between two floats
|
|
// which we guard against.
|
|
// If we have lots of trailing zeros, we may fall right between two
|
|
// floating-point values.
|
|
//
|
|
// The round-to-even cases take the form of a number 2m+1 which is in (2^53,2^54]
|
|
// times a power of two. That is, it is right between a number with binary significand
|
|
// m and another number with binary significand m+1; and it must be the case
|
|
// that it cannot be represented by a float itself.
|
|
//
|
|
// We must have that w * 10 ^q == (2m+1) * 2^p for some power of two 2^p.
|
|
// Recall that 10^q = 5^q * 2^q.
|
|
// When q >= 0, we must have that (2m+1) is divible by 5^q, so 5^q <= 2^54. We have that
|
|
// 5^23 <= 2^54 and it is the last power of five to qualify, so q <= 23.
|
|
// When q<0, we have w >= (2m+1) x 5^{-q}. We must have that w<2^{64} so
|
|
// (2m+1) x 5^{-q} < 2^{64}. We have that 2m+1>2^{53}. Hence, we must have
|
|
// 2^{53} x 5^{-q} < 2^{64}.
|
|
// Hence we have 5^{-q} < 2^{11}$ or q>= -4.
|
|
//
|
|
// We require lower <= 1 and not lower == 0 because we could not prove that
|
|
// that lower == 0 is implied; but we could prove that lower <= 1 is a necessary and sufficient test.
|
|
if (simdjson_unlikely((lower <= 1) && (power >= -4) && (power <= 23) && ((mantissa & 3) == 1))) {
|
|
if((mantissa << (upperbit + 64 - 53 - 2)) == upper) {
|
|
mantissa &= ~1; // flip it so that we do not round up
|
|
}
|
|
}
|
|
|
|
mantissa += mantissa & 1;
|
|
mantissa >>= 1;
|
|
|
|
// Here we have mantissa < (1<<53), unless there was an overflow
|
|
if (mantissa >= (1ULL << 53)) {
|
|
//////////
|
|
// This will happen when parsing values such as 7.2057594037927933e+16
|
|
////////
|
|
mantissa = (1ULL << 52);
|
|
real_exponent++;
|
|
}
|
|
mantissa &= ~(1ULL << 52);
|
|
// we have to check that real_exponent is in range, otherwise we bail out
|
|
if (simdjson_unlikely(real_exponent > 2046)) {
|
|
// We have an infinite value!!! We could actually throw an error here if we could.
|
|
return false;
|
|
}
|
|
d = to_double(mantissa, real_exponent, negative);
|
|
return true;
|
|
}
|
|
|
|
// We call a fallback floating-point parser that might be slow. Note
|
|
// it will accept JSON numbers, but the JSON spec. is more restrictive so
|
|
// before you call parse_float_fallback, you need to have validated the input
|
|
// string with the JSON grammar.
|
|
// It will return an error (false) if the parsed number is infinite.
|
|
// The string parsing itself always succeeds. We know that there is at least
|
|
// one digit.
|
|
static bool parse_float_fallback(const uint8_t *ptr, double *outDouble) {
|
|
*outDouble = simdjson::internal::from_chars(reinterpret_cast<const char *>(ptr));
|
|
// We do not accept infinite values.
|
|
|
|
// Detecting finite values in a portable manner is ridiculously hard, ideally
|
|
// we would want to do:
|
|
// return !std::isfinite(*outDouble);
|
|
// but that mysteriously fails under legacy/old libc++ libraries, see
|
|
// https://github.com/simdjson/simdjson/issues/1286
|
|
//
|
|
// Therefore, fall back to this solution (the extra parens are there
|
|
// to handle that max may be a macro on windows).
|
|
return !(*outDouble > (std::numeric_limits<double>::max)() || *outDouble < std::numeric_limits<double>::lowest());
|
|
}
|
|
|
|
// Bounded variant of the fallback floating-point parser: the input is the
// range [ptr, end_ptr), which is handed to simdjson::internal::from_chars.
// The parsed value is always written to *outDouble.
//
// Returns false if the parsed number is infinite, true otherwise.
static bool parse_float_fallback(const uint8_t *ptr, const uint8_t *end_ptr, double *outDouble) {
  const char *const first = reinterpret_cast<const char *>(ptr);
  const char *const last = reinterpret_cast<const char *>(end_ptr);
  const double parsed = simdjson::internal::from_chars(first, last);
  *outDouble = parsed;
  // We do not accept infinite values.
  //
  // Ideally we would simply do
  //   return std::isfinite(*outDouble);
  // but that mysteriously fails under legacy/old libc++ libraries, see
  // https://github.com/simdjson/simdjson/issues/1286
  //
  // Therefore we compare against the finite range explicitly (the extra
  // parentheses around max are there because max may be a macro on windows).
  const bool above_range = parsed > (std::numeric_limits<double>::max)();
  const bool below_range = parsed < std::numeric_limits<double>::lowest();
  return !above_range && !below_range;
}
|
|
|
|
// check quickly whether the next 8 chars are made of digits
|
|
// at a glance, it looks better than Mula's
|
|
// http://0x80.pl/articles/swar-digits-validate.html
|
|
simdjson_inline bool is_made_of_eight_digits_fast(const uint8_t *chars) {
|
|
uint64_t val;
|
|
// this can read up to 7 bytes beyond the buffer size, but we require
|
|
// SIMDJSON_PADDING of padding
|
|
static_assert(7 <= SIMDJSON_PADDING, "SIMDJSON_PADDING must be bigger than 7");
|
|
std::memcpy(&val, chars, 8);
|
|
// a branchy method might be faster:
|
|
// return (( val & 0xF0F0F0F0F0F0F0F0 ) == 0x3030303030303030)
|
|
// && (( (val + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0 ) ==
|
|
// 0x3030303030303030);
|
|
return (((val & 0xF0F0F0F0F0F0F0F0) |
|
|
(((val + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0) >> 4)) ==
|
|
0x3333333333333333);
|
|
}
|
|
|
|
template<typename I>
|
|
SIMDJSON_NO_SANITIZE_UNDEFINED // We deliberately allow overflow here and check later
|
|
simdjson_inline bool parse_digit(const uint8_t c, I &i) {
|
|
const uint8_t digit = static_cast<uint8_t>(c - '0');
|
|
if (digit > 9) {
|
|
return false;
|
|
}
|
|
// PERF NOTE: multiplication by 10 is cheaper than arbitrary integer multiplication
|
|
i = 10 * i + digit; // might overflow, we will handle the overflow later
|
|
return true;
|
|
}
|
|
|
|
// Returns true when c is an ASCII digit ('0'..'9').
simdjson_inline bool is_digit(const uint8_t c) {
  // Characters below '0' wrap around to values >= 10 after the subtraction.
  const uint8_t offset_from_zero = static_cast<uint8_t>(c - '0');
  return offset_from_zero < 10;
}
|
|
|
|
// Parse the digits that follow the decimal separator.
//
// On entry p points just past the '.'. The digits are appended to the running
// integer i (which may overflow; callers check the digit count later), p is
// advanced past them, and exponent is set to minus the number of digits
// consumed. Returns INVALID_NUMBER(src) when no digit follows the separator,
// SUCCESS otherwise.
simdjson_inline error_code parse_decimal_after_separator(simdjson_unused const uint8_t *const src, const uint8_t *&p, uint64_t &i, int64_t &exponent) {
  // We continue with the fiction that we have an integer. If the floating
  // point number is representable as x * 10^z for some integer x that fits in
  // 53 bits, then we will be able to convert the integer back into a float in
  // a lossless manner.
  const uint8_t *const fraction_begin = p;

#if defined(SIMDJSON_SWAR_NUMBER_PARSING) && SIMDJSON_SWAR_NUMBER_PARSING
  // This helps if we have lots of decimals, which turns out to be frequent enough.
  if (is_made_of_eight_digits_fast(p)) {
    i = i * 100000000 + parse_eight_digits_unrolled(p);
    p += 8;
  }
#endif // SIMDJSON_SWAR_NUMBER_PARSING
  // Unrolling the first digit makes a small difference on some implementations (e.g. westmere)
  if (parse_digit(*p, i)) { ++p; }
  while (parse_digit(*p, i)) { ++p; }
  exponent = fraction_begin - p;
  // Decimal without digits (123.) is illegal
  if (p == fraction_begin) {
    return INVALID_NUMBER(src);
  }
  return SUCCESS;
}
|
|
|
|
// Parse the exponent part of a number.
//
// On entry p points just past the 'e'/'E'. An optional sign and the digits are
// consumed (p is advanced past them) and the signed exponent value is ADDED to
// exponent. Returns INVALID_NUMBER(src) when there is no digit, SUCCESS
// otherwise.
simdjson_inline error_code parse_exponent(simdjson_unused const uint8_t *const src, const uint8_t *&p, int64_t &exponent) {
  // Exp Sign: -123.456e[-]78
  const bool negative_exponent = (*p == '-');
  if (negative_exponent || *p == '+') { ++p; } // Skip + as well

  // Exponent: -123.456e-[78]
  const uint8_t *digits_begin = p;
  int64_t magnitude = 0;
  while (parse_digit(*p, magnitude)) { ++p; }
  // parse_digit may have overflowed. In particular it could overflow to
  // INT64_MIN, which cannot be negated, so overflow *must* be ruled out before
  // the sign is applied below.

  // Performance notes: it may seem like combining the two simdjson_unlikely
  // checks below into a single path would be faster. The reasoning is sound,
  // but the compiler may not oblige and may generate two distinct paths
  // anyway. It might be possible to do uint64_t(p - digits_begin - 1) >= 18,
  // but it could end up trading off instructions for a simdjson_likely branch,
  // an inconclusive gain.

  // If there were no digits, it's an error.
  if (simdjson_unlikely(p == digits_begin)) {
    return INVALID_NUMBER(src);
  }
  // magnitude now holds a valid non-negative exponent, unless it overflowed.

  // With more than 18 digits the integer may have overflowed.
  if (simdjson_unlikely((p - digits_begin) > 18)) {
    // Skip leading zeroes: 1e000000000000000000001 is technically valid and does not overflow
    while (*digits_begin == '0') { ++digits_begin; }
    // 19 digits could overflow int64_t and is kind of absurd anyway. We don't
    // support exponents smaller than -999,999,999,999,999,999 and bigger than
    // 999,999,999,999,999,999, so we truncate.
    // Note that 999999999999999999 is assuredly too large: the maximal ieee64
    // value before infinity is ~1.8e308 and the smallest subnormal is ~5e-324,
    // so we could actually truncate at 324.
    // There is no reason to fail per se at this point in time:
    // e.g., 0e999999999999999999999 is a fine number.
    if ((p - digits_begin) > 18) { magnitude = 999999999999999999; }
  }
  // At this point magnitude is a sane, non-negative signed integer that is
  // <= 999,999,999,999,999,999. As long as 'exponent' is in
  // [-8223372036854775808, 8223372036854775808] the addition cannot overflow,
  // and 'exponent' is bounded in magnitude by the size of the JSON input.
  const int64_t signed_exponent = negative_exponent ? -magnitude : magnitude;
  exponent += signed_exponent;
  return SUCCESS;
}
|
|
|
|
// Check whether the text at src, limited to max_length bytes, has the shape of
// an integer: an optional '-', then either a single '0' or a run of digits,
// followed by the end of the range or by a structural/whitespace character.
// Returns false when nothing follows the optional sign within the range.
simdjson_inline bool check_if_integer(const uint8_t *const src, size_t max_length) {
  const uint8_t *const end = src + max_length;
  // we can always read at least one character after the '-'
  const uint8_t *cursor = src + ((*src == '-') ? 1 : 0);
  if (cursor == end) { return false; }
  if (*cursor == '0') {
    // A leading zero must stand alone: only inspect what comes right after it.
    ++cursor;
  } else {
    while (cursor != end && is_digit(*cursor)) { ++cursor; }
  }
  // Either we consumed the whole range, or the next character must terminate
  // the number.
  return (cursor == end) || !jsoncharutils::is_not_structural_or_whitespace(*cursor);
}
|
|
|
|
// Estimate the number of significant digits in a number whose digit
// characters start at start_digits and whose length is digit_count.
// Leading '0' and '.' characters are skipped (this handles inputs such as
// 0.0000somenumber, where the integer may appear to have overflowed) and their
// count is subtracted from digit_count.
simdjson_inline size_t significant_digits(const uint8_t * start_digits, size_t digit_count) {
  size_t skipped = 0;
  while ((start_digits[skipped] == '0') || (start_digits[skipped] == '.')) { ++skipped; }
  // we over-decrement by one when there is a '.'
  return digit_count - skipped;
}
|
|
|
|
} // unnamed namespace
|
|
|
|
/** @private */
|
|
static error_code slow_float_parsing(simdjson_unused const uint8_t * src, double* answer) {
|
|
if (parse_float_fallback(src, answer)) {
|
|
return SUCCESS;
|
|
}
|
|
return INVALID_NUMBER(src);
|
|
}
|
|
|
|
/** @private */
|
|
template<typename W>
|
|
simdjson_inline error_code write_float(const uint8_t *const src, bool negative, uint64_t i, const uint8_t * start_digits, size_t digit_count, int64_t exponent, W &writer) {
|
|
// If we frequently had to deal with long strings of digits,
|
|
// we could extend our code by using a 128-bit integer instead
|
|
// of a 64-bit integer. However, this is uncommon in practice.
|
|
//
|
|
// 9999999999999999999 < 2**64 so we can accommodate 19 digits.
|
|
// If we have a decimal separator, then digit_count - 1 is the number of digits, but we
|
|
// may not have a decimal separator!
|
|
if (simdjson_unlikely(digit_count > 19 && significant_digits(start_digits, digit_count) > 19)) {
|
|
// Ok, chances are good that we had an overflow!
|
|
// this is almost never going to get called!!!
|
|
// we start anew, going slowly!!!
|
|
// This will happen in the following examples:
|
|
// 10000000000000000000000000000000000000000000e+308
|
|
// 3.1415926535897932384626433832795028841971693993751
|
|
//
|
|
// NOTE: We do not pass a reference to the to slow_float_parsing. If we passed our writer
|
|
// reference to it, it would force it to be stored in memory, preventing the compiler from
|
|
// picking it apart and putting into registers. i.e. if we pass it as reference,
|
|
// it gets slow.
|
|
double d;
|
|
error_code error = slow_float_parsing(src, &d);
|
|
writer.append_double(d);
|
|
return error;
|
|
}
|
|
// NOTE: it's weird that the simdjson_unlikely() only wraps half the if, but it seems to get slower any other
|
|
// way we've tried: https://github.com/simdjson/simdjson/pull/990#discussion_r448497331
|
|
// To future reader: we'd love if someone found a better way, or at least could explain this result!
|
|
if (simdjson_unlikely(exponent < simdjson::internal::smallest_power) || (exponent > simdjson::internal::largest_power)) {
|
|
//
|
|
// Important: smallest_power is such that it leads to a zero value.
|
|
// Observe that 18446744073709551615e-343 == 0, i.e. (2**64 - 1) e -343 is zero
|
|
// so something x 10^-343 goes to zero, but not so with something x 10^-342.
|
|
static_assert(simdjson::internal::smallest_power <= -342, "smallest_power is not small enough");
|
|
//
|
|
if((exponent < simdjson::internal::smallest_power) || (i == 0)) {
|
|
// E.g. Parse "-0.0e-999" into the same value as "-0.0". See https://en.wikipedia.org/wiki/Signed_zero
|
|
WRITE_DOUBLE(negative ? -0.0 : 0.0, src, writer);
|
|
return SUCCESS;
|
|
} else { // (exponent > largest_power) and (i != 0)
|
|
// We have, for sure, an infinite value and simdjson refuses to parse infinite values.
|
|
return INVALID_NUMBER(src);
|
|
}
|
|
}
|
|
double d;
|
|
if (!compute_float_64(exponent, i, negative, d)) {
|
|
// we are almost never going to get here.
|
|
if (!parse_float_fallback(src, &d)) { return INVALID_NUMBER(src); }
|
|
}
|
|
WRITE_DOUBLE(d, src, writer);
|
|
return SUCCESS;
|
|
}
|
|
|
|
// parse the number at src
|
|
// define JSON_TEST_NUMBERS for unit testing
|
|
//
|
|
// It is assumed that the number is followed by a structural ({,},],[) character
|
|
// or a white space character. If that is not the case (e.g., when the JSON
|
|
// document is made of a single number), then it is necessary to copy the
|
|
// content and append a space before calling this function.
|
|
//
|
|
// Our objective is accurate parsing (ULP of 0) at high speed.
|
|
template<typename W>
|
|
simdjson_inline error_code parse_number(const uint8_t *const src, W &writer);
|
|
|
|
// for performance analysis, it is sometimes useful to skip parsing
|
|
#ifdef SIMDJSON_SKIPNUMBERPARSING
|
|
|
|
template<typename W>
|
|
simdjson_inline error_code parse_number(const uint8_t *const, W &writer) {
|
|
writer.append_s64(0); // always write zero
|
|
return SUCCESS; // always succeeds
|
|
}
|
|
|
|
// SIMDJSON_SKIPNUMBERPARSING stubs: the input is ignored and a fixed
// placeholder is returned. The parameters are intentionally left unnamed (as
// in the stub parse_number) so that builds with -Wunused-parameter stay clean.
simdjson_unused simdjson_inline simdjson_result<uint64_t> parse_unsigned(const uint8_t * const) noexcept { return 0; }
simdjson_unused simdjson_inline simdjson_result<int64_t> parse_integer(const uint8_t * const) noexcept { return 0; }
simdjson_unused simdjson_inline simdjson_result<double> parse_double(const uint8_t * const) noexcept { return 0; }
simdjson_unused simdjson_inline simdjson_result<uint64_t> parse_unsigned_in_string(const uint8_t * const) noexcept { return 0; }
simdjson_unused simdjson_inline simdjson_result<int64_t> parse_integer_in_string(const uint8_t * const) noexcept { return 0; }
simdjson_unused simdjson_inline simdjson_result<double> parse_double_in_string(const uint8_t * const) noexcept { return 0; }
simdjson_unused simdjson_inline bool is_negative(const uint8_t *) noexcept { return false; }
simdjson_unused simdjson_inline simdjson_result<bool> is_integer(const uint8_t *) noexcept { return false; }
simdjson_unused simdjson_inline simdjson_result<number_type> get_number_type(const uint8_t *) noexcept { return number_type::signed_integer; }
|
|
#else
|
|
|
|
// parse the number at src
|
|
// define JSON_TEST_NUMBERS for unit testing
|
|
//
|
|
// It is assumed that the number is followed by a structural ({,},],[) character
|
|
// or a white space character. If that is not the case (e.g., when the JSON
|
|
// document is made of a single number), then it is necessary to copy the
|
|
// content and append a space before calling this function.
|
|
//
|
|
// Our objective is accurate parsing (ULP of 0) at high speed.
|
|
template<typename W>
|
|
simdjson_inline error_code parse_number(const uint8_t *const src, W &writer) {
|
|
|
|
//
|
|
// Check for minus sign
|
|
//
|
|
bool negative = (*src == '-');
|
|
const uint8_t *p = src + uint8_t(negative);
|
|
|
|
//
|
|
// Parse the integer part.
|
|
//
|
|
// PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
|
|
const uint8_t *const start_digits = p;
|
|
uint64_t i = 0;
|
|
while (parse_digit(*p, i)) { p++; }
|
|
|
|
// If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
|
|
// Optimization note: size_t is expected to be unsigned.
|
|
size_t digit_count = size_t(p - start_digits);
|
|
if (digit_count == 0 || ('0' == *start_digits && digit_count > 1)) { return INVALID_NUMBER(src); }
|
|
|
|
//
|
|
// Handle floats if there is a . or e (or both)
|
|
//
|
|
int64_t exponent = 0;
|
|
bool is_float = false;
|
|
if ('.' == *p) {
|
|
is_float = true;
|
|
++p;
|
|
SIMDJSON_TRY( parse_decimal_after_separator(src, p, i, exponent) );
|
|
digit_count = int(p - start_digits); // used later to guard against overflows
|
|
}
|
|
if (('e' == *p) || ('E' == *p)) {
|
|
is_float = true;
|
|
++p;
|
|
SIMDJSON_TRY( parse_exponent(src, p, exponent) );
|
|
}
|
|
if (is_float) {
|
|
const bool dirty_end = jsoncharutils::is_not_structural_or_whitespace(*p);
|
|
SIMDJSON_TRY( write_float(src, negative, i, start_digits, digit_count, exponent, writer) );
|
|
if (dirty_end) { return INVALID_NUMBER(src); }
|
|
return SUCCESS;
|
|
}
|
|
|
|
// The longest negative 64-bit number is 19 digits.
|
|
// The longest positive 64-bit number is 20 digits.
|
|
// We do it this way so we don't trigger this branch unless we must.
|
|
size_t longest_digit_count = negative ? 19 : 20;
|
|
if (digit_count > longest_digit_count) { return BIGINT_NUMBER(src); }
|
|
if (digit_count == longest_digit_count) {
|
|
if (negative) {
|
|
// Anything negative above INT64_MAX+1 is invalid
|
|
if (i > uint64_t(INT64_MAX)+1) { return BIGINT_NUMBER(src); }
|
|
WRITE_INTEGER(~i+1, src, writer);
|
|
if (jsoncharutils::is_not_structural_or_whitespace(*p)) { return INVALID_NUMBER(src); }
|
|
return SUCCESS;
|
|
// Positive overflow check:
|
|
// - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the
|
|
// biggest uint64_t.
|
|
// - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX.
|
|
// If we got here, it's a 20 digit number starting with the digit "1".
|
|
// - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller
|
|
// than 1,553,255,926,290,448,384.
|
|
// - That is smaller than the smallest possible 20-digit number the user could write:
|
|
// 10,000,000,000,000,000,000.
|
|
// - Therefore, if the number is positive and lower than that, it's overflow.
|
|
// - The value we are looking at is less than or equal to INT64_MAX.
|
|
//
|
|
} else if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INVALID_NUMBER(src); }
|
|
}
|
|
|
|
// Write unsigned if it does not fit in a signed integer.
|
|
if (i > uint64_t(INT64_MAX)) {
|
|
WRITE_UNSIGNED(i, src, writer);
|
|
} else {
|
|
WRITE_INTEGER(negative ? (~i+1) : i, src, writer);
|
|
}
|
|
if (jsoncharutils::is_not_structural_or_whitespace(*p)) { return INVALID_NUMBER(src); }
|
|
return SUCCESS;
|
|
}
|
|
|
|
// Inlineable functions
|
|
namespace {
|
|
|
|
// This table can be used to characterize the final character of an integer
|
|
// string. For JSON structural character and allowable white space characters,
|
|
// we return SUCCESS. For 'e', '.' and 'E', we return INCORRECT_TYPE. Otherwise
|
|
// we return NUMBER_ERROR.
|
|
// Optimization note: we could easily reduce the size of the table by half (to 128)
|
|
// at the cost of an extra branch.
|
|
// Optimization note: we want the values to use at most 8 bits (not, e.g., 32 bits):
|
|
static_assert(error_code(uint8_t(NUMBER_ERROR))== NUMBER_ERROR, "bad NUMBER_ERROR cast");
|
|
static_assert(error_code(uint8_t(SUCCESS))== SUCCESS, "bad NUMBER_ERROR cast");
|
|
static_assert(error_code(uint8_t(INCORRECT_TYPE))== INCORRECT_TYPE, "bad NUMBER_ERROR cast");
|
|
|
|
const uint8_t integer_string_finisher[256] = {
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, SUCCESS,
|
|
SUCCESS, NUMBER_ERROR, NUMBER_ERROR, SUCCESS, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, SUCCESS, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, SUCCESS,
|
|
NUMBER_ERROR, INCORRECT_TYPE, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, SUCCESS, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, INCORRECT_TYPE,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, SUCCESS, NUMBER_ERROR, SUCCESS, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, INCORRECT_TYPE, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, SUCCESS, NUMBER_ERROR,
|
|
SUCCESS, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
|
|
NUMBER_ERROR};
|
|
|
|
// Parse any number from 0 to 18,446,744,073,709,551,615
|
|
simdjson_unused simdjson_inline simdjson_result<uint64_t> parse_unsigned(const uint8_t * const src) noexcept {
|
|
const uint8_t *p = src;
|
|
//
|
|
// Parse the integer part.
|
|
//
|
|
// PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
|
|
const uint8_t *const start_digits = p;
|
|
uint64_t i = 0;
|
|
while (parse_digit(*p, i)) { p++; }
|
|
|
|
// If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
|
|
// Optimization note: size_t is expected to be unsigned.
|
|
size_t digit_count = size_t(p - start_digits);
|
|
// The longest positive 64-bit number is 20 digits.
|
|
// We do it this way so we don't trigger this branch unless we must.
|
|
// Optimization note: the compiler can probably merge
|
|
// ((digit_count == 0) || (digit_count > 20))
|
|
// into a single branch since digit_count is unsigned.
|
|
if ((digit_count == 0) || (digit_count > 20)) { return INCORRECT_TYPE; }
|
|
// Here digit_count > 0.
|
|
if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; }
|
|
// We can do the following...
|
|
// if (!jsoncharutils::is_structural_or_whitespace(*p)) {
|
|
// return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR;
|
|
// }
|
|
// as a single table lookup:
|
|
if (integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); }
|
|
|
|
if (digit_count == 20) {
|
|
// Positive overflow check:
|
|
// - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the
|
|
// biggest uint64_t.
|
|
// - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX.
|
|
// If we got here, it's a 20 digit number starting with the digit "1".
|
|
// - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller
|
|
// than 1,553,255,926,290,448,384.
|
|
// - That is smaller than the smallest possible 20-digit number the user could write:
|
|
// 10,000,000,000,000,000,000.
|
|
// - Therefore, if the number is positive and lower than that, it's overflow.
|
|
// - The value we are looking at is less than or equal to INT64_MAX.
|
|
//
|
|
if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INCORRECT_TYPE; }
|
|
}
|
|
|
|
return i;
|
|
}
|
|
|
|
|
|
// Parse any number from 0 to 18,446,744,073,709,551,615
|
|
// Never read at src_end or beyond
|
|
simdjson_unused simdjson_inline simdjson_result<uint64_t> parse_unsigned(const uint8_t * const src, const uint8_t * const src_end) noexcept {
|
|
const uint8_t *p = src;
|
|
//
|
|
// Parse the integer part.
|
|
//
|
|
// PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
|
|
const uint8_t *const start_digits = p;
|
|
uint64_t i = 0;
|
|
while ((p != src_end) && parse_digit(*p, i)) { p++; }
|
|
|
|
// If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
|
|
// Optimization note: size_t is expected to be unsigned.
|
|
size_t digit_count = size_t(p - start_digits);
|
|
// The longest positive 64-bit number is 20 digits.
|
|
// We do it this way so we don't trigger this branch unless we must.
|
|
// Optimization note: the compiler can probably merge
|
|
// ((digit_count == 0) || (digit_count > 20))
|
|
// into a single branch since digit_count is unsigned.
|
|
if ((digit_count == 0) || (digit_count > 20)) { return INCORRECT_TYPE; }
|
|
// Here digit_count > 0.
|
|
if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; }
|
|
// We can do the following...
|
|
// if (!jsoncharutils::is_structural_or_whitespace(*p)) {
|
|
// return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR;
|
|
// }
|
|
// as a single table lookup:
|
|
if ((p != src_end) && integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); }
|
|
|
|
if (digit_count == 20) {
|
|
// Positive overflow check:
|
|
// - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the
|
|
// biggest uint64_t.
|
|
// - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX.
|
|
// If we got here, it's a 20 digit number starting with the digit "1".
|
|
// - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller
|
|
// than 1,553,255,926,290,448,384.
|
|
// - That is smaller than the smallest possible 20-digit number the user could write:
|
|
// 10,000,000,000,000,000,000.
|
|
// - Therefore, if the number is positive and lower than that, it's overflow.
|
|
// - The value we are looking at is less than or equal to INT64_MAX.
|
|
//
|
|
if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INCORRECT_TYPE; }
|
|
}
|
|
|
|
return i;
|
|
}
|
|
|
|
// Parse any number from 0 to 18,446,744,073,709,551,615
|
|
simdjson_unused simdjson_inline simdjson_result<uint64_t> parse_unsigned_in_string(const uint8_t * const src) noexcept {
|
|
const uint8_t *p = src + 1;
|
|
//
|
|
// Parse the integer part.
|
|
//
|
|
// PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
|
|
const uint8_t *const start_digits = p;
|
|
uint64_t i = 0;
|
|
while (parse_digit(*p, i)) { p++; }
|
|
|
|
// If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
|
|
// Optimization note: size_t is expected to be unsigned.
|
|
size_t digit_count = size_t(p - start_digits);
|
|
// The longest positive 64-bit number is 20 digits.
|
|
// We do it this way so we don't trigger this branch unless we must.
|
|
// Optimization note: the compiler can probably merge
|
|
// ((digit_count == 0) || (digit_count > 20))
|
|
// into a single branch since digit_count is unsigned.
|
|
if ((digit_count == 0) || (digit_count > 20)) { return INCORRECT_TYPE; }
|
|
// Here digit_count > 0.
|
|
if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; }
|
|
// We can do the following...
|
|
// if (!jsoncharutils::is_structural_or_whitespace(*p)) {
|
|
// return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR;
|
|
// }
|
|
// as a single table lookup:
|
|
if (*p != '"') { return NUMBER_ERROR; }
|
|
|
|
if (digit_count == 20) {
|
|
// Positive overflow check:
|
|
// - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the
|
|
// biggest uint64_t.
|
|
// - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX.
|
|
// If we got here, it's a 20 digit number starting with the digit "1".
|
|
// - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller
|
|
// than 1,553,255,926,290,448,384.
|
|
// - That is smaller than the smallest possible 20-digit number the user could write:
|
|
// 10,000,000,000,000,000,000.
|
|
// - Therefore, if the number is positive and lower than that, it's overflow.
|
|
// - The value we are looking at is less than or equal to INT64_MAX.
|
|
//
|
|
// Note: we use src[1] and not src[0] because src[0] is the quote character in this
|
|
// instance.
|
|
if (src[1] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INCORRECT_TYPE; }
|
|
}
|
|
|
|
return i;
|
|
}
|
|
|
|
// Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
|
|
simdjson_unused simdjson_inline simdjson_result<int64_t> parse_integer(const uint8_t *src) noexcept {
|
|
//
|
|
// Check for minus sign
|
|
//
|
|
bool negative = (*src == '-');
|
|
const uint8_t *p = src + uint8_t(negative);
|
|
|
|
//
|
|
// Parse the integer part.
|
|
//
|
|
// PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
|
|
const uint8_t *const start_digits = p;
|
|
uint64_t i = 0;
|
|
while (parse_digit(*p, i)) { p++; }
|
|
|
|
// If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
|
|
// Optimization note: size_t is expected to be unsigned.
|
|
size_t digit_count = size_t(p - start_digits);
|
|
// We go from
|
|
// -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
|
|
// so we can never represent numbers that have more than 19 digits.
|
|
size_t longest_digit_count = 19;
|
|
// Optimization note: the compiler can probably merge
|
|
// ((digit_count == 0) || (digit_count > longest_digit_count))
|
|
// into a single branch since digit_count is unsigned.
|
|
if ((digit_count == 0) || (digit_count > longest_digit_count)) { return INCORRECT_TYPE; }
|
|
// Here digit_count > 0.
|
|
if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; }
|
|
// We can do the following...
|
|
// if (!jsoncharutils::is_structural_or_whitespace(*p)) {
|
|
// return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR;
|
|
// }
|
|
// as a single table lookup:
|
|
if(integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); }
|
|
// Negative numbers have can go down to - INT64_MAX - 1 whereas positive numbers are limited to INT64_MAX.
|
|
// Performance note: This check is only needed when digit_count == longest_digit_count but it is
|
|
// so cheap that we might as well always make it.
|
|
if(i > uint64_t(INT64_MAX) + uint64_t(negative)) { return INCORRECT_TYPE; }
|
|
return negative ? (~i+1) : i;
|
|
}
|
|
|
|
// Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
|
|
// Never read at src_end or beyond
|
|
simdjson_unused simdjson_inline simdjson_result<int64_t> parse_integer(const uint8_t * const src, const uint8_t * const src_end) noexcept {
|
|
//
|
|
// Check for minus sign
|
|
//
|
|
if(src == src_end) { return NUMBER_ERROR; }
|
|
bool negative = (*src == '-');
|
|
const uint8_t *p = src + uint8_t(negative);
|
|
|
|
//
|
|
// Parse the integer part.
|
|
//
|
|
// PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
|
|
const uint8_t *const start_digits = p;
|
|
uint64_t i = 0;
|
|
while ((p != src_end) && parse_digit(*p, i)) { p++; }
|
|
|
|
// If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
|
|
// Optimization note: size_t is expected to be unsigned.
|
|
size_t digit_count = size_t(p - start_digits);
|
|
// We go from
|
|
// -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
|
|
// so we can never represent numbers that have more than 19 digits.
|
|
size_t longest_digit_count = 19;
|
|
// Optimization note: the compiler can probably merge
|
|
// ((digit_count == 0) || (digit_count > longest_digit_count))
|
|
// into a single branch since digit_count is unsigned.
|
|
if ((digit_count == 0) || (digit_count > longest_digit_count)) { return INCORRECT_TYPE; }
|
|
// Here digit_count > 0.
|
|
if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; }
|
|
// We can do the following...
|
|
// if (!jsoncharutils::is_structural_or_whitespace(*p)) {
|
|
// return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR;
|
|
// }
|
|
// as a single table lookup:
|
|
if((p != src_end) && integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); }
|
|
// Negative numbers have can go down to - INT64_MAX - 1 whereas positive numbers are limited to INT64_MAX.
|
|
// Performance note: This check is only needed when digit_count == longest_digit_count but it is
|
|
// so cheap that we might as well always make it.
|
|
if(i > uint64_t(INT64_MAX) + uint64_t(negative)) { return INCORRECT_TYPE; }
|
|
return negative ? (~i+1) : i;
|
|
}
|
|
|
|
// Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
|
|
simdjson_unused simdjson_inline simdjson_result<int64_t> parse_integer_in_string(const uint8_t *src) noexcept {
|
|
//
|
|
// Check for minus sign
|
|
//
|
|
bool negative = (*(src + 1) == '-');
|
|
src += uint8_t(negative) + 1;
|
|
|
|
//
|
|
// Parse the integer part.
|
|
//
|
|
// PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
|
|
const uint8_t *const start_digits = src;
|
|
uint64_t i = 0;
|
|
while (parse_digit(*src, i)) { src++; }
|
|
|
|
// If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
|
|
// Optimization note: size_t is expected to be unsigned.
|
|
size_t digit_count = size_t(src - start_digits);
|
|
// We go from
|
|
// -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
|
|
// so we can never represent numbers that have more than 19 digits.
|
|
size_t longest_digit_count = 19;
|
|
// Optimization note: the compiler can probably merge
|
|
// ((digit_count == 0) || (digit_count > longest_digit_count))
|
|
// into a single branch since digit_count is unsigned.
|
|
if ((digit_count == 0) || (digit_count > longest_digit_count)) { return INCORRECT_TYPE; }
|
|
// Here digit_count > 0.
|
|
if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; }
|
|
// We can do the following...
|
|
// if (!jsoncharutils::is_structural_or_whitespace(*src)) {
|
|
// return (*src == '.' || *src == 'e' || *src == 'E') ? INCORRECT_TYPE : NUMBER_ERROR;
|
|
// }
|
|
// as a single table lookup:
|
|
if(*src != '"') { return NUMBER_ERROR; }
|
|
// Negative numbers have can go down to - INT64_MAX - 1 whereas positive numbers are limited to INT64_MAX.
|
|
// Performance note: This check is only needed when digit_count == longest_digit_count but it is
|
|
// so cheap that we might as well always make it.
|
|
if(i > uint64_t(INT64_MAX) + uint64_t(negative)) { return INCORRECT_TYPE; }
|
|
return negative ? (~i+1) : i;
|
|
}
|
|
|
|
// Parse the JSON number starting at src as a double.
// Errors:
//  - INCORRECT_TYPE: the text does not start with a digit (after the optional '-');
//  - NUMBER_ERROR: a leading zero followed by more integer digits, a '.' without a digit
//    after it, an exponent with no digits or more than 19 digits, a byte after the number
//    that is neither structural nor whitespace, or a failure of parse_float_fallback.
simdjson_unused simdjson_inline simdjson_result<double> parse_double(const uint8_t * src) noexcept {
  // Step over the optional minus sign; src now designates the first digit.
  const bool negative = (*src == '-');
  src += uint8_t(negative);

  // Integer part. The first digit is consumed on its own so that a leading zero can be
  // recognized before the remaining digits are accumulated.
  uint64_t mantissa = 0;
  const uint8_t *cursor = src;
  cursor += parse_digit(*cursor, mantissa);
  const bool leading_zero = (mantissa == 0);
  while (parse_digit(*cursor, mantissa)) { ++cursor; }
  if (cursor == src) { return INCORRECT_TYPE; }                     // no integer digits
  if (leading_zero && (cursor != src + 1)) { return NUMBER_ERROR; }  // 0123 (zero must be solo)

  // Decimal part. Each fractional digit lowers the decimal exponent by one.
  int64_t exponent = 0;
  bool overflow;
  if (simdjson_likely(*cursor == '.')) {
    ++cursor;
    const uint8_t * const fraction_begin = cursor;
    if (!parse_digit(*cursor, mantissa)) { return NUMBER_ERROR; }    // no decimal digits
    ++cursor;
    while (parse_digit(*cursor, mantissa)) { ++cursor; }
    exponent = -(cursor - fraction_begin);

    // Overflow check. More than 19 digits (minus the decimal) may be overflow.
    overflow = (cursor - src - 1) > 19;
    if (simdjson_unlikely(overflow && leading_zero)) {
      // Skip leading 0.00000 and see if it still overflows
      const uint8_t *significant = src + 2;
      while (*significant == '0') { ++significant; }
      overflow = (cursor - significant) > 19;
    }
  } else {
    overflow = (cursor - src) > 19;
  }

  // Exponent part: 'e' or 'E', an optional sign, then 1 to 19 digits.
  if ((*cursor == 'e') || (*cursor == 'E')) {
    ++cursor;
    const bool exp_neg = (*cursor == '-');
    cursor += exp_neg || (*cursor == '+');

    uint64_t exp = 0;
    const uint8_t * const exp_begin = cursor;
    while (parse_digit(*cursor, exp)) { ++cursor; }
    const auto exp_digits = cursor - exp_begin;
    if ((exp_digits == 0) || (exp_digits > 19)) { return NUMBER_ERROR; }

    exponent += exp_neg ? 0-exp : exp;
  }

  if (jsoncharutils::is_not_structural_or_whitespace(*cursor)) { return NUMBER_ERROR; }

  // Exponents outside [smallest_power, largest_power] are also sent to the fallback parser.
  overflow = overflow || exponent < simdjson::internal::smallest_power || exponent > simdjson::internal::largest_power;

  // Assemble the float from mantissa and exponent when possible; otherwise (or when
  // compute_float_64 declines) re-parse the original text with parse_float_fallback.
  double d;
  if (simdjson_likely(!overflow) && compute_float_64(exponent, mantissa, negative, d)) { return d; }
  if (!parse_float_fallback(src - uint8_t(negative), &d)) { return NUMBER_ERROR; }
  return d;
}
|
|
|
|
// Report whether the first byte at src is a minus sign.
simdjson_unused simdjson_inline bool is_negative(const uint8_t * src) noexcept {
  return src[0] == '-';
}
|
|
|
|
// Tell whether the number text at src is an integer: an optional '-', then one or more
// digits, then a structural or whitespace byte. Returns NUMBER_ERROR when no digit is
// found, true when the digits end on a structural or whitespace byte, false otherwise.
simdjson_unused simdjson_inline simdjson_result<bool> is_integer(const uint8_t * src) noexcept {
  // Step over the optional minus sign.
  if (*src == '-') { ++src; }

  // Advance over the digits; bytes below '0' wrap around to large uint8_t values.
  const uint8_t *digits_end = src;
  while (static_cast<uint8_t>(*digits_end - '0') <= 9) { ++digits_end; }
  if (digits_end == src) { return NUMBER_ERROR; }

  if (jsoncharutils::is_structural_or_whitespace(*digits_end)) {
    return true;
  } else {
    return false;
  }
}
|
|
|
|
// Classify the number text at src without converting it:
//  - NUMBER_ERROR when no digit follows the optional '-';
//  - floating_point_number when the digits are not followed by a structural or whitespace byte;
//  - big_integer when the integer is below -9223372036854775808 or at least 18446744073709551616;
//  - unsigned_integer when it is non-negative and at least 9223372036854775808;
//  - signed_integer otherwise.
// Magnitudes are compared as digit strings with memcmp, which orders equal-length decimal
// strings like the numbers they spell.
simdjson_unused simdjson_inline simdjson_result<number_type> get_number_type(const uint8_t * src) noexcept {
  // Step over the optional minus sign.
  const bool negative = (*src == '-');
  src += uint8_t(negative);

  // Advance over the digits; bytes below '0' wrap around to large uint8_t values.
  const uint8_t *digits_end = src;
  while (static_cast<uint8_t>(*digits_end - '0') <= 9) { ++digits_end; }
  const size_t digit_count = size_t(digits_end - src);
  if (digits_end == src) { return NUMBER_ERROR; }

  if (!jsoncharutils::is_structural_or_whitespace(*digits_end)) {
    // Hopefully, we have 'e' or 'E' or '.'.
    return number_type::floating_point_number;
  }

  // We have an integer. 9223372036854775808 is 2**63: the first magnitude that no longer
  // fits a positive int64_t, and the largest magnitude allowed for a negative one.
  static const uint8_t * smaller_big_integer = reinterpret_cast<const uint8_t *>("9223372036854775808");
  if (simdjson_unlikely(digit_count > 20)) {
    return number_type::big_integer;
  }

  // If the number is negative and valid, it must be a signed integer.
  if (negative) {
    if (simdjson_unlikely(digit_count > 19)) { return number_type::big_integer; }
    const bool below_int64_min = (digit_count == 19) && (memcmp(src, smaller_big_integer, 19) > 0);
    if (simdjson_unlikely(below_int64_min)) { return number_type::big_integer; }
    return number_type::signed_integer;
  }

  // Let us check if we have a big integer (>=2**64).
  static const uint8_t * two_to_sixtyfour = reinterpret_cast<const uint8_t *>("18446744073709551616");
  if ((digit_count > 20) || ((digit_count == 20) && (memcmp(src, two_to_sixtyfour, 20) >= 0))) {
    return number_type::big_integer;
  }

  // The number is positive and smaller than 18446744073709551616 (or 2**64).
  // Values larger or equal to 9223372036854775808 are unsigned integers; the others are
  // signed integers.
  if ((digit_count == 20) || ((digit_count >= 19) && (memcmp(src, smaller_big_integer, 19) >= 0))) {
    return number_type::unsigned_integer;
  }
  return number_type::signed_integer;
}
|
|
|
|
// Never read at src_end or beyond
|
|
simdjson_unused simdjson_inline simdjson_result<double> parse_double(const uint8_t * src, const uint8_t * const src_end) noexcept {
|
|
if(src == src_end) { return NUMBER_ERROR; }
|
|
//
|
|
// Check for minus sign
|
|
//
|
|
bool negative = (*src == '-');
|
|
src += uint8_t(negative);
|
|
|
|
//
|
|
// Parse the integer part.
|
|
//
|
|
uint64_t i = 0;
|
|
const uint8_t *p = src;
|
|
if(p == src_end) { return NUMBER_ERROR; }
|
|
p += parse_digit(*p, i);
|
|
bool leading_zero = (i == 0);
|
|
while ((p != src_end) && parse_digit(*p, i)) { p++; }
|
|
// no integer digits, or 0123 (zero must be solo)
|
|
if ( p == src ) { return INCORRECT_TYPE; }
|
|
if ( (leading_zero && p != src+1)) { return NUMBER_ERROR; }
|
|
|
|
//
|
|
// Parse the decimal part.
|
|
//
|
|
int64_t exponent = 0;
|
|
bool overflow;
|
|
if (simdjson_likely((p != src_end) && (*p == '.'))) {
|
|
p++;
|
|
const uint8_t *start_decimal_digits = p;
|
|
if ((p == src_end) || !parse_digit(*p, i)) { return NUMBER_ERROR; } // no decimal digits
|
|
p++;
|
|
while ((p != src_end) && parse_digit(*p, i)) { p++; }
|
|
exponent = -(p - start_decimal_digits);
|
|
|
|
// Overflow check. More than 19 digits (minus the decimal) may be overflow.
|
|
overflow = p-src-1 > 19;
|
|
if (simdjson_unlikely(overflow && leading_zero)) {
|
|
// Skip leading 0.00000 and see if it still overflows
|
|
const uint8_t *start_digits = src + 2;
|
|
while (*start_digits == '0') { start_digits++; }
|
|
overflow = start_digits-src > 19;
|
|
}
|
|
} else {
|
|
overflow = p-src > 19;
|
|
}
|
|
|
|
//
|
|
// Parse the exponent
|
|
//
|
|
if ((p != src_end) && (*p == 'e' || *p == 'E')) {
|
|
p++;
|
|
if(p == src_end) { return NUMBER_ERROR; }
|
|
bool exp_neg = *p == '-';
|
|
p += exp_neg || *p == '+';
|
|
|
|
uint64_t exp = 0;
|
|
const uint8_t *start_exp_digits = p;
|
|
while ((p != src_end) && parse_digit(*p, exp)) { p++; }
|
|
// no exp digits, or 20+ exp digits
|
|
if (p-start_exp_digits == 0 || p-start_exp_digits > 19) { return NUMBER_ERROR; }
|
|
|
|
exponent += exp_neg ? 0-exp : exp;
|
|
}
|
|
|
|
if ((p != src_end) && jsoncharutils::is_not_structural_or_whitespace(*p)) { return NUMBER_ERROR; }
|
|
|
|
overflow = overflow || exponent < simdjson::internal::smallest_power || exponent > simdjson::internal::largest_power;
|
|
|
|
//
|
|
// Assemble (or slow-parse) the float
|
|
//
|
|
double d;
|
|
if (simdjson_likely(!overflow)) {
|
|
if (compute_float_64(exponent, i, negative, d)) { return d; }
|
|
}
|
|
if (!parse_float_fallback(src - uint8_t(negative), src_end, &d)) {
|
|
return NUMBER_ERROR;
|
|
}
|
|
return d;
|
|
}
|
|
|
|
// Parse a JSON number written inside a string as a double: src[0] is skipped (the opening
// quote) and the number must be followed directly by a closing '"'.
// Errors:
//  - INCORRECT_TYPE: the text does not start with a digit (after the optional '-');
//  - NUMBER_ERROR: a leading zero followed by more integer digits, a '.' without a digit
//    after it, an exponent with no digits or more than 19 digits, a number not followed
//    by '"', or a failure of parse_float_fallback.
simdjson_unused simdjson_inline simdjson_result<double> parse_double_in_string(const uint8_t * src) noexcept {
  // Skip the first byte and the optional minus sign; src now designates the first digit.
  const bool negative = (src[1] == '-');
  src += 1 + uint8_t(negative);

  // Integer part. The first digit is consumed on its own so that a leading zero can be
  // recognized before the remaining digits are accumulated.
  uint64_t mantissa = 0;
  const uint8_t *cursor = src;
  cursor += parse_digit(*cursor, mantissa);
  const bool leading_zero = (mantissa == 0);
  while (parse_digit(*cursor, mantissa)) { ++cursor; }
  if (cursor == src) { return INCORRECT_TYPE; }                     // no integer digits
  if (leading_zero && (cursor != src + 1)) { return NUMBER_ERROR; }  // 0123 (zero must be solo)

  // Decimal part. Each fractional digit lowers the decimal exponent by one.
  int64_t exponent = 0;
  bool overflow;
  if (simdjson_likely(*cursor == '.')) {
    ++cursor;
    const uint8_t * const fraction_begin = cursor;
    if (!parse_digit(*cursor, mantissa)) { return NUMBER_ERROR; }    // no decimal digits
    ++cursor;
    while (parse_digit(*cursor, mantissa)) { ++cursor; }
    exponent = -(cursor - fraction_begin);

    // Overflow check. More than 19 digits (minus the decimal) may be overflow.
    overflow = (cursor - src - 1) > 19;
    if (simdjson_unlikely(overflow && leading_zero)) {
      // Skip leading 0.00000 and see if it still overflows
      const uint8_t *significant = src + 2;
      while (*significant == '0') { ++significant; }
      overflow = (cursor - significant) > 19;
    }
  } else {
    overflow = (cursor - src) > 19;
  }

  // Exponent part: 'e' or 'E', an optional sign, then 1 to 19 digits.
  if ((*cursor == 'e') || (*cursor == 'E')) {
    ++cursor;
    const bool exp_neg = (*cursor == '-');
    cursor += exp_neg || (*cursor == '+');

    uint64_t exp = 0;
    const uint8_t * const exp_begin = cursor;
    while (parse_digit(*cursor, exp)) { ++cursor; }
    const auto exp_digits = cursor - exp_begin;
    if ((exp_digits == 0) || (exp_digits > 19)) { return NUMBER_ERROR; }

    exponent += exp_neg ? 0-exp : exp;
  }

  // Inside a string, the only acceptable byte after the number is the closing quote.
  if (*cursor != '"') { return NUMBER_ERROR; }

  // Exponents outside [smallest_power, largest_power] are also sent to the fallback parser.
  overflow = overflow || exponent < simdjson::internal::smallest_power || exponent > simdjson::internal::largest_power;

  // Assemble the float from mantissa and exponent when possible; otherwise (or when
  // compute_float_64 declines) re-parse the original text with parse_float_fallback.
  double d;
  if (simdjson_likely(!overflow) && compute_float_64(exponent, mantissa, negative, d)) { return d; }
  if (!parse_float_fallback(src - uint8_t(negative), &d)) { return NUMBER_ERROR; }
  return d;
}
|
|
|
|
} // unnamed namespace
|
|
#endif // SIMDJSON_SKIPNUMBERPARSING
|
|
|
|
} // namespace numberparsing
|
|
|
|
inline std::ostream& operator<<(std::ostream& out, number_type type) noexcept {
|
|
switch (type) {
|
|
case number_type::signed_integer: out << "integer in [-9223372036854775808,9223372036854775808)"; break;
|
|
case number_type::unsigned_integer: out << "unsigned integer in [9223372036854775808,18446744073709551616)"; break;
|
|
case number_type::floating_point_number: out << "floating-point number (binary64)"; break;
|
|
case number_type::big_integer: out << "big integer"; break;
|
|
default: SIMDJSON_UNREACHABLE();
|
|
}
|
|
return out;
|
|
}
|
|
|
|
} // namespace lasx
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_GENERIC_NUMBERPARSING_H
|
|
/* end file simdjson/generic/numberparsing.h for lasx */
|
|
|
|
/* including simdjson/generic/implementation_simdjson_result_base-inl.h for lasx: #include "simdjson/generic/implementation_simdjson_result_base-inl.h" */
|
|
/* begin file simdjson/generic/implementation_simdjson_result_base-inl.h for lasx */
|
|
#ifndef SIMDJSON_GENERIC_IMPLEMENTATION_SIMDJSON_RESULT_BASE_INL_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_GENERIC_IMPLEMENTATION_SIMDJSON_RESULT_BASE_INL_H */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/generic/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/generic/implementation_simdjson_result_base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace lasx {
|
|
|
|
//
|
|
// internal::implementation_simdjson_result_base<T> inline implementation
|
|
//
|
|
|
|
template<typename T>
|
|
simdjson_inline void implementation_simdjson_result_base<T>::tie(T &value, error_code &error) && noexcept {
|
|
error = this->second;
|
|
if (!error) {
|
|
value = std::forward<implementation_simdjson_result_base<T>>(*this).first;
|
|
}
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_warn_unused simdjson_inline error_code implementation_simdjson_result_base<T>::get(T &value) && noexcept {
|
|
error_code error;
|
|
std::forward<implementation_simdjson_result_base<T>>(*this).tie(value, error);
|
|
return error;
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_inline error_code implementation_simdjson_result_base<T>::error() const noexcept {
|
|
return this->second;
|
|
}
|
|
|
|
#if SIMDJSON_EXCEPTIONS
|
|
|
|
template<typename T>
|
|
simdjson_inline T& implementation_simdjson_result_base<T>::value() & noexcept(false) {
|
|
if (error()) { throw simdjson_error(error()); }
|
|
return this->first;
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_inline T&& implementation_simdjson_result_base<T>::value() && noexcept(false) {
|
|
return std::forward<implementation_simdjson_result_base<T>>(*this).take_value();
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_inline T&& implementation_simdjson_result_base<T>::take_value() && noexcept(false) {
|
|
if (error()) { throw simdjson_error(error()); }
|
|
return std::forward<T>(this->first);
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_inline implementation_simdjson_result_base<T>::operator T&&() && noexcept(false) {
|
|
return std::forward<implementation_simdjson_result_base<T>>(*this).take_value();
|
|
}
|
|
|
|
#endif // SIMDJSON_EXCEPTIONS
|
|
|
|
template<typename T>
|
|
simdjson_inline const T& implementation_simdjson_result_base<T>::value_unsafe() const& noexcept {
|
|
return this->first;
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_inline T& implementation_simdjson_result_base<T>::value_unsafe() & noexcept {
|
|
return this->first;
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_inline T&& implementation_simdjson_result_base<T>::value_unsafe() && noexcept {
|
|
return std::forward<T>(this->first);
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_inline implementation_simdjson_result_base<T>::implementation_simdjson_result_base(T &&value, error_code error) noexcept
|
|
: first{std::forward<T>(value)}, second{error} {}
|
|
template<typename T>
|
|
simdjson_inline implementation_simdjson_result_base<T>::implementation_simdjson_result_base(error_code error) noexcept
|
|
: implementation_simdjson_result_base(T{}, error) {}
|
|
template<typename T>
|
|
simdjson_inline implementation_simdjson_result_base<T>::implementation_simdjson_result_base(T &&value) noexcept
|
|
: implementation_simdjson_result_base(std::forward<T>(value), SUCCESS) {}
|
|
|
|
} // namespace lasx
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_GENERIC_IMPLEMENTATION_SIMDJSON_RESULT_BASE_INL_H
|
|
/* end file simdjson/generic/implementation_simdjson_result_base-inl.h for lasx */
|
|
/* end file simdjson/generic/amalgamated.h for lasx */
|
|
/* including simdjson/lasx/end.h: #include "simdjson/lasx/end.h" */
|
|
/* begin file simdjson/lasx/end.h */
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/lasx/base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
#undef SIMDJSON_SKIP_BACKSLASH_SHORT_CIRCUIT
|
|
/* undefining SIMDJSON_IMPLEMENTATION from "lasx" */
|
|
#undef SIMDJSON_IMPLEMENTATION
|
|
/* end file simdjson/lasx/end.h */
|
|
|
|
#endif // SIMDJSON_LASX_H
|
|
/* end file simdjson/lasx.h */
|
|
/* including simdjson/lasx/implementation.h: #include <simdjson/lasx/implementation.h> */
|
|
/* begin file simdjson/lasx/implementation.h */
|
|
#ifndef SIMDJSON_LASX_IMPLEMENTATION_H
|
|
#define SIMDJSON_LASX_IMPLEMENTATION_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/implementation.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/instruction_set.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace lasx {
|
|
|
|
/**
|
|
* @private
|
|
*/
|
|
class implementation final : public simdjson::implementation {
|
|
public:
|
|
simdjson_inline implementation() : simdjson::implementation("lasx", "LoongArch ASX", internal::instruction_set::LASX) {}
|
|
simdjson_warn_unused error_code create_dom_parser_implementation(
|
|
size_t capacity,
|
|
size_t max_length,
|
|
std::unique_ptr<internal::dom_parser_implementation>& dst
|
|
) const noexcept final;
|
|
simdjson_warn_unused error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final;
|
|
simdjson_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final;
|
|
};
|
|
|
|
} // namespace lasx
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_LASX_IMPLEMENTATION_H
|
|
/* end file simdjson/lasx/implementation.h */
|
|
|
|
/* including simdjson/lasx/begin.h: #include <simdjson/lasx/begin.h> */
|
|
/* begin file simdjson/lasx/begin.h */
|
|
/* defining SIMDJSON_IMPLEMENTATION to "lasx" */
|
|
#define SIMDJSON_IMPLEMENTATION lasx
|
|
/* including simdjson/lasx/base.h: #include "simdjson/lasx/base.h" */
|
|
/* begin file simdjson/lasx/base.h */
|
|
#ifndef SIMDJSON_LASX_BASE_H
|
|
#define SIMDJSON_LASX_BASE_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
/**
|
|
* Implementation for LASX.
|
|
*/
|
|
namespace lasx {
|
|
|
|
class implementation;
|
|
|
|
namespace {
|
|
namespace simd {
|
|
template <typename T> struct simd8;
|
|
template <typename T> struct simd8x64;
|
|
} // namespace simd
|
|
} // unnamed namespace
|
|
|
|
} // namespace lasx
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_LASX_BASE_H
|
|
/* end file simdjson/lasx/base.h */
|
|
/* including simdjson/lasx/intrinsics.h: #include "simdjson/lasx/intrinsics.h" */
|
|
/* begin file simdjson/lasx/intrinsics.h */
|
|
#ifndef SIMDJSON_LASX_INTRINSICS_H
|
|
#define SIMDJSON_LASX_INTRINSICS_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/lasx/base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
// This should be the correct header whether
|
|
// you use visual studio or other compilers.
|
|
#include <lasxintrin.h>
|
|
|
|
static_assert(sizeof(__m256i) <= simdjson::SIMDJSON_PADDING, "insufficient padding for LoongArch ASX");
|
|
|
|
#endif // SIMDJSON_LASX_INTRINSICS_H
|
|
/* end file simdjson/lasx/intrinsics.h */
|
|
/* including simdjson/lasx/bitmanipulation.h: #include "simdjson/lasx/bitmanipulation.h" */
|
|
/* begin file simdjson/lasx/bitmanipulation.h */
|
|
#ifndef SIMDJSON_LASX_BITMANIPULATION_H
|
|
#define SIMDJSON_LASX_BITMANIPULATION_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/lasx/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/lasx/intrinsics.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/lasx/bitmask.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace lasx {
|
|
namespace {
|
|
|
|
// We sometimes call trailing_zero on inputs that are zero,
|
|
// but the algorithms do not end up using the returned value.
|
|
// Sadly, sanitizers are not smart enough to figure it out.
|
|
SIMDJSON_NO_SANITIZE_UNDEFINED
|
|
// This function can be used safely even if not all bytes have been
|
|
// initialized.
|
|
// See issue https://github.com/simdjson/simdjson/issues/1965
|
|
SIMDJSON_NO_SANITIZE_MEMORY
|
|
simdjson_inline int trailing_zeroes(uint64_t input_num) {
|
|
return __builtin_ctzll(input_num);
|
|
}
|
|
|
|
/* result might be undefined when input_num is zero */
|
|
simdjson_inline uint64_t clear_lowest_bit(uint64_t input_num) {
|
|
return input_num & (input_num-1);
|
|
}
|
|
|
|
/* result might be undefined when input_num is zero */
|
|
simdjson_inline int leading_zeroes(uint64_t input_num) {
|
|
return __builtin_clzll(input_num);
|
|
}
|
|
|
|
/* result might be undefined when input_num is zero */
|
|
simdjson_inline int count_ones(uint64_t input_num) {
|
|
return __lasx_xvpickve2gr_w(__lasx_xvpcnt_d(__m256i(v4u64{input_num, 0, 0, 0})), 0);
|
|
}
|
|
|
|
// Stores value1 + value2 in *result and returns whatever
// __builtin_uaddll_overflow reports (true when the unsigned sum overflowed).
simdjson_inline bool add_overflow(uint64_t value1, uint64_t value2, uint64_t *result) {
  // The builtin is declared in terms of unsigned long long.
  auto *sum = reinterpret_cast<unsigned long long *>(result);
  return __builtin_uaddll_overflow(value1, value2, sum);
}
|
|
|
|
} // unnamed namespace
|
|
} // namespace lasx
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_LASX_BITMANIPULATION_H
|
|
/* end file simdjson/lasx/bitmanipulation.h */
|
|
/* including simdjson/lasx/bitmask.h: #include "simdjson/lasx/bitmask.h" */
|
|
/* begin file simdjson/lasx/bitmask.h */
|
|
#ifndef SIMDJSON_LASX_BITMASK_H
|
|
#define SIMDJSON_LASX_BITMASK_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/lasx/base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace lasx {
|
|
namespace {
|
|
|
|
//
|
|
// Perform a "cumulative bitwise xor," flipping bits each time a 1 is encountered.
|
|
//
|
|
// For example, prefix_xor(00100100) == 00011100
|
|
//
|
|
simdjson_inline uint64_t prefix_xor(uint64_t bitmask) {
|
|
bitmask ^= bitmask << 1;
|
|
bitmask ^= bitmask << 2;
|
|
bitmask ^= bitmask << 4;
|
|
bitmask ^= bitmask << 8;
|
|
bitmask ^= bitmask << 16;
|
|
bitmask ^= bitmask << 32;
|
|
return bitmask;
|
|
}
|
|
|
|
} // unnamed namespace
|
|
} // namespace lasx
|
|
} // namespace simdjson
|
|
|
|
#endif
|
|
/* end file simdjson/lasx/bitmask.h */
|
|
/* including simdjson/lasx/numberparsing_defs.h: #include "simdjson/lasx/numberparsing_defs.h" */
|
|
/* begin file simdjson/lasx/numberparsing_defs.h */
|
|
#ifndef SIMDJSON_LASX_NUMBERPARSING_DEFS_H
|
|
#define SIMDJSON_LASX_NUMBERPARSING_DEFS_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/lasx/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/lasx/intrinsics.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/numberparsing_tables.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
#include <cstring>
|
|
|
|
namespace simdjson {
|
|
namespace lasx {
|
|
namespace numberparsing {
|
|
|
|
// we don't have appropriate instructions, so let us use a scalar function
|
|
// credit: https://johnnylee-sde.github.io/Fast-numeric-string-to-int/
|
|
/** @private */
|
|
static simdjson_inline uint32_t parse_eight_digits_unrolled(const uint8_t *chars) {
|
|
uint64_t val;
|
|
std::memcpy(&val, chars, sizeof(uint64_t));
|
|
val = (val & 0x0F0F0F0F0F0F0F0F) * 2561 >> 8;
|
|
val = (val & 0x00FF00FF00FF00FF) * 6553601 >> 16;
|
|
return uint32_t((val & 0x0000FFFF0000FFFF) * 42949672960001 >> 32);
|
|
}
|
|
|
|
// Full 64x64 -> 128-bit product, returned as separate low and high halves.
simdjson_inline internal::value128 full_multiplication(uint64_t value1, uint64_t value2) {
  const __uint128_t product = static_cast<__uint128_t>(value1) * value2;
  internal::value128 answer;
  answer.low = uint64_t(product);
  answer.high = uint64_t(product >> 64);
  return answer;
}
|
|
|
|
} // namespace numberparsing
|
|
} // namespace lasx
|
|
} // namespace simdjson
|
|
|
|
#define SIMDJSON_SWAR_NUMBER_PARSING 1
|
|
|
|
#endif // SIMDJSON_LASX_NUMBERPARSING_DEFS_H
|
|
/* end file simdjson/lasx/numberparsing_defs.h */
|
|
/* including simdjson/lasx/simd.h: #include "simdjson/lasx/simd.h" */
|
|
/* begin file simdjson/lasx/simd.h */
|
|
#ifndef SIMDJSON_LASX_SIMD_H
|
|
#define SIMDJSON_LASX_SIMD_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/lasx/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/lasx/bitmanipulation.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/simdprune_tables.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace lasx {
|
|
namespace {
|
|
namespace simd {
|
|
|
|
// Forward-declared so they can be used by splat and friends.
|
|
template<typename Child>
|
|
struct base {
|
|
__m256i value;
|
|
|
|
// Zero constructor
|
|
simdjson_inline base() : value{__m256i()} {}
|
|
|
|
// Conversion from SIMD register
|
|
simdjson_inline base(const __m256i _value) : value(_value) {}
|
|
|
|
// Conversion to SIMD register
|
|
simdjson_inline operator const __m256i&() const { return this->value; }
|
|
simdjson_inline operator __m256i&() { return this->value; }
|
|
simdjson_inline operator const v32i8&() const { return (v32i8&)this->value; }
|
|
simdjson_inline operator v32i8&() { return (v32i8&)this->value; }
|
|
|
|
// Bit operations
|
|
simdjson_inline Child operator|(const Child other) const { return __lasx_xvor_v(*this, other); }
|
|
simdjson_inline Child operator&(const Child other) const { return __lasx_xvand_v(*this, other); }
|
|
simdjson_inline Child operator^(const Child other) const { return __lasx_xvxor_v(*this, other); }
|
|
simdjson_inline Child bit_andnot(const Child other) const { return __lasx_xvandn_v(other, *this); }
|
|
simdjson_inline Child& operator|=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast | other; return *this_cast; }
|
|
simdjson_inline Child& operator&=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast & other; return *this_cast; }
|
|
simdjson_inline Child& operator^=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast ^ other; return *this_cast; }
|
|
};
|
|
|
|
// Forward-declared so they can be used by splat and friends.
|
|
template<typename T>
|
|
struct simd8;
|
|
|
|
template<typename T, typename Mask=simd8<bool>>
|
|
struct base8: base<simd8<T>> {
|
|
simdjson_inline base8() : base<simd8<T>>() {}
|
|
simdjson_inline base8(const __m256i _value) : base<simd8<T>>(_value) {}
|
|
|
|
friend simdjson_really_inline Mask operator==(const simd8<T> lhs, const simd8<T> rhs) { return __lasx_xvseq_b(lhs, rhs); }
|
|
|
|
static const int SIZE = sizeof(base<simd8<T>>::value);
|
|
|
|
template<int N=1>
|
|
simdjson_inline simd8<T> prev(const simd8<T> prev_chunk) const {
|
|
__m256i hi = __lasx_xvbsll_v(*this, N);
|
|
__m256i lo = __lasx_xvbsrl_v(*this, 16 - N);
|
|
__m256i tmp = __lasx_xvbsrl_v(prev_chunk, 16 - N);
|
|
lo = __lasx_xvpermi_q(lo, tmp, 0x21);
|
|
return __lasx_xvor_v(hi, lo);
|
|
}
|
|
};
|
|
|
|
// SIMD byte mask type (returned by things like eq and gt)
|
|
template<>
|
|
struct simd8<bool>: base8<bool> {
|
|
static simdjson_inline simd8<bool> splat(bool _value) { return __lasx_xvreplgr2vr_b(uint8_t(-(!!_value))); }
|
|
|
|
simdjson_inline simd8() : base8() {}
|
|
simdjson_inline simd8(const __m256i _value) : base8<bool>(_value) {}
|
|
// Splat constructor
|
|
simdjson_inline simd8(bool _value) : base8<bool>(splat(_value)) {}
|
|
|
|
simdjson_inline int to_bitmask() const {
|
|
__m256i mask = __lasx_xvmskltz_b(*this);
|
|
return (__lasx_xvpickve2gr_w(mask, 4) << 16) | (__lasx_xvpickve2gr_w(mask, 0));
|
|
}
|
|
simdjson_inline bool any() const {
|
|
__m256i v = __lasx_xvmsknz_b(*this);
|
|
return (0 == __lasx_xvpickve2gr_w(v, 0)) && (0 == __lasx_xvpickve2gr_w(v, 4));
|
|
}
|
|
simdjson_inline simd8<bool> operator~() const { return *this ^ true; }
|
|
};
|
|
|
|
template<typename T>
|
|
struct base8_numeric: base8<T> {
|
|
static simdjson_inline simd8<T> splat(T _value) {
|
|
return __lasx_xvreplgr2vr_b(_value);
|
|
}
|
|
static simdjson_inline simd8<T> zero() { return __lasx_xvldi(0); }
|
|
static simdjson_inline simd8<T> load(const T values[32]) {
|
|
return __lasx_xvld(reinterpret_cast<const __m256i *>(values), 0);
|
|
}
|
|
// Repeat 16 values as many times as necessary (usually for lookup tables)
|
|
static simdjson_inline simd8<T> repeat_16(
|
|
T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7,
|
|
T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15
|
|
) {
|
|
return simd8<T>(
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15,
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15
|
|
);
|
|
}
|
|
|
|
simdjson_inline base8_numeric() : base8<T>() {}
|
|
simdjson_inline base8_numeric(const __m256i _value) : base8<T>(_value) {}
|
|
|
|
// Store to array
|
|
simdjson_inline void store(T dst[32]) const {
|
|
return __lasx_xvst(*this, reinterpret_cast<__m256i *>(dst), 0);
|
|
}
|
|
|
|
// Addition/subtraction are the same for signed and unsigned
|
|
simdjson_inline simd8<T> operator+(const simd8<T> other) const { return __lasx_xvadd_b(*this, other); }
|
|
simdjson_inline simd8<T> operator-(const simd8<T> other) const { return __lasx_xvsub_b(*this, other); }
|
|
simdjson_inline simd8<T>& operator+=(const simd8<T> other) { *this = *this + other; return *static_cast<simd8<T>*>(this); }
|
|
simdjson_inline simd8<T>& operator-=(const simd8<T> other) { *this = *this - other; return *static_cast<simd8<T>*>(this); }
|
|
|
|
// Override to distinguish from bool version
|
|
simdjson_inline simd8<T> operator~() const { return *this ^ 0xFFu; }
|
|
|
|
// Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values)
|
|
template<typename L>
|
|
simdjson_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
|
|
return __lasx_xvshuf_b(lookup_table, lookup_table, *this);
|
|
}
|
|
|
|
// Copies to 'output" all bytes corresponding to a 0 in the mask (interpreted as a bitset).
|
|
// Passing a 0 value for mask would be equivalent to writing out every byte to output.
|
|
// Only the first 16 - count_ones(mask) bytes of the result are significant but 16 bytes
|
|
// get written.
|
|
template<typename L>
|
|
simdjson_inline void compress(uint32_t mask, L * output) const {
|
|
using internal::thintable_epi8;
|
|
using internal::BitsSetTable256mul2;
|
|
using internal::pshufb_combine_table;
|
|
// this particular implementation was inspired by haswell
|
|
// lasx do it in 4 steps, first 8 bytes and then second 8 bytes...
|
|
uint8_t mask1 = uint8_t(mask); // least significant 8 bits
|
|
uint8_t mask2 = uint8_t(mask >> 8); // second significant 8 bits
|
|
uint8_t mask3 = uint8_t(mask >> 16); // ...
|
|
uint8_t mask4 = uint8_t(mask >> 24); // ...
|
|
// next line just loads the 64-bit values thintable_epi8[mask{1,2,3,4}]
|
|
// into a 256-bit register.
|
|
__m256i shufmask = {int64_t(thintable_epi8[mask1]), int64_t(thintable_epi8[mask2]) + 0x0808080808080808, int64_t(thintable_epi8[mask3]), int64_t(thintable_epi8[mask4]) + 0x0808080808080808};
|
|
// this is the version "nearly pruned"
|
|
__m256i pruned = __lasx_xvshuf_b(*this, *this, shufmask);
|
|
// we still need to put the pieces back together.
|
|
// we compute the popcount of the first words:
|
|
int pop1 = BitsSetTable256mul2[mask1];
|
|
int pop2 = BitsSetTable256mul2[mask2];
|
|
int pop3 = BitsSetTable256mul2[mask3];
|
|
|
|
// then load the corresponding mask
|
|
__m256i masklo = __lasx_xvldx(reinterpret_cast<void*>(reinterpret_cast<unsigned long>(pshufb_combine_table)), pop1 * 8);
|
|
__m256i maskhi = __lasx_xvldx(reinterpret_cast<void*>(reinterpret_cast<unsigned long>(pshufb_combine_table)), pop3 * 8);
|
|
__m256i compactmask = __lasx_xvpermi_q(maskhi, masklo, 0x20);
|
|
__m256i answer = __lasx_xvshuf_b(pruned, pruned, compactmask);
|
|
__lasx_xvst(answer, reinterpret_cast<uint8_t*>(output), 0);
|
|
uint64_t value3 = __lasx_xvpickve2gr_du(answer, 2);
|
|
uint64_t value4 = __lasx_xvpickve2gr_du(answer, 3);
|
|
uint64_t *pos = reinterpret_cast<uint64_t*>(reinterpret_cast<uint8_t*>(output) + 16 - (pop1 + pop2) / 2);
|
|
pos[0] = value3;
|
|
pos[1] = value4;
|
|
}
|
|
|
|
template<typename L>
|
|
simdjson_inline simd8<L> lookup_16(
|
|
L replace0, L replace1, L replace2, L replace3,
|
|
L replace4, L replace5, L replace6, L replace7,
|
|
L replace8, L replace9, L replace10, L replace11,
|
|
L replace12, L replace13, L replace14, L replace15) const {
|
|
return lookup_16(simd8<L>::repeat_16(
|
|
replace0, replace1, replace2, replace3,
|
|
replace4, replace5, replace6, replace7,
|
|
replace8, replace9, replace10, replace11,
|
|
replace12, replace13, replace14, replace15
|
|
));
|
|
}
|
|
};
|
|
|
|
// Signed bytes
|
|
template<>
|
|
struct simd8<int8_t> : base8_numeric<int8_t> {
|
|
simdjson_inline simd8() : base8_numeric<int8_t>() {}
|
|
simdjson_inline simd8(const __m256i _value) : base8_numeric<int8_t>(_value) {}
|
|
// Splat constructor
|
|
simdjson_inline simd8(int8_t _value) : simd8(splat(_value)) {}
|
|
// Array constructor
|
|
simdjson_inline simd8(const int8_t values[32]) : simd8(load(values)) {}
|
|
// Member-by-member initialization
|
|
simdjson_inline simd8(
|
|
int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
|
|
int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15,
|
|
int8_t v16, int8_t v17, int8_t v18, int8_t v19, int8_t v20, int8_t v21, int8_t v22, int8_t v23,
|
|
int8_t v24, int8_t v25, int8_t v26, int8_t v27, int8_t v28, int8_t v29, int8_t v30, int8_t v31
|
|
) : simd8({
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15,
|
|
v16,v17,v18,v19,v20,v21,v22,v23,
|
|
v24,v25,v26,v27,v28,v29,v30,v31
|
|
}) {}
|
|
// Repeat 16 values as many times as necessary (usually for lookup tables)
|
|
simdjson_inline static simd8<int8_t> repeat_16(
|
|
int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
|
|
int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
|
|
) {
|
|
return simd8<int8_t>(
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15,
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15
|
|
);
|
|
}
|
|
|
|
// Order-sensitive comparisons
|
|
simdjson_inline simd8<int8_t> max_val(const simd8<int8_t> other) const { return __lasx_xvmax_b(*this, other); }
|
|
simdjson_inline simd8<int8_t> min_val(const simd8<int8_t> other) const { return __lasx_xvmin_b(*this, other); }
|
|
simdjson_inline simd8<bool> operator>(const simd8<int8_t> other) const { return __lasx_xvslt_b(other, *this); }
|
|
simdjson_inline simd8<bool> operator<(const simd8<int8_t> other) const { return __lasx_xvslt_b(*this, other); }
|
|
};
|
|
|
|
// Unsigned bytes
|
|
template<>
|
|
struct simd8<uint8_t>: base8_numeric<uint8_t> {
|
|
simdjson_inline simd8() : base8_numeric<uint8_t>() {}
|
|
simdjson_inline simd8(const __m256i _value) : base8_numeric<uint8_t>(_value) {}
|
|
// Splat constructor
|
|
simdjson_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
|
|
// Array constructor
|
|
simdjson_inline simd8(const uint8_t values[32]) : simd8(load(values)) {}
|
|
// Member-by-member initialization
|
|
simdjson_inline simd8(
|
|
uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
|
|
uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15,
|
|
uint8_t v16, uint8_t v17, uint8_t v18, uint8_t v19, uint8_t v20, uint8_t v21, uint8_t v22, uint8_t v23,
|
|
uint8_t v24, uint8_t v25, uint8_t v26, uint8_t v27, uint8_t v28, uint8_t v29, uint8_t v30, uint8_t v31
|
|
) : simd8(__m256i(v32u8{
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15,
|
|
v16,v17,v18,v19,v20,v21,v22,v23,
|
|
v24,v25,v26,v27,v28,v29,v30,v31
|
|
})) {}
|
|
// Repeat 16 values as many times as necessary (usually for lookup tables)
|
|
simdjson_inline static simd8<uint8_t> repeat_16(
|
|
uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
|
|
uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
|
|
) {
|
|
return simd8<uint8_t>(
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15,
|
|
v0, v1, v2, v3, v4, v5, v6, v7,
|
|
v8, v9, v10,v11,v12,v13,v14,v15
|
|
);
|
|
}
|
|
|
|
// Saturated math
|
|
simdjson_inline simd8<uint8_t> saturating_add(const simd8<uint8_t> other) const { return __lasx_xvsadd_bu(*this, other); }
|
|
simdjson_inline simd8<uint8_t> saturating_sub(const simd8<uint8_t> other) const { return __lasx_xvssub_bu(*this, other); }
|
|
|
|
// Order-specific operations
|
|
simdjson_inline simd8<uint8_t> max_val(const simd8<uint8_t> other) const { return __lasx_xvmax_bu(*this, other); }
|
|
simdjson_inline simd8<uint8_t> min_val(const simd8<uint8_t> other) const { return __lasx_xvmin_bu(other, *this); }
|
|
// Same as >, but only guarantees true is nonzero (< guarantees true = -1)
|
|
simdjson_inline simd8<uint8_t> gt_bits(const simd8<uint8_t> other) const { return this->saturating_sub(other); }
|
|
// Same as <, but only guarantees true is nonzero (< guarantees true = -1)
|
|
simdjson_inline simd8<uint8_t> lt_bits(const simd8<uint8_t> other) const { return other.saturating_sub(*this); }
|
|
simdjson_inline simd8<bool> operator<=(const simd8<uint8_t> other) const { return other.max_val(*this) == other; }
|
|
simdjson_inline simd8<bool> operator>=(const simd8<uint8_t> other) const { return other.min_val(*this) == other; }
|
|
simdjson_inline simd8<bool> operator>(const simd8<uint8_t> other) const { return this->gt_bits(other).any_bits_set(); }
|
|
simdjson_inline simd8<bool> operator<(const simd8<uint8_t> other) const { return this->lt_bits(other).any_bits_set(); }
|
|
|
|
// Bit-specific operations
|
|
simdjson_inline simd8<bool> bits_not_set() const { return *this == uint8_t(0); }
|
|
simdjson_inline simd8<bool> bits_not_set(simd8<uint8_t> bits) const { return (*this & bits).bits_not_set(); }
|
|
simdjson_inline simd8<bool> any_bits_set() const { return ~this->bits_not_set(); }
|
|
simdjson_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const { return ~this->bits_not_set(bits); }
|
|
simdjson_inline bool is_ascii() const {
|
|
__m256i mask = __lasx_xvmskltz_b(*this);
|
|
return (0 == __lasx_xvpickve2gr_w(mask, 0)) && (0 == __lasx_xvpickve2gr_w(mask, 4));
|
|
}
|
|
simdjson_inline bool bits_not_set_anywhere() const {
|
|
__m256i v = __lasx_xvmsknz_b(*this);
|
|
return (0 == __lasx_xvpickve2gr_w(v, 0)) && (0 == __lasx_xvpickve2gr_w(v, 4));
|
|
}
|
|
simdjson_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); }
|
|
simdjson_inline bool bits_not_set_anywhere(simd8<uint8_t> bits) const {
|
|
__m256i v = __lasx_xvmsknz_b(__lasx_xvand_v(*this, bits));
|
|
return (0 == __lasx_xvpickve2gr_w(v, 0)) && (0 == __lasx_xvpickve2gr_w(v, 4));
|
|
}
|
|
simdjson_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const { return !bits_not_set_anywhere(bits); }
|
|
template<int N>
|
|
simdjson_inline simd8<uint8_t> shr() const { return simd8<uint8_t>(__lasx_xvsrli_b(*this, N)); }
|
|
template<int N>
|
|
simdjson_inline simd8<uint8_t> shl() const { return simd8<uint8_t>(__lasx_xvslli_b(*this, N)); }
|
|
};
|
|
|
|
template<typename T>
|
|
struct simd8x64 {
|
|
static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
|
|
static_assert(NUM_CHUNKS == 2, "LASX kernel should use two registers per 64-byte block.");
|
|
const simd8<T> chunks[NUM_CHUNKS];
|
|
|
|
simd8x64(const simd8x64<T>& o) = delete; // no copy allowed
|
|
simd8x64<T>& operator=(const simd8<T>& other) = delete; // no assignment allowed
|
|
simd8x64() = delete; // no default constructor allowed
|
|
|
|
simdjson_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1) : chunks{chunk0, chunk1} {}
|
|
simdjson_inline simd8x64(const T ptr[64]) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+32)} {}
|
|
|
|
simdjson_inline uint64_t compress(uint64_t mask, T * output) const {
|
|
uint32_t mask1 = uint32_t(mask);
|
|
uint32_t mask2 = uint32_t(mask >> 32);
|
|
__m256i zcnt = __lasx_xvpcnt_w(__m256i(v4u64{~mask, 0, 0, 0}));
|
|
uint64_t zcnt1 = __lasx_xvpickve2gr_wu(zcnt, 0);
|
|
uint64_t zcnt2 = __lasx_xvpickve2gr_wu(zcnt, 1);
|
|
// There should be a critical value which processes in scaler is faster.
|
|
if (zcnt1)
|
|
this->chunks[0].compress(mask1, output);
|
|
if (zcnt2)
|
|
this->chunks[1].compress(mask2, output + zcnt1);
|
|
return zcnt1 + zcnt2;
|
|
}
|
|
|
|
simdjson_inline void store(T ptr[64]) const {
|
|
this->chunks[0].store(ptr+sizeof(simd8<T>)*0);
|
|
this->chunks[1].store(ptr+sizeof(simd8<T>)*1);
|
|
}
|
|
|
|
simdjson_inline uint64_t to_bitmask() const {
|
|
__m256i mask0 = __lasx_xvmskltz_b(this->chunks[0]);
|
|
__m256i mask1 = __lasx_xvmskltz_b(this->chunks[1]);
|
|
__m256i mask_tmp = __lasx_xvpickve_w(mask0, 4);
|
|
__m256i tmp = __lasx_xvpickve_w(mask1, 4);
|
|
mask0 = __lasx_xvinsve0_w(mask0, mask1, 1);
|
|
mask_tmp = __lasx_xvinsve0_w(mask_tmp, tmp, 1);
|
|
return __lasx_xvpickve2gr_du(__lasx_xvpackev_h(mask_tmp, mask0), 0);
|
|
}
|
|
|
|
simdjson_inline simd8<T> reduce_or() const {
|
|
return this->chunks[0] | this->chunks[1];
|
|
}
|
|
|
|
simdjson_inline uint64_t eq(const T m) const {
|
|
const simd8<T> mask = simd8<T>::splat(m);
|
|
return simd8x64<bool>(
|
|
this->chunks[0] == mask,
|
|
this->chunks[1] == mask
|
|
).to_bitmask();
|
|
}
|
|
|
|
simdjson_inline uint64_t eq(const simd8x64<uint8_t> &other) const {
|
|
return simd8x64<bool>(
|
|
this->chunks[0] == other.chunks[0],
|
|
this->chunks[1] == other.chunks[1]
|
|
).to_bitmask();
|
|
}
|
|
|
|
simdjson_inline uint64_t lteq(const T m) const {
|
|
const simd8<T> mask = simd8<T>::splat(m);
|
|
return simd8x64<bool>(
|
|
this->chunks[0] <= mask,
|
|
this->chunks[1] <= mask
|
|
).to_bitmask();
|
|
}
|
|
}; // struct simd8x64<T>
|
|
|
|
} // namespace simd
|
|
} // unnamed namespace
|
|
} // namespace lasx
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_LASX_SIMD_H
|
|
/* end file simdjson/lasx/simd.h */
|
|
/* including simdjson/lasx/stringparsing_defs.h: #include "simdjson/lasx/stringparsing_defs.h" */
|
|
/* begin file simdjson/lasx/stringparsing_defs.h */
|
|
#ifndef SIMDJSON_LASX_STRINGPARSING_DEFS_H
|
|
#define SIMDJSON_LASX_STRINGPARSING_DEFS_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/lasx/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/lasx/simd.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/lasx/bitmanipulation.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace lasx {
|
|
namespace {
|
|
|
|
using namespace simd;
|
|
|
|
// Holds backslashes and quotes locations.
|
|
struct backslash_and_quote {
|
|
public:
|
|
static constexpr uint32_t BYTES_PROCESSED = 32;
|
|
simdjson_inline static backslash_and_quote copy_and_find(const uint8_t *src, uint8_t *dst);
|
|
|
|
simdjson_inline bool has_quote_first() { return ((bs_bits - 1) & quote_bits) != 0; }
|
|
simdjson_inline bool has_backslash() { return bs_bits != 0; }
|
|
simdjson_inline int quote_index() { return trailing_zeroes(quote_bits); }
|
|
simdjson_inline int backslash_index() { return trailing_zeroes(bs_bits); }
|
|
|
|
uint32_t bs_bits;
|
|
uint32_t quote_bits;
|
|
}; // struct backslash_and_quote
|
|
|
|
// Copies one 32-byte block from src to dst and records, as bitmasks, which of
// those bytes are backslashes and which are double quotes.
simdjson_inline backslash_and_quote backslash_and_quote::copy_and_find(const uint8_t *src, uint8_t *dst) {
  // this can read up to 31 bytes beyond the buffer size, but we require
  // SIMDJSON_PADDING of padding
  static_assert(SIMDJSON_PADDING >= (BYTES_PROCESSED - 1), "backslash and quote finder must process fewer than SIMDJSON_PADDING bytes");
  simd8<uint8_t> block(src);
  block.store(dst);
  const uint32_t backslashes = static_cast<uint32_t>((block == '\\').to_bitmask());
  const uint32_t quotes = static_cast<uint32_t>((block == '"').to_bitmask());
  return {
    backslashes, // bs_bits
    quotes,      // quote_bits
  };
}
|
|
|
|
} // unnamed namespace
|
|
} // namespace lasx
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_LASX_STRINGPARSING_DEFS_H
|
|
/* end file simdjson/lasx/stringparsing_defs.h */
|
|
|
|
#define SIMDJSON_SKIP_BACKSLASH_SHORT_CIRCUIT 1
|
|
/* end file simdjson/lasx/begin.h */
|
|
/* including generic/amalgamated.h for lasx: #include <generic/amalgamated.h> */
|
|
/* begin file generic/amalgamated.h for lasx */
|
|
#if defined(SIMDJSON_CONDITIONAL_INCLUDE) && !defined(SIMDJSON_SRC_GENERIC_DEPENDENCIES_H)
|
|
#error generic/dependencies.h must be included before generic/amalgamated.h!
|
|
#endif
|
|
|
|
/* including generic/base.h for lasx: #include <generic/base.h> */
|
|
/* begin file generic/base.h for lasx */
|
|
#ifndef SIMDJSON_SRC_GENERIC_BASE_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_BASE_H */
|
|
/* amalgamation skipped (editor-only): #include <base.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/generic/base.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace lasx {
|
|
namespace {
|
|
|
|
struct json_character_block;
|
|
|
|
} // unnamed namespace
|
|
} // namespace lasx
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_BASE_H
|
|
/* end file generic/base.h for lasx */
|
|
/* including generic/dom_parser_implementation.h for lasx: #include <generic/dom_parser_implementation.h> */
|
|
/* begin file generic/dom_parser_implementation.h for lasx */
|
|
#ifndef SIMDJSON_SRC_GENERIC_DOM_PARSER_IMPLEMENTATION_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_DOM_PARSER_IMPLEMENTATION_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/generic/dom_parser_implementation.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
// Interface a dom parser implementation must fulfill
|
|
namespace simdjson {
|
|
namespace lasx {
|
|
namespace {
|
|
|
|
simdjson_inline simd8<uint8_t> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3);
|
|
simdjson_inline bool is_ascii(const simd8x64<uint8_t>& input);
|
|
|
|
} // unnamed namespace
|
|
} // namespace lasx
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_DOM_PARSER_IMPLEMENTATION_H
|
|
/* end file generic/dom_parser_implementation.h for lasx */
|
|
/* including generic/json_character_block.h for lasx: #include <generic/json_character_block.h> */
|
|
/* begin file generic/json_character_block.h for lasx */
|
|
#ifndef SIMDJSON_SRC_GENERIC_JSON_CHARACTER_BLOCK_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_JSON_CHARACTER_BLOCK_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/base.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace lasx {
|
|
namespace {
|
|
|
|
struct json_character_block {
|
|
static simdjson_inline json_character_block classify(const simd::simd8x64<uint8_t>& in);
|
|
|
|
simdjson_inline uint64_t whitespace() const noexcept { return _whitespace; }
|
|
simdjson_inline uint64_t op() const noexcept { return _op; }
|
|
simdjson_inline uint64_t scalar() const noexcept { return ~(op() | whitespace()); }
|
|
|
|
uint64_t _whitespace;
|
|
uint64_t _op;
|
|
};
|
|
|
|
} // unnamed namespace
|
|
} // namespace lasx
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_JSON_CHARACTER_BLOCK_H
|
|
/* end file generic/json_character_block.h for lasx */
|
|
/* end file generic/amalgamated.h for lasx */
|
|
/* including generic/stage1/amalgamated.h for lasx: #include <generic/stage1/amalgamated.h> */
|
|
/* begin file generic/stage1/amalgamated.h for lasx */
|
|
// Stuff other things depend on
|
|
/* including generic/stage1/base.h for lasx: #include <generic/stage1/base.h> */
|
|
/* begin file generic/stage1/base.h for lasx */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_BASE_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_BASE_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/base.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace lasx {
|
|
namespace {
|
|
namespace stage1 {
|
|
|
|
class bit_indexer;
|
|
template<size_t STEP_SIZE>
|
|
struct buf_block_reader;
|
|
struct json_block;
|
|
class json_minifier;
|
|
class json_scanner;
|
|
struct json_string_block;
|
|
class json_string_scanner;
|
|
class json_structural_indexer;
|
|
|
|
} // namespace stage1
|
|
|
|
namespace utf8_validation {
|
|
struct utf8_checker;
|
|
} // namespace utf8_validation
|
|
|
|
using utf8_validation::utf8_checker;
|
|
|
|
} // unnamed namespace
|
|
} // namespace lasx
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE1_BASE_H
|
|
/* end file generic/stage1/base.h for lasx */
|
|
/* including generic/stage1/buf_block_reader.h for lasx: #include <generic/stage1/buf_block_reader.h> */
|
|
/* begin file generic/stage1/buf_block_reader.h for lasx */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_BUF_BLOCK_READER_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_BUF_BLOCK_READER_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/base.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
#include <cstring>
|
|
|
|
namespace simdjson {
|
|
namespace lasx {
|
|
namespace {
|
|
namespace stage1 {
|
|
|
|
// Walks through a buffer in block-sized increments, loading the last part with spaces
|
|
template<size_t STEP_SIZE>
|
|
struct buf_block_reader {
|
|
public:
|
|
simdjson_inline buf_block_reader(const uint8_t *_buf, size_t _len);
|
|
simdjson_inline size_t block_index();
|
|
simdjson_inline bool has_full_block() const;
|
|
simdjson_inline const uint8_t *full_block() const;
|
|
/**
|
|
* Get the last block, padded with spaces.
|
|
*
|
|
* There will always be a last block, with at least 1 byte, unless len == 0 (in which case this
|
|
* function fills the buffer with spaces and returns 0. In particular, if len == STEP_SIZE there
|
|
* will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding.
|
|
*
|
|
* @return the number of effective characters in the last block.
|
|
*/
|
|
simdjson_inline size_t get_remainder(uint8_t *dst) const;
|
|
simdjson_inline void advance();
|
|
private:
|
|
const uint8_t *buf;
|
|
const size_t len;
|
|
const size_t lenminusstep;
|
|
size_t idx;
|
|
};
|
|
|
|
// Routines to print masks and text for debugging bitmask operations
|
|
simdjson_unused static char * format_input_text_64(const uint8_t *text) {
|
|
static char buf[sizeof(simd8x64<uint8_t>) + 1];
|
|
for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
|
|
buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]);
|
|
}
|
|
buf[sizeof(simd8x64<uint8_t>)] = '\0';
|
|
return buf;
|
|
}
|
|
|
|
// Routines to print masks and text for debugging bitmask operations
|
|
simdjson_unused static char * format_input_text(const simd8x64<uint8_t>& in) {
|
|
static char buf[sizeof(simd8x64<uint8_t>) + 1];
|
|
in.store(reinterpret_cast<uint8_t*>(buf));
|
|
for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
|
|
if (buf[i] < ' ') { buf[i] = '_'; }
|
|
}
|
|
buf[sizeof(simd8x64<uint8_t>)] = '\0';
|
|
return buf;
|
|
}
|
|
|
|
// Debugging helper: renders only the characters of a block selected by `mask`.
//
// The block is stored into a function-local static buffer (overwritten by the
// next call). Characters <= ' ' are shown as '_', and every position whose bit
// is clear in `mask` is then blanked out with a space.
//
// @param in   the block to render.
// @param mask bit i selects byte i of the block.
// @return pointer to the NUL-terminated static buffer.
simdjson_unused static char * format_input_text(const simd8x64<uint8_t>& in, uint64_t mask) {
  static char buf[sizeof(simd8x64<uint8_t>) + 1];
  in.store(reinterpret_cast<uint8_t*>(buf));
  for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
    if (buf[i] <= ' ') { buf[i] = '_'; }
    // Shift a 64-bit one: size_t(1) << i is undefined for i >= 32 where size_t is 32-bit.
    if (!(mask & (uint64_t(1) << i))) { buf[i] = ' '; }
  }
  buf[sizeof(simd8x64<uint8_t>)] = '\0';
  return buf;
}
|
|
|
|
// Debugging helper: renders a 64-bit mask as 64 characters, 'X' for a set bit
// and ' ' for a clear bit, with bit 0 first.
//
// The result lives in a function-local static buffer, so it is overwritten by
// the next call.
simdjson_unused static char * format_mask(uint64_t mask) {
  // The buffer is sized from the mask width, matching the loop and terminator below.
  constexpr size_t MASK_BITS = 64;
  static char buf[MASK_BITS + 1];
  for (size_t i=0; i<MASK_BITS; i++) {
    // Shift a 64-bit one: size_t(1) << i is undefined for i >= 32 where size_t is 32-bit.
    buf[i] = (mask & (uint64_t(1) << i)) ? 'X' : ' ';
  }
  buf[MASK_BITS] = '\0';
  return buf;
}
|
|
|
|
template<size_t STEP_SIZE>
|
|
simdjson_inline buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {}
|
|
|
|
template<size_t STEP_SIZE>
|
|
simdjson_inline size_t buf_block_reader<STEP_SIZE>::block_index() { return idx; }
|
|
|
|
template<size_t STEP_SIZE>
|
|
simdjson_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const {
|
|
return idx < lenminusstep;
|
|
}
|
|
|
|
template<size_t STEP_SIZE>
|
|
simdjson_inline const uint8_t *buf_block_reader<STEP_SIZE>::full_block() const {
|
|
return &buf[idx];
|
|
}
|
|
|
|
template<size_t STEP_SIZE>
|
|
simdjson_inline size_t buf_block_reader<STEP_SIZE>::get_remainder(uint8_t *dst) const {
|
|
if(len == idx) { return 0; } // memcpy(dst, null, 0) will trigger an error with some sanitizers
|
|
std::memset(dst, 0x20, STEP_SIZE); // std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once.
|
|
std::memcpy(dst, buf + idx, len - idx);
|
|
return len - idx;
|
|
}
|
|
|
|
template<size_t STEP_SIZE>
|
|
simdjson_inline void buf_block_reader<STEP_SIZE>::advance() {
|
|
idx += STEP_SIZE;
|
|
}
|
|
|
|
} // namespace stage1
|
|
} // unnamed namespace
|
|
} // namespace lasx
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE1_BUF_BLOCK_READER_H
|
|
/* end file generic/stage1/buf_block_reader.h for lasx */
|
|
/* including generic/stage1/json_escape_scanner.h for lasx: #include <generic/stage1/json_escape_scanner.h> */
|
|
/* begin file generic/stage1/json_escape_scanner.h for lasx */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_JSON_ESCAPE_SCANNER_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_JSON_ESCAPE_SCANNER_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/buf_block_reader.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace lasx {
|
|
namespace {
|
|
namespace stage1 {
|
|
|
|
/**
|
|
* Scans for escape characters in JSON, taking care with multiple backslashes (\\n vs. \n).
|
|
*/
|
|
struct json_escape_scanner {
|
|
/** The actual escape characters (the backslashes themselves). */
|
|
uint64_t next_is_escaped = 0ULL;
|
|
|
|
struct escaped_and_escape {
|
|
/**
|
|
* Mask of escaped characters.
|
|
*
|
|
* ```
|
|
* \n \\n \\\n \\\\n \
|
|
* 0100100010100101000
|
|
* n \ \ n \ \
|
|
* ```
|
|
*/
|
|
uint64_t escaped;
|
|
/**
|
|
* Mask of escape characters.
|
|
*
|
|
* ```
|
|
* \n \\n \\\n \\\\n \
|
|
* 1001000101001010001
|
|
* \ \ \ \ \ \ \
|
|
* ```
|
|
*/
|
|
uint64_t escape;
|
|
};
|
|
|
|
/**
|
|
* Get a mask of both escape and escaped characters (the characters following a backslash).
|
|
*
|
|
* @param potential_escape A mask of the character that can escape others (but could be
|
|
* escaped itself). e.g. block.eq('\\')
|
|
*/
|
|
simdjson_really_inline escaped_and_escape next(uint64_t backslash) noexcept {
|
|
|
|
#if !SIMDJSON_SKIP_BACKSLASH_SHORT_CIRCUIT
|
|
if (!backslash) { return {next_escaped_without_backslashes(), 0}; }
|
|
#endif
|
|
|
|
// | | Mask (shows characters instead of 1's) | Depth | Instructions |
|
|
// |--------------------------------|----------------------------------------|-------|---------------------|
|
|
// | string | `\\n_\\\n___\\\n___\\\\___\\\\__\\\` | | |
|
|
// | | ` even odd even odd odd` | | |
|
|
// | potential_escape | ` \ \\\ \\\ \\\\ \\\\ \\\` | 1 | 1 (backslash & ~first_is_escaped)
|
|
// | escape_and_terminal_code | ` \n \ \n \ \n \ \ \ \ \ \` | 5 | 5 (next_escape_and_terminal_code())
|
|
// | escaped | `\ \ n \ n \ \ \ \ \ ` X | 6 | 7 (escape_and_terminal_code ^ (potential_escape | first_is_escaped))
|
|
// | escape | ` \ \ \ \ \ \ \ \ \ \` | 6 | 8 (escape_and_terminal_code & backslash)
|
|
// | first_is_escaped | `\ ` | 7 (*) | 9 (escape >> 63) ()
|
|
// (*) this is not needed until the next iteration
|
|
uint64_t escape_and_terminal_code = next_escape_and_terminal_code(backslash & ~this->next_is_escaped);
|
|
uint64_t escaped = escape_and_terminal_code ^ (backslash | this->next_is_escaped);
|
|
uint64_t escape = escape_and_terminal_code & backslash;
|
|
this->next_is_escaped = escape >> 63;
|
|
return {escaped, escape};
|
|
}
|
|
|
|
private:
|
|
static constexpr const uint64_t ODD_BITS = 0xAAAAAAAAAAAAAAAAULL;
|
|
|
|
simdjson_really_inline uint64_t next_escaped_without_backslashes() noexcept {
|
|
uint64_t escaped = this->next_is_escaped;
|
|
this->next_is_escaped = 0;
|
|
return escaped;
|
|
}
|
|
|
|
/**
|
|
* Returns a mask of the next escape characters (masking out escaped backslashes), along with
|
|
* any non-backslash escape codes.
|
|
*
|
|
* \n \\n \\\n \\\\n returns:
|
|
* \n \ \ \n \ \
|
|
* 11 100 1011 10100
|
|
*
|
|
* You are expected to mask out the first bit yourself if the previous block had a trailing
|
|
* escape.
|
|
*
|
|
* & the result with potential_escape to get just the escape characters.
|
|
* ^ the result with (potential_escape | first_is_escaped) to get escaped characters.
|
|
*/
|
|
static simdjson_really_inline uint64_t next_escape_and_terminal_code(uint64_t potential_escape) noexcept {
|
|
// If we were to just shift and mask out any odd bits, we'd actually get a *half* right answer:
|
|
// any even-aligned backslash runs would be correct! Odd-aligned backslash runs would be
|
|
// inverted (\\\ would be 010 instead of 101).
|
|
//
|
|
// ```
|
|
// string: | ____\\\\_\\\\_____ |
|
|
// maybe_escaped | ODD | \ \ \ \ |
|
|
// even-aligned ^^^ ^^^^ odd-aligned
|
|
// ```
|
|
//
|
|
// Taking that into account, our basic strategy is:
|
|
//
|
|
// 1. Use subtraction to produce a mask with 1's for even-aligned runs and 0's for
|
|
// odd-aligned runs.
|
|
// 2. XOR all odd bits, which masks out the odd bits in even-aligned runs, and brings IN the
|
|
// odd bits in odd-aligned runs.
|
|
// 3. & with backslash to clean up any stray bits.
|
|
// runs are set to 0, and then XORing with "odd":
|
|
//
|
|
// | | Mask (shows characters instead of 1's) | Instructions |
|
|
// |--------------------------------|----------------------------------------|---------------------|
|
|
// | string | `\\n_\\\n___\\\n___\\\\___\\\\__\\\` |
|
|
// | | ` even odd even odd odd` |
|
|
// | maybe_escaped | ` n \\n \\n \\\_ \\\_ \\` X | 1 (potential_escape << 1)
|
|
// | maybe_escaped_and_odd | ` \n_ \\n _ \\\n_ _ \\\__ _\\\_ \\\` | 1 (maybe_escaped | odd)
|
|
// | even_series_codes_and_odd | ` n_\\\ _ n_ _\\\\ _ _ ` | 1 (maybe_escaped_and_odd - potential_escape)
|
|
// | escape_and_terminal_code | ` \n \ \n \ \n \ \ \ \ \ \` | 1 (^ odd)
|
|
//
|
|
|
|
// Escaped characters are characters following an escape.
|
|
uint64_t maybe_escaped = potential_escape << 1;
|
|
|
|
// To distinguish odd from even escape sequences, therefore, we turn on any *starting*
|
|
// escapes that are on an odd byte. (We actually bring in all odd bits, for speed.)
|
|
// - Odd runs of backslashes are 0000, and the code at the end ("n" in \n or \\n) is 1.
|
|
// - Odd runs of backslashes are 1111, and the code at the end ("n" in \n or \\n) is 0.
|
|
// - All other odd bytes are 1, and even bytes are 0.
|
|
uint64_t maybe_escaped_and_odd_bits = maybe_escaped | ODD_BITS;
|
|
uint64_t even_series_codes_and_odd_bits = maybe_escaped_and_odd_bits - potential_escape;
|
|
|
|
// Now we flip all odd bytes back with xor. This:
|
|
// - Makes odd runs of backslashes go from 0000 to 1010
|
|
// - Makes even runs of backslashes go from 1111 to 1010
|
|
// - Sets actually-escaped codes to 1 (the n in \n and \\n: \n = 11, \\n = 100)
|
|
// - Resets all other bytes to 0
|
|
return even_series_codes_and_odd_bits ^ ODD_BITS;
|
|
}
|
|
};
|
|
|
|
} // namespace stage1
|
|
} // unnamed namespace
|
|
} // namespace lasx
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE1_JSON_ESCAPE_SCANNER_H
|
|
/* end file generic/stage1/json_escape_scanner.h for lasx */
|
|
/* including generic/stage1/json_string_scanner.h for lasx: #include <generic/stage1/json_string_scanner.h> */
|
|
/* begin file generic/stage1/json_string_scanner.h for lasx */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_JSON_STRING_SCANNER_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_JSON_STRING_SCANNER_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/json_escape_scanner.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace lasx {
|
|
namespace {
|
|
namespace stage1 {
|
|
|
|
struct json_string_block {
|
|
// We spell out the constructors in the hope of resolving inlining issues with Visual Studio 2017
|
|
simdjson_really_inline json_string_block(uint64_t escaped, uint64_t quote, uint64_t in_string) :
|
|
_escaped(escaped), _quote(quote), _in_string(in_string) {}
|
|
|
|
// Escaped characters (characters following an escape() character)
|
|
simdjson_really_inline uint64_t escaped() const { return _escaped; }
|
|
// Real (non-backslashed) quotes
|
|
simdjson_really_inline uint64_t quote() const { return _quote; }
|
|
// Only characters inside the string (not including the quotes)
|
|
simdjson_really_inline uint64_t string_content() const { return _in_string & ~_quote; }
|
|
// Return a mask of whether the given characters are inside a string (only works on non-quotes)
|
|
simdjson_really_inline uint64_t non_quote_inside_string(uint64_t mask) const { return mask & _in_string; }
|
|
// Return a mask of whether the given characters are inside a string (only works on non-quotes)
|
|
simdjson_really_inline uint64_t non_quote_outside_string(uint64_t mask) const { return mask & ~_in_string; }
|
|
// Tail of string (everything except the start quote)
|
|
simdjson_really_inline uint64_t string_tail() const { return _in_string ^ _quote; }
|
|
|
|
// escaped characters (backslashed--does not include the hex characters after \u)
|
|
uint64_t _escaped;
|
|
// real quotes (non-escaped ones)
|
|
uint64_t _quote;
|
|
// string characters (includes start quote but not end quote)
|
|
uint64_t _in_string;
|
|
};
|
|
|
|
// Scans blocks for string characters, storing the state necessary to do so
|
|
class json_string_scanner {
|
|
public:
|
|
simdjson_really_inline json_string_block next(const simd::simd8x64<uint8_t>& in);
|
|
// Returns either UNCLOSED_STRING or SUCCESS
|
|
simdjson_really_inline error_code finish();
|
|
|
|
private:
|
|
// Scans for escape characters
|
|
json_escape_scanner escape_scanner{};
|
|
// Whether the last iteration was still inside a string (all 1's = true, all 0's = false).
|
|
uint64_t prev_in_string = 0ULL;
|
|
};
|
|
|
|
//
|
|
// Return a mask of all string characters plus end quotes.
|
|
//
|
|
// prev_escaped is overflow saying whether the next character is escaped.
|
|
// prev_in_string is overflow saying whether we're still in a string.
|
|
//
|
|
// Backslash sequences outside of quotes will be detected in stage 2.
|
|
//
|
|
simdjson_really_inline json_string_block json_string_scanner::next(const simd::simd8x64<uint8_t>& in) {
|
|
const uint64_t backslash = in.eq('\\');
|
|
const uint64_t escaped = escape_scanner.next(backslash).escaped;
|
|
const uint64_t quote = in.eq('"') & ~escaped;
|
|
|
|
//
|
|
// prefix_xor flips on bits inside the string (and flips off the end quote).
|
|
//
|
|
// Then we xor with prev_in_string: if we were in a string already, its effect is flipped
|
|
// (characters inside strings are outside, and characters outside strings are inside).
|
|
//
|
|
const uint64_t in_string = prefix_xor(quote) ^ prev_in_string;
|
|
|
|
//
|
|
// Check if we're still in a string at the end of the box so the next block will know
|
|
//
|
|
prev_in_string = uint64_t(static_cast<int64_t>(in_string) >> 63);
|
|
|
|
// Use ^ to turn the beginning quote off, and the end quote on.
|
|
|
|
// We are returning a function-local object so either we get a move constructor
|
|
// or we get copy elision.
|
|
return json_string_block(escaped, quote, in_string);
|
|
}
|
|
|
|
// Reports whether the input ended in the middle of a string: UNCLOSED_STRING if
// the last block left us inside one, SUCCESS otherwise.
simdjson_really_inline error_code json_string_scanner::finish() {
  return prev_in_string ? UNCLOSED_STRING : SUCCESS;
}
|
|
|
|
} // namespace stage1
|
|
} // unnamed namespace
|
|
} // namespace lasx
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE1_JSON_STRING_SCANNER_H
|
|
/* end file generic/stage1/json_string_scanner.h for lasx */
|
|
/* including generic/stage1/utf8_lookup4_algorithm.h for lasx: #include <generic/stage1/utf8_lookup4_algorithm.h> */
|
|
/* begin file generic/stage1/utf8_lookup4_algorithm.h for lasx */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_UTF8_LOOKUP4_ALGORITHM_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_UTF8_LOOKUP4_ALGORITHM_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/dom_parser_implementation.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace lasx {
|
|
namespace {
|
|
namespace utf8_validation {
|
|
|
|
using namespace simd;
|
|
|
|
/**
 * Classifies every pair of adjacent bytes (prev1, input) with three 16-entry
 * nibble lookups and ANDs the results together.
 *
 * Each error class owns one bit. A bit survives the final AND only when the high
 * nibble of the first byte, the low nibble of the first byte and the high nibble
 * of the second byte all allow that error, so a nonzero lane marks an invalid
 * two-byte combination.
 *
 * @param input the current bytes (second byte of each pair).
 * @param prev1 the same bytes shifted back by one position (first byte of each pair).
 * @return per-byte error bits as defined by the constants below.
 */
simdjson_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
  // Bit assignments, with the byte patterns (first byte, second byte) they flag.
  constexpr const uint8_t TOO_SHORT      = 1<<0; // 11______ 0_______  (lead byte followed by ASCII)
                                                 // 11______ 11______  (lead byte followed by lead byte)
  constexpr const uint8_t TOO_LONG       = 1<<1; // 0_______ 10______  (ASCII followed by continuation)
  constexpr const uint8_t OVERLONG_3     = 1<<2; // 11100000 100_____
  constexpr const uint8_t TOO_LARGE      = 1<<3; // 11110100 1001____
                                                 // 11110100 101_____
                                                 // 11110101 1001____
                                                 // 11110101 101_____
                                                 // 1111011_ 1001____
                                                 // 1111011_ 101_____
                                                 // 11111___ 1001____
                                                 // 11111___ 101_____
  constexpr const uint8_t SURROGATE      = 1<<4; // 11101101 101_____
  constexpr const uint8_t OVERLONG_2     = 1<<5; // 1100000_ 10______
  constexpr const uint8_t TOO_LARGE_1000 = 1<<6; // 11110101 1000____
                                                 // 1111011_ 1000____
                                                 // 11111___ 1000____
  constexpr const uint8_t OVERLONG_4     = 1<<6; // 11110000 1000____  (shares a bit with TOO_LARGE_1000)
  constexpr const uint8_t TWO_CONTS      = 1<<7; // 10______ 10______

  // Errors that do not depend on the low nibble of the first byte (____ in byte 1).
  constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS;
  // Low-nibble entry shared by every first byte of the form ____0101 and above.
  constexpr const uint8_t CARRY_TOO_LARGE = CARRY | TOO_LARGE | TOO_LARGE_1000;
  // Errors every continuation byte (10______) in second position can take part in.
  constexpr const uint8_t ANY_CONTINUATION = TOO_LONG | OVERLONG_2 | TWO_CONTS;

  // Lookup on the high nibble of the first byte.
  const simd8<uint8_t> first_high = prev1.shr<4>().lookup_16<uint8_t>(
    // 0_______ ________ <ASCII in byte 1>
    TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
    TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
    // 10______ ________ <continuation in byte 1>
    TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
    // 1100____ ________ <two byte lead in byte 1>
    TOO_SHORT | OVERLONG_2,
    // 1101____ ________ <two byte lead in byte 1>
    TOO_SHORT,
    // 1110____ ________ <three byte lead in byte 1>
    TOO_SHORT | OVERLONG_3 | SURROGATE,
    // 1111____ ________ <four+ byte lead in byte 1>
    TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
  );

  // Lookup on the low nibble of the first byte.
  const simd8<uint8_t> first_low = (prev1 & 0x0F).lookup_16<uint8_t>(
    // ____0000 ________
    CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
    // ____0001 ________
    CARRY | OVERLONG_2,
    // ____001_ ________
    CARRY,
    CARRY,

    // ____0100 ________
    CARRY | TOO_LARGE,
    // ____0101 ________
    CARRY_TOO_LARGE,
    // ____011_ ________
    CARRY_TOO_LARGE,
    CARRY_TOO_LARGE,

    // ____1___ ________
    CARRY_TOO_LARGE,
    CARRY_TOO_LARGE,
    CARRY_TOO_LARGE,
    CARRY_TOO_LARGE,
    CARRY_TOO_LARGE,
    // ____1101 ________
    CARRY_TOO_LARGE | SURROGATE,
    CARRY_TOO_LARGE,
    CARRY_TOO_LARGE
  );

  // Lookup on the high nibble of the second byte.
  const simd8<uint8_t> second_high = input.shr<4>().lookup_16<uint8_t>(
    // ________ 0_______ <ASCII in byte 2>
    TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
    TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,

    // ________ 1000____
    ANY_CONTINUATION | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
    // ________ 1001____
    ANY_CONTINUATION | OVERLONG_3 | TOO_LARGE,
    // ________ 101_____
    ANY_CONTINUATION | SURROGATE | TOO_LARGE,
    ANY_CONTINUATION | SURROGATE | TOO_LARGE,

    // ________ 11______
    TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
  );

  return (first_high & first_low & second_high);
}
|
|
/**
 * Combines the pairwise special-case result with the check on 3- and 4-byte
 * sequence lengths.
 *
 * Only bit 7 (0x80) of must_be_2_3_continuation()'s answer is kept; that is the
 * same bit check_special_cases() uses for TWO_CONTS. The XOR therefore changes
 * nothing but bit 7 of sc: it is cleared where both sides set it and raised
 * where only the continuation check sets it.
 * NOTE(review): must_be_2_3_continuation is only declared in this view;
 * presumably it flags positions that must hold the 2nd/3rd continuation byte
 * of a 3- or 4-byte character. Confirm against its definition.
 *
 * @param input      the current bytes.
 * @param prev_input the preceding bytes, used to look 2 and 3 positions back.
 * @param sc         result of check_special_cases() for the same input.
 * @return per-byte error bits; nonzero means invalid UTF-8.
 */
simdjson_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
    const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
  const simd8<uint8_t> two_back = input.prev<2>(prev_input);
  const simd8<uint8_t> three_back = input.prev<3>(prev_input);
  const simd8<uint8_t> expected_continuation =
      must_be_2_3_continuation(two_back, three_back) & uint8_t(0x80);
  return expected_continuation ^ sc;
}
|
|
|
|
//
|
|
// Return nonzero if there are incomplete multibyte characters at the end of the block:
|
|
// e.g. if there is a 4-byte character, but it's 3 bytes from the end.
|
|
//
|
|
simdjson_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input) {
|
|
// If the previous input's last 3 bytes match this, they're too short (they ended at EOF):
|
|
// ... 1111____ 111_____ 11______
|
|
#if SIMDJSON_IMPLEMENTATION_ICELAKE
|
|
static const uint8_t max_array[64] = {
|
|
255, 255, 255, 255, 255, 255, 255, 255,
|
|
255, 255, 255, 255, 255, 255, 255, 255,
|
|
255, 255, 255, 255, 255, 255, 255, 255,
|
|
255, 255, 255, 255, 255, 255, 255, 255,
|
|
255, 255, 255, 255, 255, 255, 255, 255,
|
|
255, 255, 255, 255, 255, 255, 255, 255,
|
|
255, 255, 255, 255, 255, 255, 255, 255,
|
|
255, 255, 255, 255, 255, 0xf0u-1, 0xe0u-1, 0xc0u-1
|
|
};
|
|
#else
|
|
static const uint8_t max_array[32] = {
|
|
255, 255, 255, 255, 255, 255, 255, 255,
|
|
255, 255, 255, 255, 255, 255, 255, 255,
|
|
255, 255, 255, 255, 255, 255, 255, 255,
|
|
255, 255, 255, 255, 255, 0xf0u-1, 0xe0u-1, 0xc0u-1
|
|
};
|
|
#endif
|
|
const simd8<uint8_t> max_value(&max_array[sizeof(max_array)-sizeof(simd8<uint8_t>)]);
|
|
return input.gt_bits(max_value);
|
|
}
|
|
|
|
struct utf8_checker {
|
|
// If this is nonzero, there has been a UTF-8 error.
|
|
simd8<uint8_t> error;
|
|
// The last input we received
|
|
simd8<uint8_t> prev_input_block;
|
|
// Whether the last input we received was incomplete (used for ASCII fast path)
|
|
simd8<uint8_t> prev_incomplete;
|
|
|
|
//
|
|
// Check whether the current bytes are valid UTF-8.
|
|
//
|
|
simdjson_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
|
|
// Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
|
|
// (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
|
|
simd8<uint8_t> prev1 = input.prev<1>(prev_input);
|
|
simd8<uint8_t> sc = check_special_cases(input, prev1);
|
|
this->error |= check_multibyte_lengths(input, prev_input, sc);
|
|
}
|
|
|
|
// The only problem that can happen at EOF is that a multibyte character is too short
|
|
// or a byte value too large in the last bytes: check_special_cases only checks for bytes
|
|
// too large in the first of two bytes.
|
|
simdjson_inline void check_eof() {
|
|
// If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
|
|
// possibly finish them.
|
|
this->error |= this->prev_incomplete;
|
|
}
|
|
|
|
simdjson_inline void check_next_input(const simd8x64<uint8_t>& input) {
|
|
if(simdjson_likely(is_ascii(input))) {
|
|
this->error |= this->prev_incomplete;
|
|
} else {
|
|
// you might think that a for-loop would work, but under Visual Studio, it is not good enough.
|
|
static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 1)
|
|
||(simd8x64<uint8_t>::NUM_CHUNKS == 2)
|
|
|| (simd8x64<uint8_t>::NUM_CHUNKS == 4),
|
|
"We support one, two or four chunks per 64-byte block.");
|
|
SIMDJSON_IF_CONSTEXPR (simd8x64<uint8_t>::NUM_CHUNKS == 1) {
|
|
this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
|
|
} else SIMDJSON_IF_CONSTEXPR (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
|
|
this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
|
|
this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
|
|
} else SIMDJSON_IF_CONSTEXPR (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
|
|
this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
|
|
this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
|
|
this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
|
|
this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
|
|
}
|
|
this->prev_incomplete = is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1]);
|
|
this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1];
|
|
}
|
|
}
|
|
// do not forget to call check_eof!
|
|
simdjson_inline error_code errors() {
|
|
return this->error.any_bits_set_anywhere() ? error_code::UTF8_ERROR : error_code::SUCCESS;
|
|
}
|
|
|
|
}; // struct utf8_checker
|
|
} // namespace utf8_validation
|
|
|
|
} // unnamed namespace
|
|
} // namespace lasx
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE1_UTF8_LOOKUP4_ALGORITHM_H
|
|
/* end file generic/stage1/utf8_lookup4_algorithm.h for lasx */
|
|
/* including generic/stage1/json_scanner.h for lasx: #include <generic/stage1/json_scanner.h> */
|
|
/* begin file generic/stage1/json_scanner.h for lasx */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_JSON_SCANNER_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_JSON_SCANNER_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/json_character_block.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/json_string_scanner.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace lasx {
|
|
namespace {
|
|
namespace stage1 {
|
|
|
|
/**
|
|
* A block of scanned json, with information on operators and scalars.
|
|
*
|
|
* We seek to identify pseudo-structural characters. Anything that is inside
|
|
* a string must be omitted (hence & ~_string.string_tail()).
|
|
* Otherwise, pseudo-structural characters come in two forms.
|
|
* 1. We have the structural characters ([,],{,},:, comma). The
|
|
* term 'structural character' is from the JSON RFC.
|
|
* 2. We have the 'scalar pseudo-structural characters'.
|
|
* Scalars are quotes, and any character except structural characters and white space.
|
|
*
|
|
* To identify the scalar pseudo-structural characters, we must look at what comes
|
|
* before them: it must be a space, a quote or a structural characters.
|
|
* Starting with simdjson v0.3, we identify them by
|
|
* negation: we identify everything that is followed by a non-quote scalar,
|
|
* and we negate that. Whatever remains must be a 'scalar pseudo-structural character'.
|
|
*/
|
|
struct json_block {
|
|
public:
|
|
// We spell out the constructors in the hope of resolving inlining issues with Visual Studio 2017
|
|
simdjson_inline json_block(json_string_block&& string, json_character_block characters, uint64_t follows_potential_nonquote_scalar) :
|
|
_string(std::move(string)), _characters(characters), _follows_potential_nonquote_scalar(follows_potential_nonquote_scalar) {}
|
|
simdjson_inline json_block(json_string_block string, json_character_block characters, uint64_t follows_potential_nonquote_scalar) :
|
|
_string(string), _characters(characters), _follows_potential_nonquote_scalar(follows_potential_nonquote_scalar) {}
|
|
|
|
/**
|
|
* The start of structurals.
|
|
* In simdjson prior to v0.3, these were called the pseudo-structural characters.
|
|
**/
|
|
simdjson_inline uint64_t structural_start() const noexcept { return potential_structural_start() & ~_string.string_tail(); }
|
|
/** All JSON whitespace (i.e. not in a string) */
|
|
simdjson_inline uint64_t whitespace() const noexcept { return non_quote_outside_string(_characters.whitespace()); }
|
|
|
|
// Helpers
|
|
|
|
/** Whether the given characters are inside a string (only works on non-quotes) */
|
|
simdjson_inline uint64_t non_quote_inside_string(uint64_t mask) const noexcept { return _string.non_quote_inside_string(mask); }
|
|
/** Whether the given characters are outside a string (only works on non-quotes) */
|
|
simdjson_inline uint64_t non_quote_outside_string(uint64_t mask) const noexcept { return _string.non_quote_outside_string(mask); }
|
|
|
|
// string and escape characters
|
|
json_string_block _string;
|
|
// whitespace, structural characters ('operators'), scalars
|
|
json_character_block _characters;
|
|
// whether the previous character was a scalar
|
|
uint64_t _follows_potential_nonquote_scalar;
|
|
private:
|
|
// Potential structurals (i.e. disregarding strings)
|
|
|
|
/**
|
|
* structural elements ([,],{,},:, comma) plus scalar starts like 123, true and "abc".
|
|
* They may reside inside a string.
|
|
**/
|
|
simdjson_inline uint64_t potential_structural_start() const noexcept { return _characters.op() | potential_scalar_start(); }
|
|
/**
|
|
* The start of non-operator runs, like 123, true and "abc".
|
|
* It main reside inside a string.
|
|
**/
|
|
simdjson_inline uint64_t potential_scalar_start() const noexcept {
|
|
// The term "scalar" refers to anything except structural characters and white space
|
|
// (so letters, numbers, quotes).
|
|
// Whenever it is preceded by something that is not a structural element ({,},[,],:, ") nor a white-space
|
|
// then we know that it is irrelevant structurally.
|
|
return _characters.scalar() & ~follows_potential_scalar();
|
|
}
|
|
/**
|
|
* Whether the given character is immediately after a non-operator like 123, true.
|
|
* The characters following a quote are not included.
|
|
*/
|
|
simdjson_inline uint64_t follows_potential_scalar() const noexcept {
|
|
// _follows_potential_nonquote_scalar: is defined as marking any character that follows a character
|
|
// that is not a structural element ({,},[,],:, comma) nor a quote (") and that is not a
|
|
// white space.
|
|
// It is understood that within quoted region, anything at all could be marked (irrelevant).
|
|
return _follows_potential_nonquote_scalar;
|
|
}
|
|
};
|
|
|
|
/**
|
|
* Scans JSON for important bits: structural characters or 'operators', strings, and scalars.
|
|
*
|
|
* The scanner starts by calculating two distinct things:
|
|
* - string characters (taking \" into account)
|
|
* - structural characters or 'operators' ([]{},:, comma)
|
|
* and scalars (runs of non-operators like 123, true and "abc")
|
|
*
|
|
* To minimize data dependency (a key component of the scanner's speed), it finds these in parallel:
|
|
* in particular, the operator/scalar bit will find plenty of things that are actually part of
|
|
* strings. When we're done, json_block will fuse the two together by masking out tokens that are
|
|
* part of a string.
|
|
*/
|
|
class json_scanner {
|
|
public:
|
|
json_scanner() = default;
|
|
simdjson_inline json_block next(const simd::simd8x64<uint8_t>& in);
|
|
// Returns either UNCLOSED_STRING or SUCCESS
|
|
simdjson_inline error_code finish();
|
|
|
|
private:
|
|
// Whether the last character of the previous iteration is part of a scalar token
|
|
// (anything except whitespace or a structural character/'operator').
|
|
uint64_t prev_scalar = 0ULL;
|
|
json_string_scanner string_scanner{};
|
|
};
|
|
|
|
|
|
//
|
|
// Check if the current character immediately follows a matching character.
|
|
//
|
|
// For example, this checks for quotes with backslashes in front of them:
|
|
//
|
|
// const uint64_t backslashed_quote = in.eq('"') & immediately_follows(in.eq('\'), prev_backslash);
|
|
//
|
|
simdjson_inline uint64_t follows(const uint64_t match, uint64_t &overflow) {
|
|
const uint64_t result = match << 1 | overflow;
|
|
overflow = match >> 63;
|
|
return result;
|
|
}
|
|
|
|
simdjson_inline json_block json_scanner::next(const simd::simd8x64<uint8_t>& in) {
  json_string_block string_info = string_scanner.next(in);
  // Classifies the white space and the structural characters.
  json_character_block char_classes = json_character_block::classify(in);

  // "Scalar" means anything except structural characters and white space
  // (so letters, numbers, quotes). The mask handed to json_block must mark
  // whatever follows a *non-quote* scalar (so letters and numbers).
  //
  // A terminal quote should be followed by a structural character (comma,
  // brace, bracket, colon) or by nothing. Even so, in ' "a string"true ' the
  // 't' of 'true' must still be flagged as a potential pseudo-structural
  // character, exactly as it would be in ' "a string" true '; otherwise an
  // extra check might be needed when parsing strings. That is why quotes are
  // excluded here.
  //
  // Performance: there are many ways to skin this cat.
  const uint64_t quotes = string_info.quote();
  const uint64_t nonquote_scalar = char_classes.scalar() & ~quotes;
  const uint64_t after_nonquote_scalar = follows(nonquote_scalar, prev_scalar);

  // string_info is a function-local object, so it either moves or the copy is elided.
  return json_block(string_info, char_classes, after_nonquote_scalar);
}
|
|
|
|
// End-of-input check: simply reports whatever the string scanner reports.
simdjson_inline error_code json_scanner::finish() {
  const error_code string_status = string_scanner.finish();
  return string_status;
}
|
|
|
|
} // namespace stage1
|
|
} // unnamed namespace
|
|
} // namespace lasx
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE1_JSON_SCANNER_H
|
|
/* end file generic/stage1/json_scanner.h for lasx */
|
|
|
|
// All other declarations
|
|
/* including generic/stage1/find_next_document_index.h for lasx: #include <generic/stage1/find_next_document_index.h> */
|
|
/* begin file generic/stage1/find_next_document_index.h for lasx */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_FIND_NEXT_DOCUMENT_INDEX_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_FIND_NEXT_DOCUMENT_INDEX_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/generic/dom_parser_implementation.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace lasx {
|
|
namespace {
|
|
namespace stage1 {
|
|
|
|
/**
|
|
* This algorithm is used to quickly identify the last structural position that
|
|
* makes up a complete document.
|
|
*
|
|
* It does this by going backwards and finding the last *document boundary* (a
|
|
* place where one value follows another without a comma between them). If the
|
|
* last document (the characters after the boundary) has an equal number of
|
|
* start and end brackets, it is considered complete.
|
|
*
|
|
* Simply put, we iterate over the structural characters, starting from
|
|
* the end. We consider that we found the end of a JSON document when the
|
|
* first element of the pair is NOT one of these characters: '{' '[' ':' ','
|
|
* and when the second element is NOT one of these characters: '}' ']' ':' ','.
|
|
*
|
|
* This simple comparison works most of the time, but it does not cover cases
|
|
* where the batch's structural indexes contain a perfect amount of documents.
|
|
* In such a case, we do not have access to the structural index which follows
|
|
* the last document, therefore, we do not have access to the second element in
|
|
* the pair, and that means we cannot identify the last document. To fix this
|
|
* issue, we keep a count of the open and closed curly/square braces we found
|
|
* while searching for the pair. When we find a pair AND the count of open and
|
|
* closed curly/square braces is the same, we know that we just passed a
|
|
* complete document, therefore the last json buffer location is the end of the
|
|
* batch.
|
|
*/
|
|
simdjson_inline uint32_t find_next_document_index(dom_parser_implementation &parser) {
|
|
// Variant: do not count separately, just figure out depth
|
|
if(parser.n_structural_indexes == 0) { return 0; }
|
|
auto arr_cnt = 0;
|
|
auto obj_cnt = 0;
|
|
for (auto i = parser.n_structural_indexes - 1; i > 0; i--) {
|
|
auto idxb = parser.structural_indexes[i];
|
|
switch (parser.buf[idxb]) {
|
|
case ':':
|
|
case ',':
|
|
continue;
|
|
case '}':
|
|
obj_cnt--;
|
|
continue;
|
|
case ']':
|
|
arr_cnt--;
|
|
continue;
|
|
case '{':
|
|
obj_cnt++;
|
|
break;
|
|
case '[':
|
|
arr_cnt++;
|
|
break;
|
|
}
|
|
auto idxa = parser.structural_indexes[i - 1];
|
|
switch (parser.buf[idxa]) {
|
|
case '{':
|
|
case '[':
|
|
case ':':
|
|
case ',':
|
|
continue;
|
|
}
|
|
// Last document is complete, so the next document will appear after!
|
|
if (!arr_cnt && !obj_cnt) {
|
|
return parser.n_structural_indexes;
|
|
}
|
|
// Last document is incomplete; mark the document at i + 1 as the next one
|
|
return i;
|
|
}
|
|
// If we made it to the end, we want to finish counting to see if we have a full document.
|
|
switch (parser.buf[parser.structural_indexes[0]]) {
|
|
case '}':
|
|
obj_cnt--;
|
|
break;
|
|
case ']':
|
|
arr_cnt--;
|
|
break;
|
|
case '{':
|
|
obj_cnt++;
|
|
break;
|
|
case '[':
|
|
arr_cnt++;
|
|
break;
|
|
}
|
|
if (!arr_cnt && !obj_cnt) {
|
|
// We have a complete document.
|
|
return parser.n_structural_indexes;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
} // namespace stage1
|
|
} // unnamed namespace
|
|
} // namespace lasx
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE1_FIND_NEXT_DOCUMENT_INDEX_H
|
|
/* end file generic/stage1/find_next_document_index.h for lasx */
|
|
/* including generic/stage1/json_minifier.h for lasx: #include <generic/stage1/json_minifier.h> */
|
|
/* begin file generic/stage1/json_minifier.h for lasx */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_JSON_MINIFIER_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_JSON_MINIFIER_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/json_scanner.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/buf_block_reader.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
// This file contains the common code every implementation uses in stage1
|
|
// It is intended to be included multiple times and compiled multiple times
|
|
// We assume the file in which it is included already includes
|
|
// "simdjson/stage1.h" (this simplifies amalgamation)
|
|
|
|
namespace simdjson {
|
|
namespace lasx {
|
|
namespace {
|
|
namespace stage1 {
|
|
|
|
class json_minifier {
|
|
public:
|
|
template<size_t STEP_SIZE>
|
|
static error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) noexcept;
|
|
|
|
private:
|
|
simdjson_inline json_minifier(uint8_t *_dst)
|
|
: dst{_dst}
|
|
{}
|
|
template<size_t STEP_SIZE>
|
|
simdjson_inline void step(const uint8_t *block_buf, buf_block_reader<STEP_SIZE> &reader) noexcept;
|
|
simdjson_inline void next(const simd::simd8x64<uint8_t>& in, const json_block& block);
|
|
simdjson_inline error_code finish(uint8_t *dst_start, size_t &dst_len);
|
|
json_scanner scanner{};
|
|
uint8_t *dst;
|
|
};
|
|
|
|
// Hands the block's whitespace mask to compress() and advances dst by the
// amount compress() reports.
// NOTE(review): presumably compress() drops the masked (whitespace) bytes and
// returns the number of bytes written - confirm against simd8x64::compress.
simdjson_inline void json_minifier::next(const simd::simd8x64<uint8_t>& in, const json_block& block) {
  const uint64_t whitespace_mask = block.whitespace();
  const auto advanced = in.compress(whitespace_mask, dst);
  dst += advanced;
}
|
|
|
|
// Finalizes minification. On scanner failure dst_len is zeroed and the error
// is returned; otherwise dst_len is the distance from dst_start to the current
// write position.
simdjson_inline error_code json_minifier::finish(uint8_t *dst_start, size_t &dst_len) {
  const error_code scan_status = scanner.finish();
  if (!scan_status) {
    dst_len = dst - dst_start;
    return SUCCESS;
  }
  dst_len = 0;
  return scan_status;
}
|
|
|
|
template<>
|
|
simdjson_inline void json_minifier::step<128>(const uint8_t *block_buf, buf_block_reader<128> &reader) noexcept {
|
|
simd::simd8x64<uint8_t> in_1(block_buf);
|
|
simd::simd8x64<uint8_t> in_2(block_buf+64);
|
|
json_block block_1 = scanner.next(in_1);
|
|
json_block block_2 = scanner.next(in_2);
|
|
this->next(in_1, block_1);
|
|
this->next(in_2, block_2);
|
|
reader.advance();
|
|
}
|
|
|
|
template<>
|
|
simdjson_inline void json_minifier::step<64>(const uint8_t *block_buf, buf_block_reader<64> &reader) noexcept {
|
|
simd::simd8x64<uint8_t> in_1(block_buf);
|
|
json_block block_1 = scanner.next(in_1);
|
|
this->next(block_buf, block_1);
|
|
reader.advance();
|
|
}
|
|
|
|
template<size_t STEP_SIZE>
|
|
error_code json_minifier::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) noexcept {
|
|
buf_block_reader<STEP_SIZE> reader(buf, len);
|
|
json_minifier minifier(dst);
|
|
|
|
// Index the first n-1 blocks
|
|
while (reader.has_full_block()) {
|
|
minifier.step<STEP_SIZE>(reader.full_block(), reader);
|
|
}
|
|
|
|
// Index the last (remainder) block, padded with spaces
|
|
uint8_t block[STEP_SIZE];
|
|
size_t remaining_bytes = reader.get_remainder(block);
|
|
if (remaining_bytes > 0) {
|
|
// We do not want to write directly to the output stream. Rather, we write
|
|
// to a local buffer (for safety).
|
|
uint8_t out_block[STEP_SIZE];
|
|
uint8_t * const guarded_dst{minifier.dst};
|
|
minifier.dst = out_block;
|
|
minifier.step<STEP_SIZE>(block, reader);
|
|
size_t to_write = minifier.dst - out_block;
|
|
// In some cases, we could be enticed to consider the padded spaces
|
|
// as part of the string. This is fine as long as we do not write more
|
|
// than we consumed.
|
|
if(to_write > remaining_bytes) { to_write = remaining_bytes; }
|
|
memcpy(guarded_dst, out_block, to_write);
|
|
minifier.dst = guarded_dst + to_write;
|
|
}
|
|
return minifier.finish(dst, dst_len);
|
|
}
|
|
|
|
} // namespace stage1
|
|
} // unnamed namespace
|
|
} // namespace lasx
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE1_JSON_MINIFIER_H
|
|
/* end file generic/stage1/json_minifier.h for lasx */
|
|
/* including generic/stage1/json_structural_indexer.h for lasx: #include <generic/stage1/json_structural_indexer.h> */
|
|
/* begin file generic/stage1/json_structural_indexer.h for lasx */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_JSON_STRUCTURAL_INDEXER_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_JSON_STRUCTURAL_INDEXER_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/utf8_lookup4_algorithm.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/buf_block_reader.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/json_string_scanner.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/json_scanner.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/json_minifier.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/find_next_document_index.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
// This file contains the common code every implementation uses in stage1
|
|
// It is intended to be included multiple times and compiled multiple times
|
|
// We assume the file in which it is included already includes
|
|
// "simdjson/stage1.h" (this simplifies amalgamation)
|
|
|
|
namespace simdjson {
|
|
namespace lasx {
|
|
namespace {
|
|
namespace stage1 {
|
|
|
|
class bit_indexer {
|
|
public:
|
|
uint32_t *tail;
|
|
|
|
simdjson_inline bit_indexer(uint32_t *index_buf) : tail(index_buf) {}
|
|
|
|
#if SIMDJSON_PREFER_REVERSE_BITS
|
|
/**
|
|
* ARM lacks a fast trailing zero instruction, but it has a fast
|
|
* bit reversal instruction and a fast leading zero instruction.
|
|
* Thus it may be profitable to reverse the bits (once) and then
|
|
* to rely on a sequence of instructions that call the leading
|
|
* zero instruction.
|
|
*
|
|
* Performance notes:
|
|
* The chosen routine is not optimal in terms of data dependency
|
|
* since zero_leading_bit might require two instructions. However,
|
|
* it tends to minimize the total number of instructions which is
|
|
* beneficial.
|
|
*/
|
|
simdjson_inline void write_index(uint32_t idx, uint64_t& rev_bits, int i) {
|
|
int lz = leading_zeroes(rev_bits);
|
|
this->tail[i] = static_cast<uint32_t>(idx) + lz;
|
|
rev_bits = zero_leading_bit(rev_bits, lz);
|
|
}
|
|
#else
|
|
/**
|
|
* Under recent x64 systems, we often have both a fast trailing zero
|
|
* instruction and a fast 'clear-lower-bit' instruction so the following
|
|
* algorithm can be competitive.
|
|
*/
|
|
|
|
simdjson_inline void write_index(uint32_t idx, uint64_t& bits, int i) {
|
|
this->tail[i] = idx + trailing_zeroes(bits);
|
|
bits = clear_lowest_bit(bits);
|
|
}
|
|
#endif // SIMDJSON_PREFER_REVERSE_BITS
|
|
|
|
template <int START, int N>
|
|
simdjson_inline int write_indexes(uint32_t idx, uint64_t& bits) {
|
|
write_index(idx, bits, START);
|
|
SIMDJSON_IF_CONSTEXPR (N > 1) {
|
|
write_indexes<(N-1>0?START+1:START), (N-1>=0?N-1:1)>(idx, bits);
|
|
}
|
|
return START+N;
|
|
}
|
|
|
|
template <int START, int END, int STEP>
|
|
simdjson_inline int write_indexes_stepped(uint32_t idx, uint64_t& bits, int cnt) {
|
|
write_indexes<START, STEP>(idx, bits);
|
|
SIMDJSON_IF_CONSTEXPR ((START+STEP) < END) {
|
|
if (simdjson_unlikely((START+STEP) < cnt)) {
|
|
write_indexes_stepped<(START+STEP<END?START+STEP:END), END, STEP>(idx, bits, cnt);
|
|
}
|
|
}
|
|
return ((END-START) % STEP) == 0 ? END : (END-START) - ((END-START) % STEP) + STEP;
|
|
}
|
|
|
|
// flatten out values in 'bits' assuming that they are are to have values of idx
|
|
// plus their position in the bitvector, and store these indexes at
|
|
// base_ptr[base] incrementing base as we go
|
|
// will potentially store extra values beyond end of valid bits, so base_ptr
|
|
// needs to be large enough to handle this
|
|
//
|
|
// If the kernel sets SIMDJSON_GENERIC_JSON_STRUCTURAL_INDEXER_CUSTOM_BIT_INDEXER, then it
|
|
// will provide its own version of the code.
|
|
#ifdef SIMDJSON_GENERIC_JSON_STRUCTURAL_INDEXER_CUSTOM_BIT_INDEXER
|
|
simdjson_inline void write(uint32_t idx, uint64_t bits);
|
|
#else
|
|
simdjson_inline void write(uint32_t idx, uint64_t bits) {
|
|
// In some instances, the next branch is expensive because it is mispredicted.
|
|
// Unfortunately, in other cases,
|
|
// it helps tremendously.
|
|
if (bits == 0)
|
|
return;
|
|
|
|
int cnt = static_cast<int>(count_ones(bits));
|
|
|
|
#if SIMDJSON_PREFER_REVERSE_BITS
|
|
bits = reverse_bits(bits);
|
|
#endif
|
|
#ifdef SIMDJSON_STRUCTURAL_INDEXER_STEP
|
|
static constexpr const int STEP = SIMDJSON_STRUCTURAL_INDEXER_STEP;
|
|
#else
|
|
static constexpr const int STEP = 4;
|
|
#endif
|
|
static constexpr const int STEP_UNTIL = 24;
|
|
|
|
write_indexes_stepped<0, STEP_UNTIL, STEP>(idx, bits, cnt);
|
|
SIMDJSON_IF_CONSTEXPR (STEP_UNTIL < 64) {
|
|
if (simdjson_unlikely(STEP_UNTIL < cnt)) {
|
|
for (int i=STEP_UNTIL; i<cnt; i++) {
|
|
write_index(idx, bits, i);
|
|
}
|
|
}
|
|
}
|
|
|
|
this->tail += cnt;
|
|
}
|
|
#endif // SIMDJSON_GENERIC_JSON_STRUCTURAL_INDEXER_CUSTOM_BIT_INDEXER
|
|
|
|
};
|
|
|
|
class json_structural_indexer {
|
|
public:
|
|
/**
|
|
* Find the important bits of JSON in a 128-byte chunk, and add them to structural_indexes.
|
|
*
|
|
* @param partial Setting the partial parameter to true allows the find_structural_bits to
|
|
* tolerate unclosed strings. The caller should still ensure that the input is valid UTF-8. If
|
|
* you are processing substrings, you may want to call on a function like trimmed_length_safe_utf8.
|
|
*/
|
|
template<size_t STEP_SIZE>
|
|
static error_code index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, stage1_mode partial) noexcept;
|
|
|
|
private:
|
|
simdjson_inline json_structural_indexer(uint32_t *structural_indexes);
|
|
template<size_t STEP_SIZE>
|
|
simdjson_inline void step(const uint8_t *block, buf_block_reader<STEP_SIZE> &reader) noexcept;
|
|
simdjson_inline void next(const simd::simd8x64<uint8_t>& in, const json_block& block, size_t idx);
|
|
simdjson_inline error_code finish(dom_parser_implementation &parser, size_t idx, size_t len, stage1_mode partial);
|
|
|
|
json_scanner scanner{};
|
|
utf8_checker checker{};
|
|
bit_indexer indexer;
|
|
uint64_t prev_structurals = 0;
|
|
uint64_t unescaped_chars_error = 0;
|
|
};
|
|
|
|
// Points the bit indexer at the start of the structural index buffer; all
// other members keep their in-class defaults.
simdjson_inline json_structural_indexer::json_structural_indexer(uint32_t *structural_indexes)
  : indexer{structural_indexes}
{}
|
|
|
|
// Skip the last character if it is partial
|
|
simdjson_inline size_t trim_partial_utf8(const uint8_t *buf, size_t len) {
|
|
if (simdjson_unlikely(len < 3)) {
|
|
switch (len) {
|
|
case 2:
|
|
if (buf[len-1] >= 0xc0) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left
|
|
if (buf[len-2] >= 0xe0) { return len-2; } // 3- and 4-byte characters with only 2 bytes left
|
|
return len;
|
|
case 1:
|
|
if (buf[len-1] >= 0xc0) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left
|
|
return len;
|
|
case 0:
|
|
return len;
|
|
}
|
|
}
|
|
if (buf[len-1] >= 0xc0) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte left
|
|
if (buf[len-2] >= 0xe0) { return len-2; } // 3- and 4-byte characters with only 1 byte left
|
|
if (buf[len-3] >= 0xf0) { return len-3; } // 4-byte characters with only 3 bytes left
|
|
return len;
|
|
}
|
|
|
|
//
|
|
// PERF NOTES:
|
|
// We pipe 2 inputs through these stages:
|
|
// 1. Load JSON into registers. This takes a long time and is highly parallelizable, so we load
|
|
// 2 inputs' worth at once so that by the time step 2 is looking for the input, it's available.
|
|
// 2. Scan the JSON for critical data: strings, scalars and operators. This is the critical path.
|
|
// The output of step 1 depends entirely on this information. These functions don't quite use
|
|
// up enough CPU: the second half of the functions is highly serial, only using 1 execution core
|
|
// at a time. The second input's scans has some dependency on the first ones finishing it, but
|
|
// they can make a lot of progress before they need that information.
|
|
// 3. Step 1 does not use enough capacity, so we run some extra stuff while we're waiting for that
|
|
// to finish: utf-8 checks and generating the output from the last iteration.
|
|
//
|
|
// The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all
|
|
// available capacity with just one input. Running 2 at a time seems to give the CPU a good enough
|
|
// workout.
|
|
//
|
|
template<size_t STEP_SIZE>
|
|
error_code json_structural_indexer::index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, stage1_mode partial) noexcept {
|
|
if (simdjson_unlikely(len > parser.capacity())) { return CAPACITY; }
|
|
// We guard the rest of the code so that we can assume that len > 0 throughout.
|
|
if (len == 0) { return EMPTY; }
|
|
if (is_streaming(partial)) {
|
|
len = trim_partial_utf8(buf, len);
|
|
// If you end up with an empty window after trimming
|
|
// the partial UTF-8 bytes, then chances are good that you
|
|
// have an UTF-8 formatting error.
|
|
if(len == 0) { return UTF8_ERROR; }
|
|
}
|
|
buf_block_reader<STEP_SIZE> reader(buf, len);
|
|
json_structural_indexer indexer(parser.structural_indexes.get());
|
|
|
|
// Read all but the last block
|
|
while (reader.has_full_block()) {
|
|
indexer.step<STEP_SIZE>(reader.full_block(), reader);
|
|
}
|
|
// Take care of the last block (will always be there unless file is empty which is
|
|
// not supposed to happen.)
|
|
uint8_t block[STEP_SIZE];
|
|
if (simdjson_unlikely(reader.get_remainder(block) == 0)) { return UNEXPECTED_ERROR; }
|
|
indexer.step<STEP_SIZE>(block, reader);
|
|
return indexer.finish(parser, reader.block_index(), len, partial);
|
|
}
|
|
|
|
template<>
|
|
simdjson_inline void json_structural_indexer::step<128>(const uint8_t *block, buf_block_reader<128> &reader) noexcept {
|
|
simd::simd8x64<uint8_t> in_1(block);
|
|
simd::simd8x64<uint8_t> in_2(block+64);
|
|
json_block block_1 = scanner.next(in_1);
|
|
json_block block_2 = scanner.next(in_2);
|
|
this->next(in_1, block_1, reader.block_index());
|
|
this->next(in_2, block_2, reader.block_index()+64);
|
|
reader.advance();
|
|
}
|
|
|
|
template<>
|
|
simdjson_inline void json_structural_indexer::step<64>(const uint8_t *block, buf_block_reader<64> &reader) noexcept {
|
|
simd::simd8x64<uint8_t> in_1(block);
|
|
json_block block_1 = scanner.next(in_1);
|
|
this->next(in_1, block_1, reader.block_index());
|
|
reader.advance();
|
|
}
|
|
|
|
// Handles one 64-byte input located at byte offset idx. The structurals that
// get written are those of the *previous* input (hence the idx-64 base); this
// input's structurals are stashed for the next call.
simdjson_inline void json_structural_indexer::next(const simd::simd8x64<uint8_t>& in, const json_block& block, size_t idx) {
  // Bytes at or below 0x1F; they are an error only when inside a string.
  const uint64_t control_chars = in.lteq(0x1F);
#if SIMDJSON_UTF8VALIDATION
  checker.check_next_input(in);
#endif
  // Output *last* iteration's structurals to the parser
  const uint32_t previous_base = uint32_t(idx-64);
  indexer.write(previous_base, prev_structurals);
  prev_structurals = block.structural_start();
  unescaped_chars_error |= block.non_quote_inside_string(control_chars);
}
|
|
|
|
// Completes stage 1: flushes the pending structurals, checks the scanner
// status and unescaped characters, writes the padding entries after the last
// structural, applies the streaming-mode truncation, and finally returns the
// UTF-8 checker's verdict.
simdjson_inline error_code json_structural_indexer::finish(dom_parser_implementation &parser, size_t idx, size_t len, stage1_mode partial) {
  // Write out the final iteration's structurals
  indexer.write(uint32_t(idx-64), prev_structurals);

  const error_code scan_status = scanner.finish();
  const bool have_unclosed_string = (scan_status == UNCLOSED_STRING);
  // In streaming modes an unclosed string is tolerated; in every mode any
  // other non-SUCCESS status makes us stop here.
  const bool tolerated = is_streaming(partial) && have_unclosed_string;
  if (simdjson_unlikely((scan_status != SUCCESS) && !tolerated)) { return scan_status; }

  if (unescaped_chars_error) {
    return UNESCAPED_CHARS;
  }

  parser.n_structural_indexes = uint32_t(indexer.tail - parser.structural_indexes.get());
  /***
   * The On Demand API requires special padding.
   *
   * This is related to https://github.com/simdjson/simdjson/issues/906
   * The goal is that parsing which runs past the last (valid) structural
   * character stops quickly. Only three structural characters can be repeated
   * without triggering an error in JSON: [,] and }. The padding repeats the
   * character at 'len'. Its value is unknown, but for parsing to continue it
   * would have to be [,] or }.
   * Suppose it is ] or }. Going back to the first character: it cannot be ] or }
   * (a document cannot start that way), nor a comma, a colon or any simple
   * value. So parsing could only continue if the repeated character is [, in
   * which case the document must start with [ and therefore should end with ].
   * Enforcing that rule yields ][[ which is invalid.
   *
   * This is illustrated with the test array_iterate_unclosed_error() on the following input:
   * R"({ "a": [,,)"
   **/
  const uint32_t end_of_input = uint32_t(len);
  parser.structural_indexes[parser.n_structural_indexes] = end_of_input; // used later in partial == stage1_mode::streaming_final
  parser.structural_indexes[parser.n_structural_indexes + 1] = end_of_input;
  parser.structural_indexes[parser.n_structural_indexes + 2] = 0;
  parser.next_structural_index = 0;

  // a valid JSON file cannot have zero structural indexes - we should have found something
  if (simdjson_unlikely(parser.n_structural_indexes == 0u)) {
    return EMPTY;
  }
  if (simdjson_unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) {
    return UNEXPECTED_ERROR;
  }

  if (partial == stage1_mode::streaming_partial) {
    // If we have an unclosed string, then the last structural
    // will be the quote and we want to make sure to omit it.
    if (have_unclosed_string) {
      parser.n_structural_indexes--;
      // a valid JSON file cannot have zero structural indexes - we should have found something
      if (simdjson_unlikely(parser.n_structural_indexes == 0u)) { return CAPACITY; }
    }
    // We truncate the input to the end of the last complete document (or zero).
    const auto complete_count = find_next_document_index(parser);
    const bool nothing_complete = (complete_count == 0) && (parser.n_structural_indexes > 0);
    if (nothing_complete) {
      if (parser.structural_indexes[0] != 0) {
        // It is possible that the document could be parsed, we just had a lot
        // of white space.
        parser.n_structural_indexes = 0;
        return EMPTY;
      }
      // If the buffer is partial and we started at index 0 but the document is
      // incomplete, it's too big to parse.
      return CAPACITY;
    }

    parser.n_structural_indexes = complete_count;
  } else if (partial == stage1_mode::streaming_final) {
    if (have_unclosed_string) { parser.n_structural_indexes--; }
    // We truncate the input to the end of the last complete document (or zero).
    // Because partial == stage1_mode::streaming_final, it means that we may
    // silently ignore trailing garbage. Though it sounds bad, we do it
    // deliberately because many people who have streams of JSON documents
    // will truncate them for processing. E.g., imagine that you are uncompressing
    // the data from a size file or receiving it in chunks from the network. You
    // may not know where exactly the last document will be. Meanwhile the
    // document_stream instances allow people to know the JSON documents they are
    // parsing (see the iterator.source() method).
    parser.n_structural_indexes = find_next_document_index(parser);
    // We store the initial n_structural_indexes so that the client can see
    // whether we used truncation. If initial_n_structural_indexes == parser.n_structural_indexes,
    // then this will query parser.structural_indexes[parser.n_structural_indexes] which is len,
    // otherwise, it will copy some prior index.
    parser.structural_indexes[parser.n_structural_indexes + 1] = parser.structural_indexes[parser.n_structural_indexes];
    // This next line is critical, do not change it unless you understand what you are
    // doing.
    parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len);
    if (simdjson_unlikely(parser.n_structural_indexes == 0u)) {
      // We tolerate an unclosed string at the very end of the stream. Indeed, users
      // often load their data in bulk without being careful and they want us to ignore
      // the trailing garbage.
      return EMPTY;
    }
  }

  checker.check_eof();
  return checker.errors();
}
|
|
|
|
} // namespace stage1
|
|
} // unnamed namespace
|
|
} // namespace lasx
|
|
} // namespace simdjson
|
|
|
|
// Clear CUSTOM_BIT_INDEXER so other implementations can set it if they need to.
|
|
#undef SIMDJSON_GENERIC_JSON_STRUCTURAL_INDEXER_CUSTOM_BIT_INDEXER
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE1_JSON_STRUCTURAL_INDEXER_H
|
|
/* end file generic/stage1/json_structural_indexer.h for lasx */
|
|
/* including generic/stage1/utf8_validator.h for lasx: #include <generic/stage1/utf8_validator.h> */
|
|
/* begin file generic/stage1/utf8_validator.h for lasx */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_UTF8_VALIDATOR_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_UTF8_VALIDATOR_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/buf_block_reader.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/utf8_lookup4_algorithm.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace lasx {
|
|
namespace {
|
|
namespace stage1 {
|
|
|
|
/**
|
|
* Validates that the string is actual UTF-8.
|
|
*/
|
|
template<class checker>
|
|
bool generic_validate_utf8(const uint8_t * input, size_t length) {
|
|
checker c{};
|
|
buf_block_reader<64> reader(input, length);
|
|
while (reader.has_full_block()) {
|
|
simd::simd8x64<uint8_t> in(reader.full_block());
|
|
c.check_next_input(in);
|
|
reader.advance();
|
|
}
|
|
uint8_t block[64]{};
|
|
reader.get_remainder(block);
|
|
simd::simd8x64<uint8_t> in(block);
|
|
c.check_next_input(in);
|
|
reader.advance();
|
|
c.check_eof();
|
|
return c.errors() == error_code::SUCCESS;
|
|
}
|
|
|
|
bool generic_validate_utf8(const char * input, size_t length) {
|
|
return generic_validate_utf8<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
|
|
}
|
|
|
|
} // namespace stage1
|
|
} // unnamed namespace
|
|
} // namespace lasx
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE1_UTF8_VALIDATOR_H
|
|
/* end file generic/stage1/utf8_validator.h for lasx */
|
|
/* end file generic/stage1/amalgamated.h for lasx */
|
|
/* including generic/stage2/amalgamated.h for lasx: #include <generic/stage2/amalgamated.h> */
|
|
/* begin file generic/stage2/amalgamated.h for lasx */
|
|
// Stuff other things depend on
|
|
/* including generic/stage2/base.h for lasx: #include <generic/stage2/base.h> */
|
|
/* begin file generic/stage2/base.h for lasx */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE2_BASE_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE2_BASE_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/base.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace lasx {
|
|
namespace {
|
|
namespace stage2 {
|
|
|
|
class json_iterator;
|
|
class structural_iterator;
|
|
struct tape_builder;
|
|
struct tape_writer;
|
|
|
|
} // namespace stage2
|
|
} // unnamed namespace
|
|
} // namespace lasx
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE2_BASE_H
|
|
/* end file generic/stage2/base.h for lasx */
|
|
/* including generic/stage2/tape_writer.h for lasx: #include <generic/stage2/tape_writer.h> */
|
|
/* begin file generic/stage2/tape_writer.h for lasx */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE2_TAPE_WRITER_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE2_TAPE_WRITER_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/internal/tape_type.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
#include <cstring>
|
|
|
|
namespace simdjson {
|
|
namespace lasx {
|
|
namespace {
|
|
namespace stage2 {
|
|
|
|
struct tape_writer {
|
|
/** The next place to write to tape */
|
|
uint64_t *next_tape_loc;
|
|
|
|
/** Write a signed 64-bit value to tape. */
|
|
simdjson_inline void append_s64(int64_t value) noexcept;
|
|
|
|
/** Write an unsigned 64-bit value to tape. */
|
|
simdjson_inline void append_u64(uint64_t value) noexcept;
|
|
|
|
/** Write a double value to tape. */
|
|
simdjson_inline void append_double(double value) noexcept;
|
|
|
|
/**
|
|
* Append a tape entry (an 8-bit type,and 56 bits worth of value).
|
|
*/
|
|
simdjson_inline void append(uint64_t val, internal::tape_type t) noexcept;
|
|
|
|
/**
|
|
* Skip the current tape entry without writing.
|
|
*
|
|
* Used to skip the start of the container, since we'll come back later to fill it in when the
|
|
* container ends.
|
|
*/
|
|
simdjson_inline void skip() noexcept;
|
|
|
|
/**
|
|
* Skip the number of tape entries necessary to write a large u64 or i64.
|
|
*/
|
|
simdjson_inline void skip_large_integer() noexcept;
|
|
|
|
/**
|
|
* Skip the number of tape entries necessary to write a double.
|
|
*/
|
|
simdjson_inline void skip_double() noexcept;
|
|
|
|
/**
|
|
* Write a value to a known location on tape.
|
|
*
|
|
* Used to go back and write out the start of a container after the container ends.
|
|
*/
|
|
simdjson_inline static void write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept;
|
|
|
|
private:
|
|
/**
|
|
* Append both the tape entry, and a supplementary value following it. Used for types that need
|
|
* all 64 bits, such as double and uint64_t.
|
|
*/
|
|
template<typename T>
|
|
simdjson_inline void append2(uint64_t val, T val2, internal::tape_type t) noexcept;
|
|
}; // struct tape_writer
|
|
|
|
/** Writes an INT64 tag entry (value bits 0) followed by the 64-bit signed payload. */
simdjson_inline void tape_writer::append_s64(int64_t value) noexcept {
  constexpr uint64_t no_inline_value = 0;
  append2(no_inline_value, value, internal::tape_type::INT64);
}
|
|
|
|
/** Writes a UINT64 tag entry (value bits 0) followed by the 64-bit unsigned payload. */
simdjson_inline void tape_writer::append_u64(uint64_t value) noexcept {
  // Same two-word layout as append_s64/append_double: tag word, then the raw 64 bits.
  append2(0, value, internal::tape_type::UINT64);
}
|
|
|
|
/** Write a double value to tape. */
|
|
simdjson_inline void tape_writer::append_double(double value) noexcept {
|
|
append2(0, value, internal::tape_type::DOUBLE);
|
|
}
|
|
|
|
/** Advances the cursor by one tape entry without writing anything. */
simdjson_inline void tape_writer::skip() noexcept {
  ++next_tape_loc;
}
|
|
|
|
/** Advances the cursor by two tape entries: the tag word and the 64-bit payload word. */
simdjson_inline void tape_writer::skip_large_integer() noexcept {
  skip(); // tag entry
  skip(); // 64-bit payload
}
|
|
|
|
/** Advances the cursor by two tape entries: the tag word and the 64-bit payload word. */
simdjson_inline void tape_writer::skip_double() noexcept {
  skip(); // tag entry
  skip(); // 64-bit payload
}
|
|
|
|
/**
 * Writes one tape entry at the cursor and advances it.
 *
 * The type tag is placed in the top 8 bits of the word and OR-ed with val.
 */
simdjson_inline void tape_writer::append(uint64_t val, internal::tape_type t) noexcept {
  const uint64_t type_bits = (uint64_t(char(t))) << 56;
  *next_tape_loc = val | type_bits;
  ++next_tape_loc;
}
|
|
|
|
template<typename T>
|
|
simdjson_inline void tape_writer::append2(uint64_t val, T val2, internal::tape_type t) noexcept {
|
|
append(val, t);
|
|
static_assert(sizeof(val2) == sizeof(*next_tape_loc), "Type is not 64 bits!");
|
|
memcpy(next_tape_loc, &val2, sizeof(val2));
|
|
next_tape_loc++;
|
|
}
|
|
|
|
/**
 * Stores one tape entry into tape_loc: the type tag goes in the top 8 bits and is
 * OR-ed with val. The writer's cursor is not involved.
 */
simdjson_inline void tape_writer::write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept {
  const uint64_t type_bits = (uint64_t(char(t))) << 56;
  tape_loc = val | type_bits;
}
|
|
|
|
} // namespace stage2
|
|
} // unnamed namespace
|
|
} // namespace lasx
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE2_TAPE_WRITER_H
|
|
/* end file generic/stage2/tape_writer.h for lasx */
|
|
/* including generic/stage2/logger.h for lasx: #include <generic/stage2/logger.h> */
|
|
/* begin file generic/stage2/logger.h for lasx */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE2_LOGGER_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE2_LOGGER_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/base.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
#include <cstring>
|
|
|
|
|
|
// This is for an internal-only stage 2 specific logger.
|
|
// Build with SIMDJSON_VERBOSE_LOGGING set to a nonzero value (which makes LOG_ENABLED true) to log what stage 2 is doing!
|
|
namespace simdjson {
|
|
namespace lasx {
|
|
namespace {
|
|
namespace logger {
|
|
|
|
static constexpr const char * DASHES = "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------";
|
|
|
|
// Logging is active only when SIMDJSON_VERBOSE_LOGGING evaluates to nonzero.
#if SIMDJSON_VERBOSE_LOGGING
static constexpr bool LOG_ENABLED = true;
#else
static constexpr bool LOG_ENABLED = false;
#endif

// Column widths (in characters) of the log table.
static constexpr int LOG_EVENT_LEN = 20;        // event title column
static constexpr int LOG_BUFFER_LEN = 30;       // bytes shown from the current position
static constexpr int LOG_SMALL_BUFFER_LEN = 10; // bytes shown from the next position
static constexpr int LOG_INDEX_LEN = 5;         // structural index column

// Current indentation level of the log output. Not threadsafe. Log only.
static int log_depth;
|
|
|
|
// Helper to turn unprintable or newline characters into spaces
|
|
static simdjson_inline char printable_char(char c) {
|
|
if (c >= 0x20) {
|
|
return c;
|
|
} else {
|
|
return ' ';
|
|
}
|
|
}
|
|
|
|
// Print the header and set up log_start
|
|
static simdjson_inline void log_start() {
|
|
if (LOG_ENABLED) {
|
|
log_depth = 0;
|
|
printf("\n");
|
|
printf("| %-*s | %-*s | %-*s | %-*s | Detail |\n", LOG_EVENT_LEN, "Event", LOG_BUFFER_LEN, "Buffer", LOG_SMALL_BUFFER_LEN, "Next", 5, "Next#");
|
|
printf("|%.*s|%.*s|%.*s|%.*s|--------|\n", LOG_EVENT_LEN+2, DASHES, LOG_BUFFER_LEN+2, DASHES, LOG_SMALL_BUFFER_LEN+2, DASHES, 5+2, DASHES);
|
|
}
|
|
}
|
|
|
|
// Prints a free-form message on its own line. Does nothing unless LOG_ENABLED is true.
simdjson_unused static simdjson_inline void log_string(const char *message) {
  if (!LOG_ENABLED) { return; }
  printf("%s\n", message);
}
|
|
|
|
// Logs a single line from the stage 2 DOM parser
|
|
template<typename S>
|
|
static simdjson_inline void log_line(S &structurals, const char *title_prefix, const char *title, const char *detail) {
|
|
if (LOG_ENABLED) {
|
|
printf("| %*s%s%-*s ", log_depth*2, "", title_prefix, LOG_EVENT_LEN - log_depth*2 - int(strlen(title_prefix)), title);
|
|
auto current_index = structurals.at_beginning() ? nullptr : structurals.next_structural-1;
|
|
auto next_index = structurals.next_structural;
|
|
auto current = current_index ? &structurals.buf[*current_index] : reinterpret_cast<const uint8_t*>(" ");
|
|
auto next = &structurals.buf[*next_index];
|
|
{
|
|
// Print the next N characters in the buffer.
|
|
printf("| ");
|
|
// Otherwise, print the characters starting from the buffer position.
|
|
// Print spaces for unprintable or newline characters.
|
|
for (int i=0;i<LOG_BUFFER_LEN;i++) {
|
|
printf("%c", printable_char(current[i]));
|
|
}
|
|
printf(" ");
|
|
// Print the next N characters in the buffer.
|
|
printf("| ");
|
|
// Otherwise, print the characters starting from the buffer position.
|
|
// Print spaces for unprintable or newline characters.
|
|
for (int i=0;i<LOG_SMALL_BUFFER_LEN;i++) {
|
|
printf("%c", printable_char(next[i]));
|
|
}
|
|
printf(" ");
|
|
}
|
|
if (current_index) {
|
|
printf("| %*u ", LOG_INDEX_LEN, *current_index);
|
|
} else {
|
|
printf("| %-*s ", LOG_INDEX_LEN, "");
|
|
}
|
|
// printf("| %*u ", LOG_INDEX_LEN, structurals.next_tape_index());
|
|
printf("| %-s ", detail);
|
|
printf("|\n");
|
|
}
|
|
}
|
|
|
|
} // namespace logger
|
|
} // unnamed namespace
|
|
} // namespace lasx
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE2_LOGGER_H
|
|
/* end file generic/stage2/logger.h for lasx */
|
|
|
|
// All other declarations
|
|
/* including generic/stage2/json_iterator.h for lasx: #include <generic/stage2/json_iterator.h> */
|
|
/* begin file generic/stage2/json_iterator.h for lasx */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE2_JSON_ITERATOR_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE2_JSON_ITERATOR_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/logger.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/generic/dom_parser_implementation.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace lasx {
|
|
namespace {
|
|
namespace stage2 {
|
|
|
|
class json_iterator {
|
|
public:
|
|
const uint8_t* const buf;
|
|
uint32_t *next_structural;
|
|
dom_parser_implementation &dom_parser;
|
|
uint32_t depth{0};
|
|
|
|
/**
|
|
* Walk the JSON document.
|
|
*
|
|
* The visitor receives callbacks when values are encountered. All callbacks pass the iterator as
|
|
* the first parameter; some callbacks have other parameters as well:
|
|
*
|
|
* - visit_document_start() - at the beginning.
|
|
* - visit_document_end() - at the end (if things were successful).
|
|
*
|
|
* - visit_array_start() - at the start `[` of a non-empty array.
|
|
* - visit_array_end() - at the end `]` of a non-empty array.
|
|
* - visit_empty_array() - when an empty array is encountered.
|
|
*
|
|
* - visit_object_end() - at the start `]` of a non-empty object.
|
|
* - visit_object_start() - at the end `]` of a non-empty object.
|
|
* - visit_empty_object() - when an empty object is encountered.
|
|
* - visit_key(const uint8_t *key) - when a key in an object field is encountered. key is
|
|
* guaranteed to point at the first quote of the string (`"key"`).
|
|
* - visit_primitive(const uint8_t *value) - when a value is a string, number, boolean or null.
|
|
* - visit_root_primitive(iter, uint8_t *value) - when the top-level value is a string, number, boolean or null.
|
|
*
|
|
* - increment_count(iter) - each time a value is found in an array or object.
|
|
*/
|
|
template<bool STREAMING, typename V>
|
|
simdjson_warn_unused simdjson_inline error_code walk_document(V &visitor) noexcept;
|
|
|
|
/**
|
|
* Create an iterator capable of walking a JSON document.
|
|
*
|
|
* The document must have already passed through stage 1.
|
|
*/
|
|
simdjson_inline json_iterator(dom_parser_implementation &_dom_parser, size_t start_structural_index);
|
|
|
|
/**
|
|
* Look at the next token.
|
|
*
|
|
* Tokens can be strings, numbers, booleans, null, or operators (`[{]},:`)).
|
|
*
|
|
* They may include invalid JSON as well (such as `1.2.3` or `ture`).
|
|
*/
|
|
simdjson_inline const uint8_t *peek() const noexcept;
|
|
/**
|
|
* Advance to the next token.
|
|
*
|
|
* Tokens can be strings, numbers, booleans, null, or operators (`[{]},:`)).
|
|
*
|
|
* They may include invalid JSON as well (such as `1.2.3` or `ture`).
|
|
*/
|
|
simdjson_inline const uint8_t *advance() noexcept;
|
|
/**
|
|
* Get the remaining length of the document, from the start of the current token.
|
|
*/
|
|
simdjson_inline size_t remaining_len() const noexcept;
|
|
/**
|
|
* Check if we are at the end of the document.
|
|
*
|
|
* If this is true, there are no more tokens.
|
|
*/
|
|
simdjson_inline bool at_eof() const noexcept;
|
|
/**
|
|
* Check if we are at the beginning of the document.
|
|
*/
|
|
simdjson_inline bool at_beginning() const noexcept;
|
|
simdjson_inline uint8_t last_structural() const noexcept;
|
|
|
|
/**
|
|
* Log that a value has been found.
|
|
*
|
|
* Set LOG_ENABLED=true in logger.h to see logging.
|
|
*/
|
|
simdjson_inline void log_value(const char *type) const noexcept;
|
|
/**
|
|
* Log the start of a multipart value.
|
|
*
|
|
* Set LOG_ENABLED=true in logger.h to see logging.
|
|
*/
|
|
simdjson_inline void log_start_value(const char *type) const noexcept;
|
|
/**
|
|
* Log the end of a multipart value.
|
|
*
|
|
* Set LOG_ENABLED=true in logger.h to see logging.
|
|
*/
|
|
simdjson_inline void log_end_value(const char *type) const noexcept;
|
|
/**
|
|
* Log an error.
|
|
*
|
|
* Set LOG_ENABLED=true in logger.h to see logging.
|
|
*/
|
|
simdjson_inline void log_error(const char *error) const noexcept;
|
|
|
|
template<typename V>
|
|
simdjson_warn_unused simdjson_inline error_code visit_root_primitive(V &visitor, const uint8_t *value) noexcept;
|
|
template<typename V>
|
|
simdjson_warn_unused simdjson_inline error_code visit_primitive(V &visitor, const uint8_t *value) noexcept;
|
|
};
|
|
|
|
template<bool STREAMING, typename V>
|
|
simdjson_warn_unused simdjson_inline error_code json_iterator::walk_document(V &visitor) noexcept {
|
|
logger::log_start();
|
|
|
|
//
|
|
// Start the document
|
|
//
|
|
if (at_eof()) { return EMPTY; }
|
|
log_start_value("document");
|
|
SIMDJSON_TRY( visitor.visit_document_start(*this) );
|
|
|
|
//
|
|
// Read first value
|
|
//
|
|
{
|
|
auto value = advance();
|
|
|
|
// Make sure the outer object or array is closed before continuing; otherwise, there are ways we
|
|
// could get into memory corruption. See https://github.com/simdjson/simdjson/issues/906
|
|
if (!STREAMING) {
|
|
switch (*value) {
|
|
case '{': if (last_structural() != '}') { log_value("starting brace unmatched"); return TAPE_ERROR; }; break;
|
|
case '[': if (last_structural() != ']') { log_value("starting bracket unmatched"); return TAPE_ERROR; }; break;
|
|
}
|
|
}
|
|
|
|
switch (*value) {
|
|
case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin;
|
|
case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin;
|
|
default: SIMDJSON_TRY( visitor.visit_root_primitive(*this, value) ); break;
|
|
}
|
|
}
|
|
goto document_end;
|
|
|
|
//
|
|
// Object parser states
|
|
//
|
|
object_begin:
|
|
log_start_value("object");
|
|
depth++;
|
|
if (depth >= dom_parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; }
|
|
dom_parser.is_array[depth] = false;
|
|
SIMDJSON_TRY( visitor.visit_object_start(*this) );
|
|
|
|
{
|
|
auto key = advance();
|
|
if (*key != '"') { log_error("Object does not start with a key"); return TAPE_ERROR; }
|
|
SIMDJSON_TRY( visitor.increment_count(*this) );
|
|
SIMDJSON_TRY( visitor.visit_key(*this, key) );
|
|
}
|
|
|
|
object_field:
|
|
if (simdjson_unlikely( *advance() != ':' )) { log_error("Missing colon after key in object"); return TAPE_ERROR; }
|
|
{
|
|
auto value = advance();
|
|
switch (*value) {
|
|
case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin;
|
|
case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin;
|
|
default: SIMDJSON_TRY( visitor.visit_primitive(*this, value) ); break;
|
|
}
|
|
}
|
|
|
|
object_continue:
|
|
switch (*advance()) {
|
|
case ',':
|
|
SIMDJSON_TRY( visitor.increment_count(*this) );
|
|
{
|
|
auto key = advance();
|
|
if (simdjson_unlikely( *key != '"' )) { log_error("Key string missing at beginning of field in object"); return TAPE_ERROR; }
|
|
SIMDJSON_TRY( visitor.visit_key(*this, key) );
|
|
}
|
|
goto object_field;
|
|
case '}': log_end_value("object"); SIMDJSON_TRY( visitor.visit_object_end(*this) ); goto scope_end;
|
|
default: log_error("No comma between object fields"); return TAPE_ERROR;
|
|
}
|
|
|
|
scope_end:
|
|
depth--;
|
|
if (depth == 0) { goto document_end; }
|
|
if (dom_parser.is_array[depth]) { goto array_continue; }
|
|
goto object_continue;
|
|
|
|
//
|
|
// Array parser states
|
|
//
|
|
array_begin:
|
|
log_start_value("array");
|
|
depth++;
|
|
if (depth >= dom_parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; }
|
|
dom_parser.is_array[depth] = true;
|
|
SIMDJSON_TRY( visitor.visit_array_start(*this) );
|
|
SIMDJSON_TRY( visitor.increment_count(*this) );
|
|
|
|
array_value:
|
|
{
|
|
auto value = advance();
|
|
switch (*value) {
|
|
case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin;
|
|
case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin;
|
|
default: SIMDJSON_TRY( visitor.visit_primitive(*this, value) ); break;
|
|
}
|
|
}
|
|
|
|
array_continue:
|
|
switch (*advance()) {
|
|
case ',': SIMDJSON_TRY( visitor.increment_count(*this) ); goto array_value;
|
|
case ']': log_end_value("array"); SIMDJSON_TRY( visitor.visit_array_end(*this) ); goto scope_end;
|
|
default: log_error("Missing comma between array values"); return TAPE_ERROR;
|
|
}
|
|
|
|
document_end:
|
|
log_end_value("document");
|
|
SIMDJSON_TRY( visitor.visit_document_end(*this) );
|
|
|
|
dom_parser.next_structural_index = uint32_t(next_structural - &dom_parser.structural_indexes[0]);
|
|
|
|
// If we didn't make it to the end, it's an error
|
|
if ( !STREAMING && dom_parser.next_structural_index != dom_parser.n_structural_indexes ) {
|
|
log_error("More than one JSON value at the root of the document, or extra characters at the end of the JSON!");
|
|
return TAPE_ERROR;
|
|
}
|
|
|
|
return SUCCESS;
|
|
|
|
} // walk_document()
|
|
|
|
// Binds the iterator to the parser's buffer and positions it on the structural index
// numbered start_structural_index.
simdjson_inline json_iterator::json_iterator(dom_parser_implementation &_dom_parser, size_t start_structural_index)
  : buf(_dom_parser.buf),
    next_structural(&_dom_parser.structural_indexes[start_structural_index]),
    dom_parser(_dom_parser)
{}
|
|
|
|
// Returns the address of the next token in buf without moving the iterator.
simdjson_inline const uint8_t *json_iterator::peek() const noexcept {
  const uint32_t offset = *next_structural;
  return &buf[offset];
}
|
|
// Returns the address of the next token in buf and moves on to the following
// structural index.
simdjson_inline const uint8_t *json_iterator::advance() noexcept {
  const uint32_t offset = *next_structural;
  ++next_structural;
  return &buf[offset];
}
|
|
// Number of bytes from the most recently consumed token to the end of the document
// (dom_parser.len minus that token's offset).
simdjson_inline size_t json_iterator::remaining_len() const noexcept {
  const uint32_t current_offset = next_structural[-1];
  return dom_parser.len - current_offset;
}
|
|
|
|
// True when the iterator sits one past the last structural index.
simdjson_inline bool json_iterator::at_eof() const noexcept {
  const uint32_t *end_of_structurals = &dom_parser.structural_indexes[dom_parser.n_structural_indexes];
  return next_structural == end_of_structurals;
}
|
|
// True when no structural index has been consumed yet, i.e. the iterator still sits
// on the first entry of the parser's structural index array.
simdjson_inline bool json_iterator::at_beginning() const noexcept {
  const uint32_t *first_structural = dom_parser.structural_indexes.get();
  return next_structural == first_structural;
}
|
|
// Returns the character in buf at the last recorded structural index.
simdjson_inline uint8_t json_iterator::last_structural() const noexcept {
  const auto last_slot = dom_parser.n_structural_indexes - 1;
  const uint32_t offset = dom_parser.structural_indexes[last_slot];
  return buf[offset];
}
|
|
|
|
// Logs a row for a complete value: no title prefix, no detail text.
simdjson_inline void json_iterator::log_value(const char *type) const noexcept {
  const char *no_prefix = "";
  const char *no_detail = "";
  logger::log_line(*this, no_prefix, type, no_detail);
}
|
|
|
|
// Logs a "+" row for the start of a multipart value, then indents the rows that follow.
simdjson_inline void json_iterator::log_start_value(const char *type) const noexcept {
  logger::log_line(*this, "+", type, "");
  if (logger::LOG_ENABLED) {
    ++logger::log_depth;
  }
}
|
|
|
|
// Undoes one level of indentation, then logs a "-" row for the end of a multipart value.
simdjson_inline void json_iterator::log_end_value(const char *type) const noexcept {
  if (logger::LOG_ENABLED) {
    --logger::log_depth;
  }
  logger::log_line(*this, "-", type, "");
}
|
|
|
|
// Logs an "ERROR" row with the given message in the detail column.
simdjson_inline void json_iterator::log_error(const char *error) const noexcept {
  const char *no_prefix = "";
  logger::log_line(*this, no_prefix, "ERROR", error);
}
|
|
|
|
template<typename V>
|
|
simdjson_warn_unused simdjson_inline error_code json_iterator::visit_root_primitive(V &visitor, const uint8_t *value) noexcept {
|
|
switch (*value) {
|
|
case '"': return visitor.visit_root_string(*this, value);
|
|
case 't': return visitor.visit_root_true_atom(*this, value);
|
|
case 'f': return visitor.visit_root_false_atom(*this, value);
|
|
case 'n': return visitor.visit_root_null_atom(*this, value);
|
|
case '-':
|
|
case '0': case '1': case '2': case '3': case '4':
|
|
case '5': case '6': case '7': case '8': case '9':
|
|
return visitor.visit_root_number(*this, value);
|
|
default:
|
|
log_error("Document starts with a non-value character");
|
|
return TAPE_ERROR;
|
|
}
|
|
}
|
|
template<typename V>
|
|
simdjson_warn_unused simdjson_inline error_code json_iterator::visit_primitive(V &visitor, const uint8_t *value) noexcept {
|
|
// Use the fact that most scalars are going to be either strings or numbers.
|
|
if(*value == '"') {
|
|
return visitor.visit_string(*this, value);
|
|
} else if (((*value - '0') < 10) || (*value == '-')) {
|
|
return visitor.visit_number(*this, value);
|
|
}
|
|
// true, false, null are uncommon.
|
|
switch (*value) {
|
|
case 't': return visitor.visit_true_atom(*this, value);
|
|
case 'f': return visitor.visit_false_atom(*this, value);
|
|
case 'n': return visitor.visit_null_atom(*this, value);
|
|
default:
|
|
log_error("Non-value found when value was expected!");
|
|
return TAPE_ERROR;
|
|
}
|
|
}
|
|
|
|
} // namespace stage2
|
|
} // unnamed namespace
|
|
} // namespace lasx
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE2_JSON_ITERATOR_H
|
|
/* end file generic/stage2/json_iterator.h for lasx */
|
|
/* including generic/stage2/stringparsing.h for lasx: #include <generic/stage2/stringparsing.h> */
|
|
/* begin file generic/stage2/stringparsing.h for lasx */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE2_STRINGPARSING_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE2_STRINGPARSING_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/generic/jsoncharutils.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
// This file contains the common code every implementation uses
|
|
// It is intended to be included multiple times and compiled multiple times
|
|
|
|
namespace simdjson {
|
|
namespace lasx {
|
|
namespace {
|
|
/// @private
|
|
namespace stringparsing {
|
|
|
|
// begin copypasta
|
|
// These chars yield themselves: " \ /
|
|
// b -> backspace, f -> formfeed, n -> newline, r -> cr, t -> horizontal tab
|
|
// u not handled in this table as it's complex
|
|
static const uint8_t escape_map[256] = {
    // 0x00 - 0x1f: nothing
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    // 0x20 - 0x2f: '"' (0x22) and '/' (0x2f) yield themselves
    0, 0, 0x22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x2f,
    // 0x30 - 0x4f: nothing
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    // 0x50 - 0x5f: '\' (0x5c) yields itself
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x5c, 0, 0, 0,
    // 0x60 - 0x6f: b -> 0x08, f -> 0x0c, n -> 0x0a
    0, 0, 0x08, 0, 0, 0, 0x0c, 0, 0, 0, 0, 0, 0, 0, 0x0a, 0,
    // 0x70 - 0x7f: r -> 0x0d, t -> 0x09
    0, 0, 0x0d, 0, 0x09, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    // 0x80 - 0xff: nothing
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
|
|
|
|
// handle a unicode codepoint
|
|
// write appropriate values into dest
|
|
// src will advance 6 bytes or 12 bytes
|
|
// dest will advance a variable amount (return via pointer)
|
|
// return true if the unicode codepoint was valid
|
|
// We work in little-endian then swap at write time
|
|
simdjson_warn_unused
|
|
simdjson_inline bool handle_unicode_codepoint(const uint8_t **src_ptr,
|
|
uint8_t **dst_ptr, bool allow_replacement) {
|
|
// Use the default Unicode Character 'REPLACEMENT CHARACTER' (U+FFFD)
|
|
constexpr uint32_t substitution_code_point = 0xfffd;
|
|
// jsoncharutils::hex_to_u32_nocheck fills high 16 bits of the return value with 1s if the
|
|
// conversion is not valid; we defer the check for this to inside the
|
|
// multilingual plane check.
|
|
uint32_t code_point = jsoncharutils::hex_to_u32_nocheck(*src_ptr + 2);
|
|
*src_ptr += 6;
|
|
|
|
// If we found a high surrogate, we must
|
|
// check for low surrogate for characters
|
|
// outside the Basic
|
|
// Multilingual Plane.
|
|
if (code_point >= 0xd800 && code_point < 0xdc00) {
|
|
const uint8_t *src_data = *src_ptr;
|
|
/* Compiler optimizations convert this to a single 16-bit load and compare on most platforms */
|
|
if (((src_data[0] << 8) | src_data[1]) != ((static_cast<uint8_t> ('\\') << 8) | static_cast<uint8_t> ('u'))) {
|
|
if(!allow_replacement) { return false; }
|
|
code_point = substitution_code_point;
|
|
} else {
|
|
uint32_t code_point_2 = jsoncharutils::hex_to_u32_nocheck(src_data + 2);
|
|
|
|
// We have already checked that the high surrogate is valid and
|
|
// (code_point - 0xd800) < 1024.
|
|
//
|
|
// Check that code_point_2 is in the range 0xdc00..0xdfff
|
|
// and that code_point_2 was parsed from valid hex.
|
|
uint32_t low_bit = code_point_2 - 0xdc00;
|
|
if (low_bit >> 10) {
|
|
if(!allow_replacement) { return false; }
|
|
code_point = substitution_code_point;
|
|
} else {
|
|
code_point = (((code_point - 0xd800) << 10) | low_bit) + 0x10000;
|
|
*src_ptr += 6;
|
|
}
|
|
|
|
}
|
|
} else if (code_point >= 0xdc00 && code_point <= 0xdfff) {
|
|
// If we encounter a low surrogate (not preceded by a high surrogate)
|
|
// then we have an error.
|
|
if(!allow_replacement) { return false; }
|
|
code_point = substitution_code_point;
|
|
}
|
|
size_t offset = jsoncharutils::codepoint_to_utf8(code_point, *dst_ptr);
|
|
*dst_ptr += offset;
|
|
return offset > 0;
|
|
}
|
|
|
|
|
|
// handle a unicode codepoint using the wobbly convention
|
|
// https://simonsapin.github.io/wtf-8/
|
|
// write appropriate values into dest
|
|
// src will advance 6 bytes or 12 bytes
|
|
// dest will advance a variable amount (return via pointer)
|
|
// return true if the unicode codepoint was valid
|
|
// We work in little-endian then swap at write time
|
|
simdjson_warn_unused
|
|
simdjson_inline bool handle_unicode_codepoint_wobbly(const uint8_t **src_ptr,
|
|
uint8_t **dst_ptr) {
|
|
// It is not ideal that this function is nearly identical to handle_unicode_codepoint.
|
|
//
|
|
// jsoncharutils::hex_to_u32_nocheck fills high 16 bits of the return value with 1s if the
|
|
// conversion is not valid; we defer the check for this to inside the
|
|
// multilingual plane check.
|
|
uint32_t code_point = jsoncharutils::hex_to_u32_nocheck(*src_ptr + 2);
|
|
*src_ptr += 6;
|
|
// If we found a high surrogate, we must
|
|
// check for low surrogate for characters
|
|
// outside the Basic
|
|
// Multilingual Plane.
|
|
if (code_point >= 0xd800 && code_point < 0xdc00) {
|
|
const uint8_t *src_data = *src_ptr;
|
|
/* Compiler optimizations convert this to a single 16-bit load and compare on most platforms */
|
|
if (((src_data[0] << 8) | src_data[1]) == ((static_cast<uint8_t> ('\\') << 8) | static_cast<uint8_t> ('u'))) {
|
|
uint32_t code_point_2 = jsoncharutils::hex_to_u32_nocheck(src_data + 2);
|
|
uint32_t low_bit = code_point_2 - 0xdc00;
|
|
if ((low_bit >> 10) == 0) {
|
|
code_point =
|
|
(((code_point - 0xd800) << 10) | low_bit) + 0x10000;
|
|
*src_ptr += 6;
|
|
}
|
|
}
|
|
}
|
|
|
|
size_t offset = jsoncharutils::codepoint_to_utf8(code_point, *dst_ptr);
|
|
*dst_ptr += offset;
|
|
return offset > 0;
|
|
}
|
|
|
|
|
|
/**
|
|
* Unescape a valid UTF-8 string from src to dst, stopping at a final unescaped quote. There
|
|
* must be an unescaped quote terminating the string. It returns the final output
|
|
* position as pointer. In case of error (e.g., the string has bad escaped codes),
|
|
* then null_nullptrptr is returned. It is assumed that the output buffer is large
|
|
* enough. E.g., if src points at 'joe"', then dst needs to have four free bytes +
|
|
* SIMDJSON_PADDING bytes.
|
|
*/
|
|
simdjson_warn_unused simdjson_inline uint8_t *parse_string(const uint8_t *src, uint8_t *dst, bool allow_replacement) {
|
|
while (1) {
|
|
// Copy the next n bytes, and find the backslash and quote in them.
|
|
auto bs_quote = backslash_and_quote::copy_and_find(src, dst);
|
|
// If the next thing is the end quote, copy and return
|
|
if (bs_quote.has_quote_first()) {
|
|
// we encountered quotes first. Move dst to point to quotes and exit
|
|
return dst + bs_quote.quote_index();
|
|
}
|
|
if (bs_quote.has_backslash()) {
|
|
/* find out where the backspace is */
|
|
auto bs_dist = bs_quote.backslash_index();
|
|
uint8_t escape_char = src[bs_dist + 1];
|
|
/* we encountered backslash first. Handle backslash */
|
|
if (escape_char == 'u') {
|
|
/* move src/dst up to the start; they will be further adjusted
|
|
within the unicode codepoint handling code. */
|
|
src += bs_dist;
|
|
dst += bs_dist;
|
|
if (!handle_unicode_codepoint(&src, &dst, allow_replacement)) {
|
|
return nullptr;
|
|
}
|
|
} else {
|
|
/* simple 1:1 conversion. Will eat bs_dist+2 characters in input and
|
|
* write bs_dist+1 characters to output
|
|
* note this may reach beyond the part of the buffer we've actually
|
|
* seen. I think this is ok */
|
|
uint8_t escape_result = escape_map[escape_char];
|
|
if (escape_result == 0u) {
|
|
return nullptr; /* bogus escape value is an error */
|
|
}
|
|
dst[bs_dist] = escape_result;
|
|
src += bs_dist + 2;
|
|
dst += bs_dist + 1;
|
|
}
|
|
} else {
|
|
/* they are the same. Since they can't co-occur, it means we
|
|
* encountered neither. */
|
|
src += backslash_and_quote::BYTES_PROCESSED;
|
|
dst += backslash_and_quote::BYTES_PROCESSED;
|
|
}
|
|
}
|
|
}
|
|
|
|
/**
 * Unescape a string from src to dst, stopping at the final unescaped quote, decoding
 * \u escapes with handle_unicode_codepoint_wobbly(). Returns a pointer to the final
 * output position, or nullptr if an escape sequence is invalid.
 *
 * It is not ideal that this function is nearly identical to parse_string.
 */
simdjson_warn_unused simdjson_inline uint8_t *parse_wobbly_string(const uint8_t *src, uint8_t *dst) {
  for (;;) {
    // Copy the next n bytes, and find the backslash and quote in them.
    auto scan = backslash_and_quote::copy_and_find(src, dst);

    // Closing quote comes first: the string ends here, point dst at the quote.
    if (scan.has_quote_first()) {
      return dst + scan.quote_index();
    }

    // Neither a quote nor a backslash in this chunk: keep everything and move on.
    if (!scan.has_backslash()) {
      src += backslash_and_quote::BYTES_PROCESSED;
      dst += backslash_and_quote::BYTES_PROCESSED;
      continue;
    }

    // A backslash comes first; find out where it is and what it escapes.
    auto bs_dist = scan.backslash_index();
    const uint8_t escape_char = src[bs_dist + 1];

    if (escape_char == 'u') {
      /* move src/dst up to the start; they will be further adjusted
         within the unicode codepoint handling code. */
      src += bs_dist;
      dst += bs_dist;
      if (!handle_unicode_codepoint_wobbly(&src, &dst)) {
        return nullptr;
      }
      continue;
    }

    /* simple 1:1 conversion. Will eat bs_dist+2 characters in input and
     * write bs_dist+1 characters to output
     * note this may reach beyond the part of the buffer we've actually
     * seen. I think this is ok */
    const uint8_t escape_result = escape_map[escape_char];
    if (escape_result == 0u) {
      return nullptr; /* bogus escape value is an error */
    }
    dst[bs_dist] = escape_result;
    src += bs_dist + 2;
    dst += bs_dist + 1;
  }
}
|
|
|
|
} // namespace stringparsing
|
|
} // unnamed namespace
|
|
} // namespace lasx
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE2_STRINGPARSING_H
|
|
/* end file generic/stage2/stringparsing.h for lasx */
|
|
/* including generic/stage2/structural_iterator.h for lasx: #include <generic/stage2/structural_iterator.h> */
|
|
/* begin file generic/stage2/structural_iterator.h for lasx */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE2_STRUCTURAL_ITERATOR_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE2_STRUCTURAL_ITERATOR_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/generic/dom_parser_implementation.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace lasx {
|
|
namespace {
|
|
namespace stage2 {
|
|
|
|
class structural_iterator {
|
|
public:
|
|
const uint8_t* const buf;
|
|
uint32_t *next_structural;
|
|
dom_parser_implementation &dom_parser;
|
|
|
|
// Start a structural
|
|
simdjson_inline structural_iterator(dom_parser_implementation &_dom_parser, size_t start_structural_index)
|
|
: buf{_dom_parser.buf},
|
|
next_structural{&_dom_parser.structural_indexes[start_structural_index]},
|
|
dom_parser{_dom_parser} {
|
|
}
|
|
// Get the buffer position of the current structural character
|
|
simdjson_inline const uint8_t* current() {
|
|
return &buf[*(next_structural-1)];
|
|
}
|
|
// Get the current structural character
|
|
simdjson_inline char current_char() {
|
|
return buf[*(next_structural-1)];
|
|
}
|
|
// Get the next structural character without advancing
|
|
simdjson_inline char peek_next_char() {
|
|
return buf[*next_structural];
|
|
}
|
|
simdjson_inline const uint8_t* peek() {
|
|
return &buf[*next_structural];
|
|
}
|
|
simdjson_inline const uint8_t* advance() {
|
|
return &buf[*(next_structural++)];
|
|
}
|
|
simdjson_inline char advance_char() {
|
|
return buf[*(next_structural++)];
|
|
}
|
|
simdjson_inline size_t remaining_len() {
|
|
return dom_parser.len - *(next_structural-1);
|
|
}
|
|
|
|
simdjson_inline bool at_end() {
|
|
return next_structural == &dom_parser.structural_indexes[dom_parser.n_structural_indexes];
|
|
}
|
|
simdjson_inline bool at_beginning() {
|
|
return next_structural == dom_parser.structural_indexes.get();
|
|
}
|
|
};
|
|
|
|
} // namespace stage2
|
|
} // unnamed namespace
|
|
} // namespace lasx
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE2_STRUCTURAL_ITERATOR_H
|
|
/* end file generic/stage2/structural_iterator.h for lasx */
|
|
/* including generic/stage2/tape_builder.h for lasx: #include <generic/stage2/tape_builder.h> */
|
|
/* begin file generic/stage2/tape_builder.h for lasx */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE2_TAPE_BUILDER_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE2_TAPE_BUILDER_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/json_iterator.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/stringparsing.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/tape_writer.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/dom/document.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/generic/atomparsing.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/generic/dom_parser_implementation.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/generic/numberparsing.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
|
|
namespace simdjson {
|
|
namespace lasx {
|
|
namespace {
|
|
namespace stage2 {
|
|
|
|
struct tape_builder {
|
|
template<bool STREAMING>
|
|
simdjson_warn_unused static simdjson_inline error_code parse_document(
|
|
dom_parser_implementation &dom_parser,
|
|
dom::document &doc) noexcept;
|
|
|
|
/** Called when a non-empty document starts. */
|
|
simdjson_warn_unused simdjson_inline error_code visit_document_start(json_iterator &iter) noexcept;
|
|
/** Called when a non-empty document ends without error. */
|
|
simdjson_warn_unused simdjson_inline error_code visit_document_end(json_iterator &iter) noexcept;
|
|
|
|
/** Called when a non-empty array starts. */
|
|
simdjson_warn_unused simdjson_inline error_code visit_array_start(json_iterator &iter) noexcept;
|
|
/** Called when a non-empty array ends. */
|
|
simdjson_warn_unused simdjson_inline error_code visit_array_end(json_iterator &iter) noexcept;
|
|
/** Called when an empty array is found. */
|
|
simdjson_warn_unused simdjson_inline error_code visit_empty_array(json_iterator &iter) noexcept;
|
|
|
|
/** Called when a non-empty object starts. */
|
|
simdjson_warn_unused simdjson_inline error_code visit_object_start(json_iterator &iter) noexcept;
|
|
/**
|
|
* Called when a key in a field is encountered.
|
|
*
|
|
* primitive, visit_object_start, visit_empty_object, visit_array_start, or visit_empty_array
|
|
* will be called after this with the field value.
|
|
*/
|
|
simdjson_warn_unused simdjson_inline error_code visit_key(json_iterator &iter, const uint8_t *key) noexcept;
|
|
/** Called when a non-empty object ends. */
|
|
simdjson_warn_unused simdjson_inline error_code visit_object_end(json_iterator &iter) noexcept;
|
|
/** Called when an empty object is found. */
|
|
simdjson_warn_unused simdjson_inline error_code visit_empty_object(json_iterator &iter) noexcept;
|
|
|
|
/**
|
|
* Called when a string, number, boolean or null is found.
|
|
*/
|
|
simdjson_warn_unused simdjson_inline error_code visit_primitive(json_iterator &iter, const uint8_t *value) noexcept;
|
|
/**
|
|
* Called when a string, number, boolean or null is found at the top level of a document (i.e.
|
|
* when there is no array or object and the entire document is a single string, number, boolean or
|
|
* null.
|
|
*
|
|
* This is separate from primitive() because simdjson's normal primitive parsing routines assume
|
|
* there is at least one more token after the value, which is only true in an array or object.
|
|
*/
|
|
simdjson_warn_unused simdjson_inline error_code visit_root_primitive(json_iterator &iter, const uint8_t *value) noexcept;
|
|
|
|
simdjson_warn_unused simdjson_inline error_code visit_string(json_iterator &iter, const uint8_t *value, bool key = false) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code visit_number(json_iterator &iter, const uint8_t *value) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code visit_true_atom(json_iterator &iter, const uint8_t *value) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code visit_false_atom(json_iterator &iter, const uint8_t *value) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code visit_null_atom(json_iterator &iter, const uint8_t *value) noexcept;
|
|
|
|
simdjson_warn_unused simdjson_inline error_code visit_root_string(json_iterator &iter, const uint8_t *value) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code visit_root_number(json_iterator &iter, const uint8_t *value) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code visit_root_true_atom(json_iterator &iter, const uint8_t *value) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code visit_root_false_atom(json_iterator &iter, const uint8_t *value) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code visit_root_null_atom(json_iterator &iter, const uint8_t *value) noexcept;
|
|
|
|
/** Called each time a new field or element in an array or object is found. */
|
|
simdjson_warn_unused simdjson_inline error_code increment_count(json_iterator &iter) noexcept;
|
|
|
|
/** Next location to write to tape */
|
|
tape_writer tape;
|
|
private:
|
|
/** Next write location in the string buf for stage 2 parsing */
|
|
uint8_t *current_string_buf_loc;
|
|
|
|
simdjson_inline tape_builder(dom::document &doc) noexcept;
|
|
|
|
simdjson_inline uint32_t next_tape_index(json_iterator &iter) const noexcept;
|
|
simdjson_inline void start_container(json_iterator &iter) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code end_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code empty_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept;
|
|
simdjson_inline uint8_t *on_start_string(json_iterator &iter) noexcept;
|
|
simdjson_inline void on_end_string(uint8_t *dst) noexcept;
|
|
}; // struct tape_builder
|
|
|
|
template<bool STREAMING>
|
|
simdjson_warn_unused simdjson_inline error_code tape_builder::parse_document(
|
|
dom_parser_implementation &dom_parser,
|
|
dom::document &doc) noexcept {
|
|
dom_parser.doc = &doc;
|
|
json_iterator iter(dom_parser, STREAMING ? dom_parser.next_structural_index : 0);
|
|
tape_builder builder(doc);
|
|
return iter.walk_document<STREAMING>(builder);
|
|
}
|
|
|
|
// Hands a document-level scalar back to json_iterator::visit_root_primitive, with this
// builder acting as the visitor.
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_primitive(json_iterator &iter, const uint8_t *value) noexcept {
  tape_builder &visitor = *this;
  return iter.visit_root_primitive(visitor, value);
}
|
|
// Hands a scalar back to json_iterator::visit_primitive, with this builder acting as the
// visitor.
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_primitive(json_iterator &iter, const uint8_t *value) noexcept {
  tape_builder &visitor = *this;
  return iter.visit_primitive(visitor, value);
}
|
|
// Emits the START_OBJECT / END_OBJECT tape pair for an object with no fields.
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_empty_object(json_iterator &iter) noexcept {
  const internal::tape_type open_tag = internal::tape_type::START_OBJECT;
  const internal::tape_type close_tag = internal::tape_type::END_OBJECT;
  return empty_container(iter, open_tag, close_tag);
}
|
|
// Emits the START_ARRAY / END_ARRAY tape pair for an array with no elements.
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_empty_array(json_iterator &iter) noexcept {
  const internal::tape_type open_tag = internal::tape_type::START_ARRAY;
  const internal::tape_type close_tag = internal::tape_type::END_ARRAY;
  return empty_container(iter, open_tag, close_tag);
}
|
|
|
|
// Opens the document scope via start_container(); always reports SUCCESS.
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_document_start(json_iterator &iter) noexcept {
  start_container(iter);
  return SUCCESS;
}
|
|
// Opens an object scope via start_container(); always reports SUCCESS.
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_object_start(json_iterator &iter) noexcept {
  start_container(iter);
  return SUCCESS;
}
|
|
// Opens an array scope via start_container(); always reports SUCCESS.
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_array_start(json_iterator &iter) noexcept {
  start_container(iter);
  return SUCCESS;
}
|
|
|
|
// Closes the current object scope, tagging its tape entries START_OBJECT / END_OBJECT.
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_object_end(json_iterator &iter) noexcept {
  const internal::tape_type open_tag = internal::tape_type::START_OBJECT;
  const internal::tape_type close_tag = internal::tape_type::END_OBJECT;
  return end_container(iter, open_tag, close_tag);
}
|
|
// Closes the current array scope, tagging its tape entries START_ARRAY / END_ARRAY.
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_array_end(json_iterator &iter) noexcept {
  const internal::tape_type open_tag = internal::tape_type::START_ARRAY;
  const internal::tape_type close_tag = internal::tape_type::END_ARRAY;
  return end_container(iter, open_tag, close_tag);
}
|
|
// Finishes the document: appends a closing ROOT entry that refers to tape slot 0, then
// fills slot 0 with a ROOT entry carrying the tape index reached after that append.
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_document_end(json_iterator &iter) noexcept {
  constexpr uint32_t root_slot = 0;
  tape.append(root_slot, internal::tape_type::ROOT);
  // Must be read after the append above so that it accounts for the closing entry.
  const uint32_t tape_end = next_tape_index(iter);
  tape_writer::write(iter.dom_parser.doc->tape[root_slot], tape_end, internal::tape_type::ROOT);
  return SUCCESS;
}
|
|
// Object keys are stored like any other string; only the `key` flag passed to
// visit_string differs.
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_key(json_iterator &iter, const uint8_t *key) noexcept {
  constexpr bool is_key = true;
  return visit_string(iter, key, is_key);
}
|
|
|
|
// Bumps the element/field counter of the container recorded at the iterator's current
// depth. Always reports SUCCESS.
simdjson_warn_unused simdjson_inline error_code tape_builder::increment_count(json_iterator &iter) noexcept {
  auto &container = iter.dom_parser.open_containers[iter.depth];
  container.count++;
  return SUCCESS;
}
|
|
|
|
// Starts writing at the beginning of the document's tape and string buffer.
simdjson_inline tape_builder::tape_builder(dom::document &doc) noexcept
  : tape{doc.tape.get()},
    current_string_buf_loc{doc.string_buf.get()} {
}
|
|
|
|
// Unescapes the string whose opening quote is at `value` into the string buffer and
// records it on the tape. `key` only affects the label used for logging.
// Returns STRING_ERROR when parse_string rejects the content.
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_string(json_iterator &iter, const uint8_t *value, bool key) noexcept {
  const char *label = key ? "key" : "string";
  iter.log_value(label);

  uint8_t *const text_begin = on_start_string(iter);
  // value + 1 skips the opening quote. Replacement of invalid escapes is not allowed here.
  uint8_t *const text_end = stringparsing::parse_string(value + 1, text_begin, false);
  if (text_end != nullptr) {
    on_end_string(text_end);
    return SUCCESS;
  }

  iter.log_error("Invalid escape in string");
  return STRING_ERROR;
}
|
|
|
|
// A document-level string needs no special treatment: store it as a regular (non-key)
// string.
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_string(json_iterator &iter, const uint8_t *value) noexcept {
  constexpr bool is_key = false;
  return visit_string(iter, value, is_key);
}
|
|
|
|
// Logs the value, then lets numberparsing::parse_number parse `value` and write the
// result through the tape writer.
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_number(json_iterator &iter, const uint8_t *value) noexcept {
  iter.log_value("number");
  const error_code outcome = numberparsing::parse_number(value, tape);
  return outcome;
}
|
|
|
|
// Parses a number that is the entire document.
//
// The number parser needs the digits to be followed by a terminator such as a space.
// The caller's buffer is padded up to len + SIMDJSON_PADDING, but we have no say in what
// the padding contains: a trailing NUL (C string) is common. "9\0" must be accepted when
// it is the whole input, while 9\0 inside a larger document must not be, so the remaining
// input is copied into a scratch buffer whose padding is all spaces. The copy is
// comparatively costly but only happens for documents made of a single number.
//
// Returns MEMALLOC when the scratch buffer cannot be allocated.
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_number(json_iterator &iter, const uint8_t *value) noexcept {
  const size_t remaining = iter.remaining_len();

  std::unique_ptr<uint8_t[]> scratch(new (std::nothrow) uint8_t[remaining + SIMDJSON_PADDING]);
  if (!scratch) { return MEMALLOC; }

  uint8_t *const bytes = scratch.get();
  std::memcpy(bytes, value, remaining);
  std::memset(bytes + remaining, ' ', SIMDJSON_PADDING);

  return visit_number(iter, bytes);
}
|
|
|
|
// Validates the `true` literal at `value`; on success appends a TRUE_VALUE tape entry,
// otherwise returns T_ATOM_ERROR.
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_true_atom(json_iterator &iter, const uint8_t *value) noexcept {
  iter.log_value("true");
  if (atomparsing::is_valid_true_atom(value)) {
    tape.append(0, internal::tape_type::TRUE_VALUE);
    return SUCCESS;
  }
  return T_ATOM_ERROR;
}
|
|
|
|
// Document-level variant of visit_true_atom: the validator is additionally given the
// number of bytes remaining in the input.
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_true_atom(json_iterator &iter, const uint8_t *value) noexcept {
  iter.log_value("true");
  if (atomparsing::is_valid_true_atom(value, iter.remaining_len())) {
    tape.append(0, internal::tape_type::TRUE_VALUE);
    return SUCCESS;
  }
  return T_ATOM_ERROR;
}
|
|
|
|
// Validates the `false` literal at `value`; on success appends a FALSE_VALUE tape entry,
// otherwise returns F_ATOM_ERROR.
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_false_atom(json_iterator &iter, const uint8_t *value) noexcept {
  iter.log_value("false");
  if (atomparsing::is_valid_false_atom(value)) {
    tape.append(0, internal::tape_type::FALSE_VALUE);
    return SUCCESS;
  }
  return F_ATOM_ERROR;
}
|
|
|
|
// Document-level variant of visit_false_atom: the validator is additionally given the
// number of bytes remaining in the input.
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_false_atom(json_iterator &iter, const uint8_t *value) noexcept {
  iter.log_value("false");
  if (atomparsing::is_valid_false_atom(value, iter.remaining_len())) {
    tape.append(0, internal::tape_type::FALSE_VALUE);
    return SUCCESS;
  }
  return F_ATOM_ERROR;
}
|
|
|
|
// Validates the `null` literal at `value`; on success appends a NULL_VALUE tape entry,
// otherwise returns N_ATOM_ERROR.
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_null_atom(json_iterator &iter, const uint8_t *value) noexcept {
  iter.log_value("null");
  if (atomparsing::is_valid_null_atom(value)) {
    tape.append(0, internal::tape_type::NULL_VALUE);
    return SUCCESS;
  }
  return N_ATOM_ERROR;
}
|
|
|
|
// Document-level variant of visit_null_atom: the validator is additionally given the
// number of bytes remaining in the input.
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_null_atom(json_iterator &iter, const uint8_t *value) noexcept {
  iter.log_value("null");
  if (atomparsing::is_valid_null_atom(value, iter.remaining_len())) {
    tape.append(0, internal::tape_type::NULL_VALUE);
    return SUCCESS;
  }
  return N_ATOM_ERROR;
}
|
|
|
|
// private:
|
|
|
|
// Index (in tape entries, counted from the start of the document's tape) of the slot the
// tape writer will fill next.
simdjson_inline uint32_t tape_builder::next_tape_index(json_iterator &iter) const noexcept {
  const auto entries_written = tape.next_tape_loc - iter.dom_parser.doc->tape.get();
  return uint32_t(entries_written);
}
|
|
|
|
// Writes the two tape entries of an empty array/object: the opening entry carries the
// index just past the closing entry, and the closing entry carries the index of the
// opening one. Always reports SUCCESS.
simdjson_warn_unused simdjson_inline error_code tape_builder::empty_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept {
  const uint32_t open_index = next_tape_index(iter);
  const uint32_t after_close = open_index + 2;
  tape.append(after_close, start);
  tape.append(open_index, end);
  return SUCCESS;
}
|
|
|
|
// Remembers where the container opening at the current depth starts on the tape, resets
// its element counter, and reserves the opening slot.
simdjson_inline void tape_builder::start_container(json_iterator &iter) noexcept {
  auto &container = iter.dom_parser.open_containers[iter.depth];
  container.tape_index = next_tape_index(iter);
  container.count = 0;
  // The opening entry is only filled in by end_container(); skip over its slot for now.
  tape.skip();
}
|
|
|
|
// Closes the container at the current depth: appends its closing entry (which refers back
// to the opening slot) and then back-fills the opening slot that start_container()
// reserved. Always reports SUCCESS.
simdjson_warn_unused simdjson_inline error_code tape_builder::end_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept {
  auto &container = iter.dom_parser.open_containers[iter.depth];

  // Closing entry, pointing at the opening slot.
  const uint32_t open_index = container.tape_index;
  tape.append(open_index, end);

  // Opening entry: low 32 bits hold the tape index after the closing entry, the bits
  // above hold the element count. The count field is 24 bits wide, so it saturates:
  // by convention 0xFFFFFF means "0xFFFFFF or more".
  const uint32_t element_count = container.count;
  const uint32_t saturated_count = element_count > 0xFFFFFF ? 0xFFFFFF : element_count;
  const uint64_t payload = next_tape_index(iter) | (uint64_t(saturated_count) << 32);
  tape_writer::write(iter.dom_parser.doc->tape[open_index], payload, start);
  return SUCCESS;
}
|
|
|
|
// Appends a STRING tape entry holding the current offset into the string buffer, and
// returns where the string's bytes should be written: just past the 32-bit length
// prefix that on_end_string() fills in later.
simdjson_inline uint8_t *tape_builder::on_start_string(json_iterator &iter) noexcept {
  const auto buffer_offset = current_string_buf_loc - iter.dom_parser.doc->string_buf.get();
  tape.append(buffer_offset, internal::tape_type::STRING);
  return current_string_buf_loc + sizeof(uint32_t);
}
|
|
|
|
// Completes the string that on_start_string() began. `dst` points one past the last byte
// written. Stores the 32-bit length prefix, NUL-terminates the string, and moves the
// write position past the terminator.
simdjson_inline void tape_builder::on_end_string(uint8_t *dst) noexcept {
  uint8_t *const text_begin = current_string_buf_loc + sizeof(uint32_t);
  // TODO: no overflow check for strings of 4GB or more. Not needed at present because
  // documents of 4GB or more are refused; add one only if that limit is ever lifted.
  uint32_t str_length = uint32_t(dst - text_begin);
  memcpy(current_string_buf_loc, &str_length, sizeof(uint32_t));
  // The terminator costs a byte but is convenient for callers expecting C strings.
  dst[0] = 0;
  current_string_buf_loc = dst + 1;
}
|
|
|
|
} // namespace stage2
|
|
} // unnamed namespace
|
|
} // namespace lasx
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE2_TAPE_BUILDER_H
|
|
/* end file generic/stage2/tape_builder.h for lasx */
|
|
/* end file generic/stage2/amalgamated.h for lasx */
|
|
|
|
//
|
|
// Stage 1
|
|
//
|
|
namespace simdjson {
|
|
namespace lasx {
|
|
|
|
// Allocates a lasx dom_parser_implementation into `dst` and sizes it.
// Returns MEMALLOC if the allocation fails, otherwise the first error reported by
// set_capacity() / set_max_depth(), or SUCCESS.
simdjson_warn_unused error_code implementation::create_dom_parser_implementation(
    size_t capacity,
    size_t max_depth,
    std::unique_ptr<internal::dom_parser_implementation>& dst
) const noexcept {
  dst.reset(new (std::nothrow) dom_parser_implementation());
  if (!dst) { return MEMALLOC; }

  auto capacity_error = dst->set_capacity(capacity);
  if (capacity_error) { return capacity_error; }

  auto depth_error = dst->set_max_depth(max_depth);
  if (depth_error) { return depth_error; }

  return SUCCESS;
}
|
|
|
|
namespace {
|
|
|
|
using namespace simd;
|
|
|
|
// Classifies 64 input bytes, returning { whitespace mask, operator mask }. Each mask is
// produced by looking every byte up in a small table and comparing the table result with
// the byte itself.
//
// Approach borrowed from the haswell kernel. Per the original notes, LASX shuffles index
// with the low 5 bits; for the six operators the distinguishing 5 bits are bits [6:2]
// of the character, hence the shr<2>() before the operator lookup.
//
//   char  hex  5-bit index        char  hex  5-bit index
//   ' '   20   00000              ','   2C   01011
//   '\t'  09   01001              ':'   3A   01110
//   '\n'  0A   01010              '['   5B   10110
//   '\r'  0D   01101              ']'   5D   10111
//                                 '{'   7B   11110
//                                 '}'   7D   11111
simdjson_inline json_character_block json_character_block::classify(const simd::simd8x64<uint8_t>& in) {
  const simd8<uint8_t> ws_table = simd8<uint8_t>::repeat_16(
    ' ', 0, 0, 0,   0, 0, 0, 0,   0, '\t', '\n', 0,   0, '\r', 0, 0
  );
  // Operator table entries 0..15.
  const simd8<uint8_t> op_table_lo = simd8<uint8_t>::repeat_16(
    1, 0, 0, 0,   0, 0, 0, 0,   0, 0, 0, ',',   0, 0, ':', 0
  );
  // Operator table entries 16..31.
  const simd8<uint8_t> op_table_hi = simd8<uint8_t>::repeat_16(
    0, 0, 0, 0,   0, 0, '[', ']',   0, 0, 0, 0,   0, 0, '{', '}'
  );

  const uint64_t ws = in.eq({
    in.chunks[0].lookup_16(ws_table),
    in.chunks[1].lookup_16(ws_table)
  });
  const uint64_t op = in.eq({
    __lasx_xvshuf_b(op_table_hi, op_table_lo, in.chunks[0].shr<2>()),
    __lasx_xvshuf_b(op_table_hi, op_table_lo, in.chunks[1].shr<2>())
  });

  return { ws, op };
}
|
|
|
|
// OR-reduces the 64-byte block to a single vector and asks that vector whether it is
// ASCII.
simdjson_inline bool is_ascii(const simd8x64<uint8_t>& input) {
  auto combined = input.reduce_or();
  return combined.is_ascii();
}
|
|
|
|
// Combines two saturating subtractions so that a lane ends up >= 0x80 only when
// `prev2` has the form 111_____ or `prev3` has the form 1111____.
simdjson_inline simd8<uint8_t> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
  // After subtracting 0xe0 - 0x80, only bytes of the form 111_____ remain >= 0x80.
  simd8<uint8_t> after_3byte_lead = prev2.saturating_sub(0xe0u-0x80);
  // After subtracting 0xf0 - 0x80, only bytes of the form 1111____ remain >= 0x80.
  simd8<uint8_t> after_4byte_lead = prev3.saturating_sub(0xf0u-0x80);
  return after_3byte_lead | after_4byte_lead;
}
|
|
|
|
} // unnamed namespace
|
|
} // namespace lasx
|
|
} // namespace simdjson
|
|
|
|
//
|
|
// Stage 2
|
|
//
|
|
|
|
//
|
|
// Implementation-specific overrides
|
|
//
|
|
namespace simdjson {
|
|
namespace lasx {
|
|
|
|
// lasx entry point for minification: delegates to the generic stage 1 minifier
// instantiated with a template argument of 64.
simdjson_warn_unused error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept {
  const error_code outcome = lasx::stage1::json_minifier::minify<64>(buf, len, dst, dst_len);
  return outcome;
}
|
|
|
|
// Stage 1: remembers the input buffer and its length on the parser, then runs the
// generic structural indexer (template argument 64) over it.
simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, stage1_mode streaming) noexcept {
  buf = _buf;
  len = _len;
  return lasx::stage1::json_structural_indexer::index<64>(this->buf, this->len, *this, streaming);
}
|
|
|
|
// lasx entry point for UTF-8 validation: delegates to the generic stage 1 validator.
simdjson_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
  const bool is_valid = lasx::stage1::generic_validate_utf8(buf,len);
  return is_valid;
}
|
|
|
|
// Stage 2 for a single document: builds the tape with STREAMING = false.
simdjson_warn_unused error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept {
  constexpr bool streaming = false;
  return stage2::tape_builder::parse_document<streaming>(*this, _doc);
}
|
|
|
|
// Stage 2 for the next document of a stream: builds the tape with STREAMING = true.
simdjson_warn_unused error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept {
  constexpr bool streaming = true;
  return stage2::tape_builder::parse_document<streaming>(*this, _doc);
}
|
|
|
|
// Member-function access to the lasx string unescaper; all arguments are passed through
// unchanged.
simdjson_warn_unused uint8_t *dom_parser_implementation::parse_string(const uint8_t *src, uint8_t *dst, bool allow_replacement) const noexcept {
  uint8_t *const end_of_string = lasx::stringparsing::parse_string(src, dst, allow_replacement);
  return end_of_string;
}
|
|
|
|
// Member-function access to the lasx "wobbly" string unescaper; all arguments are passed
// through unchanged.
simdjson_warn_unused uint8_t *dom_parser_implementation::parse_wobbly_string(const uint8_t *src, uint8_t *dst) const noexcept {
  uint8_t *const end_of_string = lasx::stringparsing::parse_wobbly_string(src, dst);
  return end_of_string;
}
|
|
|
|
// Full parse of one document: stage 1 in regular mode, then stage 2 if and only if
// stage 1 reported no error.
simdjson_warn_unused error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept {
  const error_code stage1_error = stage1(_buf, _len, stage1_mode::regular);
  return stage1_error ? stage1_error : stage2(_doc);
}
|
|
|
|
} // namespace lasx
|
|
} // namespace simdjson
|
|
|
|
/* including simdjson/lasx/end.h: #include <simdjson/lasx/end.h> */
|
|
/* begin file simdjson/lasx/end.h */
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/lasx/base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
#undef SIMDJSON_SKIP_BACKSLASH_SHORT_CIRCUIT
|
|
/* undefining SIMDJSON_IMPLEMENTATION from "lasx" */
|
|
#undef SIMDJSON_IMPLEMENTATION
|
|
/* end file simdjson/lasx/end.h */
|
|
|
|
#endif // SIMDJSON_SRC_LASX_CPP
|
|
/* end file lasx.cpp */
|
|
#endif
|
|
#if SIMDJSON_IMPLEMENTATION_FALLBACK
|
|
/* including fallback.cpp: #include <fallback.cpp> */
|
|
/* begin file fallback.cpp */
|
|
#ifndef SIMDJSON_SRC_FALLBACK_CPP
|
|
#define SIMDJSON_SRC_FALLBACK_CPP
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include <base.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
/* including simdjson/fallback.h: #include <simdjson/fallback.h> */
|
|
/* begin file simdjson/fallback.h */
|
|
#ifndef SIMDJSON_FALLBACK_H
|
|
#define SIMDJSON_FALLBACK_H
|
|
|
|
/* including simdjson/fallback/begin.h: #include "simdjson/fallback/begin.h" */
|
|
/* begin file simdjson/fallback/begin.h */
|
|
/* defining SIMDJSON_IMPLEMENTATION to "fallback" */
|
|
#define SIMDJSON_IMPLEMENTATION fallback
|
|
/* including simdjson/fallback/base.h: #include "simdjson/fallback/base.h" */
|
|
/* begin file simdjson/fallback/base.h */
|
|
#ifndef SIMDJSON_FALLBACK_BASE_H
|
|
#define SIMDJSON_FALLBACK_BASE_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
/**
|
|
* Fallback implementation (runs on any machine).
|
|
*/
|
|
namespace fallback {
|
|
|
|
class implementation;
|
|
|
|
} // namespace fallback
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_FALLBACK_BASE_H
|
|
/* end file simdjson/fallback/base.h */
|
|
/* including simdjson/fallback/bitmanipulation.h: #include "simdjson/fallback/bitmanipulation.h" */
|
|
/* begin file simdjson/fallback/bitmanipulation.h */
|
|
#ifndef SIMDJSON_FALLBACK_BITMANIPULATION_H
|
|
#define SIMDJSON_FALLBACK_BITMANIPULATION_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/fallback/base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace fallback {
|
|
namespace {
|
|
|
|
#if defined(_MSC_VER) && !defined(_M_ARM64) && !defined(_M_X64)
|
|
// Emulation of the 64-bit _BitScanForward64 intrinsic on top of the 32-bit
// _BitScanForward (the surrounding #if enables it for MSVC targets other than x64/ARM64).
//
// Stores in *ret the index of the lowest set bit of x and returns 1. When x is zero,
// returns 0 and stores 0 in *ret.
//
// Only the half that actually contains the lowest set bit is scanned, so the 32-bit
// intrinsic is never called on a zero mask and no uninitialized local is ever read.
static inline unsigned char _BitScanForward64(unsigned long* ret, uint64_t x) {
  const unsigned long low_half = (unsigned long)x;
  const unsigned long high_half = (unsigned long)(x >> 32);
  unsigned long index = 0;
  if (low_half != 0) {
    _BitScanForward(&index, low_half);
    *ret = index;
    return 1;
  }
  if (high_half != 0) {
    _BitScanForward(&index, high_half);
    *ret = 32 + index;
    return 1;
  }
  *ret = 0; // no bit set: a deterministic value instead of an indeterminate one
  return 0;
}
|
|
// Emulation of the 64-bit _BitScanReverse64 intrinsic on top of the 32-bit
// _BitScanReverse (the surrounding #if enables it for MSVC targets other than x64/ARM64).
//
// Stores in *ret the index of the highest set bit of x and returns 1. When x is zero,
// returns 0 and stores 0 in *ret.
//
// Only the half that actually contains the highest set bit is scanned, so the 32-bit
// intrinsic is never called on a zero mask and no uninitialized local is ever read.
// Declared `inline` like the forward-scan emulation next to it.
static inline unsigned char _BitScanReverse64(unsigned long* ret, uint64_t x) {
  const unsigned long high_half = (unsigned long)(x >> 32);
  const unsigned long low_half = (unsigned long)x;
  unsigned long index = 0;
  if (high_half != 0) {
    _BitScanReverse(&index, high_half);
    *ret = index + 32;
    return 1;
  }
  if (low_half != 0) {
    _BitScanReverse(&index, low_half);
    *ret = index;
    return 1;
  }
  *ret = 0; // no bit set: a deterministic value instead of an indeterminate one
  return 0;
}
|
|
#endif
|
|
|
|
/* result might be undefined when input_num is zero */
|
|
simdjson_inline int leading_zeroes(uint64_t input_num) {
|
|
#ifdef _MSC_VER
|
|
unsigned long leading_zero = 0;
|
|
// Search the mask data from most significant bit (MSB)
|
|
// to least significant bit (LSB) for a set bit (1).
|
|
if (_BitScanReverse64(&leading_zero, input_num))
|
|
return (int)(63 - leading_zero);
|
|
else
|
|
return 64;
|
|
#else
|
|
return __builtin_clzll(input_num);
|
|
#endif// _MSC_VER
|
|
}
|
|
|
|
} // unnamed namespace
|
|
} // namespace fallback
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_FALLBACK_BITMANIPULATION_H
|
|
/* end file simdjson/fallback/bitmanipulation.h */
|
|
/* including simdjson/fallback/stringparsing_defs.h: #include "simdjson/fallback/stringparsing_defs.h" */
|
|
/* begin file simdjson/fallback/stringparsing_defs.h */
|
|
#ifndef SIMDJSON_FALLBACK_STRINGPARSING_DEFS_H
|
|
#define SIMDJSON_FALLBACK_STRINGPARSING_DEFS_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/fallback/base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace fallback {
|
|
namespace {
|
|
|
|
// Holds backslashes and quotes locations.
|
|
struct backslash_and_quote {
|
|
public:
|
|
static constexpr uint32_t BYTES_PROCESSED = 1;
|
|
simdjson_inline static backslash_and_quote copy_and_find(const uint8_t *src, uint8_t *dst);
|
|
|
|
simdjson_inline bool has_quote_first() { return c == '"'; }
|
|
simdjson_inline bool has_backslash() { return c == '\\'; }
|
|
simdjson_inline int quote_index() { return c == '"' ? 0 : 1; }
|
|
simdjson_inline int backslash_index() { return c == '\\' ? 0 : 1; }
|
|
|
|
uint8_t c;
|
|
}; // struct backslash_and_quote
|
|
|
|
// Copies a single byte to the destination unconditionally (an unwanted byte can be
// overwritten by the caller afterwards) and wraps it for inspection.
simdjson_inline backslash_and_quote backslash_and_quote::copy_and_find(const uint8_t *src, uint8_t *dst) {
  const uint8_t byte = *src;
  *dst = byte;
  return { byte };
}
|
|
|
|
} // unnamed namespace
|
|
} // namespace fallback
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_FALLBACK_STRINGPARSING_DEFS_H
|
|
/* end file simdjson/fallback/stringparsing_defs.h */
|
|
/* including simdjson/fallback/numberparsing_defs.h: #include "simdjson/fallback/numberparsing_defs.h" */
|
|
/* begin file simdjson/fallback/numberparsing_defs.h */
|
|
#ifndef SIMDJSON_FALLBACK_NUMBERPARSING_DEFS_H
|
|
#define SIMDJSON_FALLBACK_NUMBERPARSING_DEFS_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/fallback/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/numberparsing_tables.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
#include <cstring>
|
|
|
|
#ifdef JSON_TEST_NUMBERS // for unit testing
// Hooks that are only declared when JSON_TEST_NUMBERS is defined. No
// definition is visible here, so the test build is presumably expected to
// supply them (verify against the test harness).
void found_invalid_number(const uint8_t *buf);
void found_integer(int64_t result, const uint8_t *buf);
void found_unsigned_integer(uint64_t result, const uint8_t *buf);
void found_float(double result, const uint8_t *buf);
#endif
|
|
|
|
namespace simdjson {
|
|
namespace fallback {
|
|
namespace numberparsing {
|
|
|
|
// credit: https://johnnylee-sde.github.io/Fast-numeric-string-to-int/
|
|
/** @private */
|
|
static simdjson_inline uint32_t parse_eight_digits_unrolled(const char *chars) {
|
|
uint64_t val;
|
|
memcpy(&val, chars, sizeof(uint64_t));
|
|
val = (val & 0x0F0F0F0F0F0F0F0F) * 2561 >> 8;
|
|
val = (val & 0x00FF00FF00FF00FF) * 6553601 >> 16;
|
|
return uint32_t((val & 0x0000FFFF0000FFFF) * 42949672960001 >> 32);
|
|
}
|
|
|
|
/** @private */
|
|
static simdjson_inline uint32_t parse_eight_digits_unrolled(const uint8_t *chars) {
|
|
return parse_eight_digits_unrolled(reinterpret_cast<const char *>(chars));
|
|
}
|
|
|
|
#if SIMDJSON_IS_32BITS // _umul128 for x86, arm
// this is a slow emulation routine for 32-bit
//
// Returns the full 64-bit product of two 32-bit operands.
static simdjson_inline uint64_t __emulu(uint32_t x, uint32_t y) {
  return x * static_cast<uint64_t>(y);
}

// Computes the 128-bit product ab * cd from 32-bit partial products.
// Returns the low 64 bits and stores the high 64 bits in *hi.
// With ab = a*2^32 + b and cd = c*2^32 + d:
//   ab * cd = (a*c)*2^64 + (a*d + b*c)*2^32 + b*d
static simdjson_inline uint64_t _umul128(uint64_t ab, uint64_t cd, uint64_t *hi) {
  const uint32_t a = static_cast<uint32_t>(ab >> 32);
  const uint32_t b = static_cast<uint32_t>(ab);
  const uint32_t c = static_cast<uint32_t>(cd >> 32);
  const uint32_t d = static_cast<uint32_t>(cd);

  uint64_t ad = __emulu(a, d);
  uint64_t bd = __emulu(b, d);
  uint64_t adbc = ad + __emulu(b, c);
  // The sum of the two middle terms may wrap; remember the carry.
  uint64_t adbc_carry = !!(adbc < ad);
  uint64_t lo = bd + (adbc << 32);
  // (lo < bd) detects the carry out of the low 64 bits.
  *hi = __emulu(a, c) + (adbc >> 32) + (adbc_carry << 32) + !!(lo < bd);
  return lo;
}
#endif
|
|
|
|
/** @private */
|
|
simdjson_inline internal::value128 full_multiplication(uint64_t value1, uint64_t value2) {
|
|
internal::value128 answer;
|
|
#if SIMDJSON_REGULAR_VISUAL_STUDIO || SIMDJSON_IS_32BITS
|
|
#if SIMDJSON_IS_ARM64
|
|
// ARM64 has native support for 64-bit multiplications, no need to emultate
|
|
answer.high = __umulh(value1, value2);
|
|
answer.low = value1 * value2;
|
|
#else
|
|
answer.low = _umul128(value1, value2, &answer.high); // _umul128 not available on ARM64
|
|
#endif // SIMDJSON_IS_ARM64
|
|
#else // SIMDJSON_REGULAR_VISUAL_STUDIO || SIMDJSON_IS_32BITS
|
|
__uint128_t r = (static_cast<__uint128_t>(value1)) * value2;
|
|
answer.low = uint64_t(r);
|
|
answer.high = uint64_t(r >> 64);
|
|
#endif
|
|
return answer;
|
|
}
|
|
|
|
} // namespace numberparsing
|
|
} // namespace fallback
|
|
} // namespace simdjson
|
|
|
|
#define SIMDJSON_SWAR_NUMBER_PARSING 1
|
|
|
|
#endif // SIMDJSON_FALLBACK_NUMBERPARSING_DEFS_H
|
|
/* end file simdjson/fallback/numberparsing_defs.h */
|
|
/* end file simdjson/fallback/begin.h */
|
|
/* including simdjson/generic/amalgamated.h for fallback: #include "simdjson/generic/amalgamated.h" */
|
|
/* begin file simdjson/generic/amalgamated.h for fallback */
|
|
#if defined(SIMDJSON_CONDITIONAL_INCLUDE) && !defined(SIMDJSON_GENERIC_DEPENDENCIES_H)
|
|
#error simdjson/generic/dependencies.h must be included before simdjson/generic/amalgamated.h!
|
|
#endif
|
|
|
|
/* including simdjson/generic/base.h for fallback: #include "simdjson/generic/base.h" */
|
|
/* begin file simdjson/generic/base.h for fallback */
|
|
#ifndef SIMDJSON_GENERIC_BASE_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_GENERIC_BASE_H */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/base.h" */
|
|
/* amalgamation skipped (editor-only): // If we haven't got an implementation yet, we're in the editor, editing a generic file! Just */
|
|
/* amalgamation skipped (editor-only): // use the most advanced one we can so the most possible stuff can be tested. */
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_IMPLEMENTATION */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/implementation_detection.h" */
|
|
/* amalgamation skipped (editor-only): #if SIMDJSON_IMPLEMENTATION_ICELAKE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/icelake/begin.h" */
|
|
/* amalgamation skipped (editor-only): #elif SIMDJSON_IMPLEMENTATION_HASWELL */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/haswell/begin.h" */
|
|
/* amalgamation skipped (editor-only): #elif SIMDJSON_IMPLEMENTATION_WESTMERE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/westmere/begin.h" */
|
|
/* amalgamation skipped (editor-only): #elif SIMDJSON_IMPLEMENTATION_ARM64 */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/arm64/begin.h" */
|
|
/* amalgamation skipped (editor-only): #elif SIMDJSON_IMPLEMENTATION_PPC64 */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/ppc64/begin.h" */
|
|
/* amalgamation skipped (editor-only): #elif SIMDJSON_IMPLEMENTATION_LSX */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/lsx/begin.h" */
|
|
/* amalgamation skipped (editor-only): #elif SIMDJSON_IMPLEMENTATION_LASX */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/lasx/begin.h" */
|
|
/* amalgamation skipped (editor-only): #elif SIMDJSON_IMPLEMENTATION_FALLBACK */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/fallback/begin.h" */
|
|
/* amalgamation skipped (editor-only): #else */
|
|
/* amalgamation skipped (editor-only): #error "All possible implementations (including fallback) have been disabled! simdjson will not run." */
|
|
/* amalgamation skipped (editor-only): #endif */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_IMPLEMENTATION */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace fallback {
|
|
|
|
struct open_container;
class dom_parser_implementation;

/**
 * The type of a JSON number
 */
enum class number_type {
  floating_point_number = 1, ///< a binary64 number
  signed_integer,            ///< a signed integer that fits in a 64-bit word using two's complement
  unsigned_integer,          ///< a positive integer larger or equal to 1<<63
  big_integer                ///< a big integer that does not fit in a 64-bit word
};
|
|
|
|
} // namespace fallback
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_GENERIC_BASE_H
|
|
/* end file simdjson/generic/base.h for fallback */
|
|
/* including simdjson/generic/jsoncharutils.h for fallback: #include "simdjson/generic/jsoncharutils.h" */
|
|
/* begin file simdjson/generic/jsoncharutils.h for fallback */
|
|
#ifndef SIMDJSON_GENERIC_JSONCHARUTILS_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_GENERIC_JSONCHARUTILS_H */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/generic/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/jsoncharutils_tables.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/numberparsing_tables.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace fallback {
|
|
namespace {
|
|
namespace jsoncharutils {
|
|
|
|
// return non-zero if not a structural or whitespace char
|
|
// zero otherwise
|
|
simdjson_inline uint32_t is_not_structural_or_whitespace(uint8_t c) {
|
|
return internal::structural_or_whitespace_negated[c];
|
|
}
|
|
|
|
// Returns the internal::structural_or_whitespace table entry for c.
// Presumably non-zero when c is a structural or whitespace char and zero
// otherwise, mirroring is_not_structural_or_whitespace (table not visible
// here - verify).
simdjson_inline uint32_t is_structural_or_whitespace(uint8_t c) {
  return internal::structural_or_whitespace[c];
}
|
|
|
|
// returns a value with the high 16 bits set if not valid
|
|
// otherwise returns the conversion of the 4 hex digits at src into the bottom
|
|
// 16 bits of the 32-bit return register
|
|
//
|
|
// see
|
|
// https://lemire.me/blog/2019/04/17/parsing-short-hexadecimal-strings-efficiently/
|
|
static inline uint32_t hex_to_u32_nocheck(
|
|
const uint8_t *src) { // strictly speaking, static inline is a C-ism
|
|
uint32_t v1 = internal::digit_to_val32[630 + src[0]];
|
|
uint32_t v2 = internal::digit_to_val32[420 + src[1]];
|
|
uint32_t v3 = internal::digit_to_val32[210 + src[2]];
|
|
uint32_t v4 = internal::digit_to_val32[0 + src[3]];
|
|
return v1 | v2 | v3 | v4;
|
|
}
|
|
|
|
// given a code point cp, writes to c
|
|
// the utf-8 code, outputting the length in
|
|
// bytes, if the length is zero, the code point
|
|
// is invalid
|
|
//
|
|
// This can possibly be made faster using pdep
|
|
// and clz and table lookups, but JSON documents
|
|
// have few escaped code points, and the following
|
|
// function looks cheap.
|
|
//
|
|
// Note: we assume that surrogates are treated separately
|
|
//
|
|
simdjson_inline size_t codepoint_to_utf8(uint32_t cp, uint8_t *c) {
|
|
if (cp <= 0x7F) {
|
|
c[0] = uint8_t(cp);
|
|
return 1; // ascii
|
|
}
|
|
if (cp <= 0x7FF) {
|
|
c[0] = uint8_t((cp >> 6) + 192);
|
|
c[1] = uint8_t((cp & 63) + 128);
|
|
return 2; // universal plane
|
|
// Surrogates are treated elsewhere...
|
|
//} //else if (0xd800 <= cp && cp <= 0xdfff) {
|
|
// return 0; // surrogates // could put assert here
|
|
} else if (cp <= 0xFFFF) {
|
|
c[0] = uint8_t((cp >> 12) + 224);
|
|
c[1] = uint8_t(((cp >> 6) & 63) + 128);
|
|
c[2] = uint8_t((cp & 63) + 128);
|
|
return 3;
|
|
} else if (cp <= 0x10FFFF) { // if you know you have a valid code point, this
|
|
// is not needed
|
|
c[0] = uint8_t((cp >> 18) + 240);
|
|
c[1] = uint8_t(((cp >> 12) & 63) + 128);
|
|
c[2] = uint8_t(((cp >> 6) & 63) + 128);
|
|
c[3] = uint8_t((cp & 63) + 128);
|
|
return 4;
|
|
}
|
|
// will return 0 when the code point was too large.
|
|
return 0; // bad r
|
|
}
|
|
|
|
#if SIMDJSON_IS_32BITS // _umul128 for x86, arm
// this is a slow emulation routine for 32-bit
//
// Returns the full 64-bit product of two 32-bit operands.
static simdjson_inline uint64_t __emulu(uint32_t x, uint32_t y) {
  return x * static_cast<uint64_t>(y);
}

// Computes the 128-bit product ab * cd from 32-bit partial products.
// Returns the low 64 bits and stores the high 64 bits in *hi.
// With ab = a*2^32 + b and cd = c*2^32 + d:
//   ab * cd = (a*c)*2^64 + (a*d + b*c)*2^32 + b*d
static simdjson_inline uint64_t _umul128(uint64_t ab, uint64_t cd, uint64_t *hi) {
  const uint32_t a = static_cast<uint32_t>(ab >> 32);
  const uint32_t b = static_cast<uint32_t>(ab);
  const uint32_t c = static_cast<uint32_t>(cd >> 32);
  const uint32_t d = static_cast<uint32_t>(cd);

  uint64_t ad = __emulu(a, d);
  uint64_t bd = __emulu(b, d);
  uint64_t adbc = ad + __emulu(b, c);
  // The sum of the two middle terms may wrap; remember the carry.
  uint64_t adbc_carry = !!(adbc < ad);
  uint64_t lo = bd + (adbc << 32);
  // (lo < bd) detects the carry out of the low 64 bits.
  *hi = __emulu(a, c) + (adbc >> 32) + (adbc_carry << 32) + !!(lo < bd);
  return lo;
}
#endif
|
|
|
|
} // namespace jsoncharutils
|
|
} // unnamed namespace
|
|
} // namespace fallback
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_GENERIC_JSONCHARUTILS_H
|
|
/* end file simdjson/generic/jsoncharutils.h for fallback */
|
|
/* including simdjson/generic/atomparsing.h for fallback: #include "simdjson/generic/atomparsing.h" */
|
|
/* begin file simdjson/generic/atomparsing.h for fallback */
|
|
#ifndef SIMDJSON_GENERIC_ATOMPARSING_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_GENERIC_ATOMPARSING_H */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/generic/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/generic/jsoncharutils.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
#include <cstring>
|
|
|
|
namespace simdjson {
|
|
namespace fallback {
|
|
namespace {
|
|
/// @private
|
|
namespace atomparsing {
|
|
|
|
// The string_to_uint32 is exclusively used to map literal strings to 32-bit values.
|
|
// We use memcpy instead of a pointer cast to avoid undefined behaviors since we cannot
|
|
// be certain that the character pointer will be properly aligned.
|
|
// You might think that using memcpy makes this function expensive, but you'd be wrong.
|
|
// All decent optimizing compilers (GCC, clang, Visual Studio) will compile string_to_uint32("false");
|
|
// to the compile-time constant 1936482662.
|
|
simdjson_inline uint32_t string_to_uint32(const char* str) { uint32_t val; std::memcpy(&val, str, sizeof(uint32_t)); return val; }
|
|
|
|
|
|
// Again in str4ncmp we use a memcpy to avoid undefined behavior. The memcpy may appear expensive.
|
|
// Yet all decent optimizing compilers will compile memcpy to a single instruction, just about.
|
|
simdjson_warn_unused
|
|
simdjson_inline uint32_t str4ncmp(const uint8_t *src, const char* atom) {
|
|
uint32_t srcval; // we want to avoid unaligned 32-bit loads (undefined in C/C++)
|
|
static_assert(sizeof(uint32_t) <= SIMDJSON_PADDING, "SIMDJSON_PADDING must be larger than 4 bytes");
|
|
std::memcpy(&srcval, src, sizeof(uint32_t));
|
|
return srcval ^ string_to_uint32(atom);
|
|
}
|
|
|
|
// Returns true when src starts with "true" and the byte right after it
// (src[4]) is a structural or whitespace character.
// Reads src[0..4], so at least five bytes must be readable.
simdjson_warn_unused
simdjson_inline bool is_valid_true_atom(const uint8_t *src) {
  return (str4ncmp(src, "true") | jsoncharutils::is_not_structural_or_whitespace(src[4])) == 0;
}
|
|
|
|
// Length-aware variant of is_valid_true_atom.
//  - len > 4:  same check as the unbounded overload (which also looks at src[4]);
//  - len == 4: only the four bytes are compared with "true";
//  - len < 4:  always false.
simdjson_warn_unused
simdjson_inline bool is_valid_true_atom(const uint8_t *src, size_t len) {
  if (len > 4) { return is_valid_true_atom(src); }
  if (len == 4) { return !str4ncmp(src, "true"); }
  return false;
}
|
|
|
|
// Returns true when src[1..4] equals "alse" and src[5] is a structural or
// whitespace character. src[0] is not examined here; presumably the caller
// already dispatched on the leading 'f' (verify against callers).
// Reads src[1..5], so at least six bytes must be readable.
simdjson_warn_unused
simdjson_inline bool is_valid_false_atom(const uint8_t *src) {
  return (str4ncmp(src+1, "alse") | jsoncharutils::is_not_structural_or_whitespace(src[5])) == 0;
}
|
|
|
|
// Length-aware variant of is_valid_false_atom.
//  - len > 5:  same check as the unbounded overload (which also looks at src[5]);
//  - len == 5: only src[1..4] is compared with "alse";
//  - len < 5:  always false.
simdjson_warn_unused
simdjson_inline bool is_valid_false_atom(const uint8_t *src, size_t len) {
  if (len > 5) { return is_valid_false_atom(src); }
  if (len == 5) { return !str4ncmp(src+1, "alse"); }
  return false;
}
|
|
|
|
// Returns true when src starts with "null" and the byte right after it
// (src[4]) is a structural or whitespace character.
// Reads src[0..4], so at least five bytes must be readable.
simdjson_warn_unused
simdjson_inline bool is_valid_null_atom(const uint8_t *src) {
  return (str4ncmp(src, "null") | jsoncharutils::is_not_structural_or_whitespace(src[4])) == 0;
}
|
|
|
|
// Length-aware variant of is_valid_null_atom.
//  - len > 4:  same check as the unbounded overload (which also looks at src[4]);
//  - len == 4: only the four bytes are compared with "null";
//  - len < 4:  always false.
simdjson_warn_unused
simdjson_inline bool is_valid_null_atom(const uint8_t *src, size_t len) {
  if (len > 4) { return is_valid_null_atom(src); }
  if (len == 4) { return !str4ncmp(src, "null"); }
  return false;
}
|
|
|
|
} // namespace atomparsing
|
|
} // unnamed namespace
|
|
} // namespace fallback
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_GENERIC_ATOMPARSING_H
|
|
/* end file simdjson/generic/atomparsing.h for fallback */
|
|
/* including simdjson/generic/dom_parser_implementation.h for fallback: #include "simdjson/generic/dom_parser_implementation.h" */
|
|
/* begin file simdjson/generic/dom_parser_implementation.h for fallback */
|
|
#ifndef SIMDJSON_GENERIC_DOM_PARSER_IMPLEMENTATION_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_GENERIC_DOM_PARSER_IMPLEMENTATION_H */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/generic/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/dom_parser_implementation.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace fallback {
|
|
|
|
// expectation: sizeof(open_container) = 64/8.
struct open_container {
  uint32_t tape_index; // where, on the tape, does the scope ([,{) begins
  uint32_t count;      // how many elements in the scope
}; // struct open_container

static_assert(sizeof(open_container) == 64/8, "Open container must be 64 bits");
|
|
|
|
class dom_parser_implementation final : public internal::dom_parser_implementation {
|
|
public:
|
|
/** Tape location of each open { or [ */
|
|
std::unique_ptr<open_container[]> open_containers{};
|
|
/** Whether each open container is a [ or { */
|
|
std::unique_ptr<bool[]> is_array{};
|
|
/** Buffer passed to stage 1 */
|
|
const uint8_t *buf{};
|
|
/** Length passed to stage 1 */
|
|
size_t len{0};
|
|
/** Document passed to stage 2 */
|
|
dom::document *doc{};
|
|
|
|
inline dom_parser_implementation() noexcept;
|
|
inline dom_parser_implementation(dom_parser_implementation &&other) noexcept;
|
|
inline dom_parser_implementation &operator=(dom_parser_implementation &&other) noexcept;
|
|
dom_parser_implementation(const dom_parser_implementation &) = delete;
|
|
dom_parser_implementation &operator=(const dom_parser_implementation &) = delete;
|
|
|
|
simdjson_warn_unused error_code parse(const uint8_t *buf, size_t len, dom::document &doc) noexcept final;
|
|
simdjson_warn_unused error_code stage1(const uint8_t *buf, size_t len, stage1_mode partial) noexcept final;
|
|
simdjson_warn_unused error_code stage2(dom::document &doc) noexcept final;
|
|
simdjson_warn_unused error_code stage2_next(dom::document &doc) noexcept final;
|
|
simdjson_warn_unused uint8_t *parse_string(const uint8_t *src, uint8_t *dst, bool allow_replacement) const noexcept final;
|
|
simdjson_warn_unused uint8_t *parse_wobbly_string(const uint8_t *src, uint8_t *dst) const noexcept final;
|
|
inline simdjson_warn_unused error_code set_capacity(size_t capacity) noexcept final;
|
|
inline simdjson_warn_unused error_code set_max_depth(size_t max_depth) noexcept final;
|
|
private:
|
|
simdjson_inline simdjson_warn_unused error_code set_capacity_stage1(size_t capacity);
|
|
|
|
};
|
|
|
|
} // namespace fallback
|
|
} // namespace simdjson
|
|
|
|
namespace simdjson {
|
|
namespace fallback {
|
|
|
|
// Default construction, move construction and move assignment are all
// compiler-generated; they are defined here, out of line, as inline.
inline dom_parser_implementation::dom_parser_implementation() noexcept = default;
inline dom_parser_implementation::dom_parser_implementation(dom_parser_implementation &&other) noexcept = default;
inline dom_parser_implementation &dom_parser_implementation::operator=(dom_parser_implementation &&other) noexcept = default;
|
|
|
|
// Leaving these here so they can be inlined if so desired
|
|
inline simdjson_warn_unused error_code dom_parser_implementation::set_capacity(size_t capacity) noexcept {
|
|
if(capacity > SIMDJSON_MAXSIZE_BYTES) { return CAPACITY; }
|
|
// Stage 1 index output
|
|
size_t max_structures = SIMDJSON_ROUNDUP_N(capacity, 64) + 2 + 7;
|
|
structural_indexes.reset( new (std::nothrow) uint32_t[max_structures] );
|
|
if (!structural_indexes) { _capacity = 0; return MEMALLOC; }
|
|
structural_indexes[0] = 0;
|
|
n_structural_indexes = 0;
|
|
|
|
_capacity = capacity;
|
|
return SUCCESS;
|
|
}
|
|
|
|
/**
 * (Re)allocates the stage 2 stacks (open_containers and is_array) with
 * `max_depth` entries each.
 *
 * @return MEMALLOC if either allocation fails (_max_depth is reset to 0),
 *         SUCCESS otherwise.
 */
inline simdjson_warn_unused error_code dom_parser_implementation::set_max_depth(size_t max_depth) noexcept {
  // Stage 2 stacks
  open_containers.reset(new (std::nothrow) open_container[max_depth]);
  is_array.reset(new (std::nothrow) bool[max_depth]);
  if (!is_array || !open_containers) { _max_depth = 0; return MEMALLOC; }

  _max_depth = max_depth;
  return SUCCESS;
}
|
|
|
|
} // namespace fallback
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_GENERIC_DOM_PARSER_IMPLEMENTATION_H
|
|
/* end file simdjson/generic/dom_parser_implementation.h for fallback */
|
|
/* including simdjson/generic/implementation_simdjson_result_base.h for fallback: #include "simdjson/generic/implementation_simdjson_result_base.h" */
|
|
/* begin file simdjson/generic/implementation_simdjson_result_base.h for fallback */
|
|
#ifndef SIMDJSON_GENERIC_IMPLEMENTATION_SIMDJSON_RESULT_BASE_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_GENERIC_IMPLEMENTATION_SIMDJSON_RESULT_BASE_H */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/generic/base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace fallback {
|
|
|
|
// This is a near copy of include/error.h's implementation_simdjson_result_base, except it doesn't use std::pair
|
|
// so we can avoid inlining errors
|
|
// TODO reconcile these!
|
|
/**
|
|
* The result of a simdjson operation that could fail.
|
|
*
|
|
* Gives the option of reading error codes, or throwing an exception by casting to the desired result.
|
|
*
|
|
* This is a base class for implementations that want to add functions to the result type for
|
|
* chaining.
|
|
*
|
|
* Override like:
|
|
*
|
|
* struct simdjson_result<T> : public internal::implementation_simdjson_result_base<T> {
|
|
* simdjson_result() noexcept : internal::implementation_simdjson_result_base<T>() {}
|
|
* simdjson_result(error_code error) noexcept : internal::implementation_simdjson_result_base<T>(error) {}
|
|
* simdjson_result(T &&value) noexcept : internal::implementation_simdjson_result_base<T>(std::forward(value)) {}
|
|
* simdjson_result(T &&value, error_code error) noexcept : internal::implementation_simdjson_result_base<T>(value, error) {}
|
|
* // Your extra methods here
|
|
* }
|
|
*
|
|
* Then any method returning simdjson_result<T> will be chainable with your methods.
|
|
*/
|
|
template<typename T>
|
|
struct implementation_simdjson_result_base {
|
|
|
|
/**
|
|
* Create a new empty result with error = UNINITIALIZED.
|
|
*/
|
|
simdjson_inline implementation_simdjson_result_base() noexcept = default;
|
|
|
|
/**
|
|
* Create a new error result.
|
|
*/
|
|
simdjson_inline implementation_simdjson_result_base(error_code error) noexcept;
|
|
|
|
/**
|
|
* Create a new successful result.
|
|
*/
|
|
simdjson_inline implementation_simdjson_result_base(T &&value) noexcept;
|
|
|
|
/**
|
|
* Create a new result with both things (use if you don't want to branch when creating the result).
|
|
*/
|
|
simdjson_inline implementation_simdjson_result_base(T &&value, error_code error) noexcept;
|
|
|
|
/**
|
|
* Move the value and the error to the provided variables.
|
|
*
|
|
* @param value The variable to assign the value to. May not be set if there is an error.
|
|
* @param error The variable to assign the error to. Set to SUCCESS if there is no error.
|
|
*/
|
|
simdjson_inline void tie(T &value, error_code &error) && noexcept;
|
|
|
|
/**
|
|
* Move the value to the provided variable.
|
|
*
|
|
* @param value The variable to assign the value to. May not be set if there is an error.
|
|
*/
|
|
simdjson_inline error_code get(T &value) && noexcept;
|
|
|
|
/**
|
|
* The error.
|
|
*/
|
|
simdjson_inline error_code error() const noexcept;
|
|
|
|
#if SIMDJSON_EXCEPTIONS
|
|
|
|
/**
|
|
* Get the result value.
|
|
*
|
|
* @throw simdjson_error if there was an error.
|
|
*/
|
|
simdjson_inline T& value() & noexcept(false);
|
|
|
|
/**
|
|
* Take the result value (move it).
|
|
*
|
|
* @throw simdjson_error if there was an error.
|
|
*/
|
|
simdjson_inline T&& value() && noexcept(false);
|
|
|
|
/**
|
|
* Take the result value (move it).
|
|
*
|
|
* @throw simdjson_error if there was an error.
|
|
*/
|
|
simdjson_inline T&& take_value() && noexcept(false);
|
|
|
|
/**
|
|
* Cast to the value (will throw on error).
|
|
*
|
|
* @throw simdjson_error if there was an error.
|
|
*/
|
|
simdjson_inline operator T&&() && noexcept(false);
|
|
|
|
|
|
#endif // SIMDJSON_EXCEPTIONS
|
|
|
|
/**
|
|
* Get the result value. This function is safe if and only
|
|
* the error() method returns a value that evaluates to false.
|
|
*/
|
|
simdjson_inline const T& value_unsafe() const& noexcept;
|
|
/**
|
|
* Get the result value. This function is safe if and only
|
|
* the error() method returns a value that evaluates to false.
|
|
*/
|
|
simdjson_inline T& value_unsafe() & noexcept;
|
|
/**
|
|
* Take the result value (move it). This function is safe if and only
|
|
* the error() method returns a value that evaluates to false.
|
|
*/
|
|
simdjson_inline T&& value_unsafe() && noexcept;
|
|
protected:
|
|
/** users should never directly access first and second. **/
|
|
T first{}; /** Users should never directly access 'first'. **/
|
|
error_code second{UNINITIALIZED}; /** Users should never directly access 'second'. **/
|
|
}; // struct implementation_simdjson_result_base
|
|
|
|
} // namespace fallback
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_GENERIC_IMPLEMENTATION_SIMDJSON_RESULT_BASE_H
|
|
/* end file simdjson/generic/implementation_simdjson_result_base.h for fallback */
|
|
/* including simdjson/generic/numberparsing.h for fallback: #include "simdjson/generic/numberparsing.h" */
|
|
/* begin file simdjson/generic/numberparsing.h for fallback */
|
|
#ifndef SIMDJSON_GENERIC_NUMBERPARSING_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_GENERIC_NUMBERPARSING_H */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/generic/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/generic/jsoncharutils.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/numberparsing_tables.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
#include <limits>
|
|
#include <ostream>
|
|
#include <cstring>
|
|
|
|
namespace simdjson {
|
|
namespace fallback {
|
|
namespace numberparsing {
|
|
|
|
// Helper macros used when reporting parsed numbers.
// With JSON_TEST_NUMBERS defined, each macro first calls the matching
// found_* test hook and then performs the normal action; otherwise only the
// normal action is performed and SRC is ignored.
#ifdef JSON_TEST_NUMBERS
#define INVALID_NUMBER(SRC) (found_invalid_number((SRC)), NUMBER_ERROR)
#define WRITE_INTEGER(VALUE, SRC, WRITER) (found_integer((VALUE), (SRC)), (WRITER).append_s64((VALUE)))
#define WRITE_UNSIGNED(VALUE, SRC, WRITER) (found_unsigned_integer((VALUE), (SRC)), (WRITER).append_u64((VALUE)))
#define WRITE_DOUBLE(VALUE, SRC, WRITER) (found_float((VALUE), (SRC)), (WRITER).append_double((VALUE)))
#define BIGINT_NUMBER(SRC) (found_invalid_number((SRC)), BIGINT_ERROR)
#else
#define INVALID_NUMBER(SRC) (NUMBER_ERROR)
#define WRITE_INTEGER(VALUE, SRC, WRITER) (WRITER).append_s64((VALUE))
#define WRITE_UNSIGNED(VALUE, SRC, WRITER) (WRITER).append_u64((VALUE))
#define WRITE_DOUBLE(VALUE, SRC, WRITER) (WRITER).append_double((VALUE))
#define BIGINT_NUMBER(SRC) (BIGINT_ERROR)
#endif
|
|
|
|
namespace {
|
|
|
|
// Convert a mantissa, an exponent and a sign bit into an ieee64 double.
|
|
// The real_exponent needs to be in [0, 2046] (technically real_exponent = 2047 would be acceptable).
|
|
// The mantissa should be in [0,1<<53). The bit at index (1ULL << 52) while be zeroed.
|
|
simdjson_inline double to_double(uint64_t mantissa, uint64_t real_exponent, bool negative) {
|
|
double d;
|
|
mantissa &= ~(1ULL << 52);
|
|
mantissa |= real_exponent << 52;
|
|
mantissa |= ((static_cast<uint64_t>(negative)) << 63);
|
|
std::memcpy(&d, &mantissa, sizeof(d));
|
|
return d;
|
|
}
|
|
|
|
// Attempts to compute i * 10^(power) exactly; and if "negative" is
|
|
// true, negate the result.
|
|
// This function will only work in some cases, when it does not work, success is
|
|
// set to false. This should work *most of the time* (like 99% of the time).
|
|
// We assume that power is in the [smallest_power,
|
|
// largest_power] interval: the caller is responsible for this check.
|
|
simdjson_inline bool compute_float_64(int64_t power, uint64_t i, bool negative, double &d) {
|
|
// we start with a fast path
|
|
// It was described in
|
|
// Clinger WD. How to read floating point numbers accurately.
|
|
// ACM SIGPLAN Notices. 1990
|
|
#ifndef FLT_EVAL_METHOD
|
|
#error "FLT_EVAL_METHOD should be defined, please include cfloat."
|
|
#endif
|
|
#if (FLT_EVAL_METHOD != 1) && (FLT_EVAL_METHOD != 0)
|
|
// We cannot be certain that x/y is rounded to nearest.
|
|
if (0 <= power && power <= 22 && i <= 9007199254740991)
|
|
#else
|
|
if (-22 <= power && power <= 22 && i <= 9007199254740991)
|
|
#endif
|
|
{
|
|
// convert the integer into a double. This is lossless since
|
|
// 0 <= i <= 2^53 - 1.
|
|
d = double(i);
|
|
//
|
|
// The general idea is as follows.
|
|
// If 0 <= s < 2^53 and if 10^0 <= p <= 10^22 then
|
|
// 1) Both s and p can be represented exactly as 64-bit floating-point
|
|
// values
|
|
// (binary64).
|
|
// 2) Because s and p can be represented exactly as floating-point values,
|
|
// then s * p
|
|
// and s / p will produce correctly rounded values.
|
|
//
|
|
if (power < 0) {
|
|
d = d / simdjson::internal::power_of_ten[-power];
|
|
} else {
|
|
d = d * simdjson::internal::power_of_ten[power];
|
|
}
|
|
if (negative) {
|
|
d = -d;
|
|
}
|
|
return true;
|
|
}
|
|
// When 22 < power && power < 22 + 16, we could
|
|
// hope for another, secondary fast path. It was
|
|
// described by David M. Gay in "Correctly rounded
|
|
// binary-decimal and decimal-binary conversions." (1990)
|
|
// If you need to compute i * 10^(22 + x) for x < 16,
|
|
// first compute i * 10^x, if you know that result is exact
|
|
// (e.g., when i * 10^x < 2^53),
|
|
// then you can still proceed and do (i * 10^x) * 10^22.
|
|
// Is this worth your time?
|
|
// You need 22 < power *and* power < 22 + 16 *and* (i * 10^(x-22) < 2^53)
|
|
// for this second fast path to work.
|
|
// If you you have 22 < power *and* power < 22 + 16, and then you
|
|
// optimistically compute "i * 10^(x-22)", there is still a chance that you
|
|
// have wasted your time if i * 10^(x-22) >= 2^53. It makes the use cases of
|
|
// this optimization maybe less common than we would like. Source:
|
|
// http://www.exploringbinary.com/fast-path-decimal-to-floating-point-conversion/
|
|
// also used in RapidJSON: https://rapidjson.org/strtod_8h_source.html
|
|
|
|
// The fast path has now failed, so we are failing back on the slower path.
|
|
|
|
// In the slow path, we need to adjust i so that it is > 1<<63 which is always
|
|
// possible, except if i == 0, so we handle i == 0 separately.
|
|
if(i == 0) {
|
|
d = negative ? -0.0 : 0.0;
|
|
return true;
|
|
}
|
|
|
|
|
|
// The exponent is 1024 + 63 + power
|
|
// + floor(log(5**power)/log(2)).
|
|
// The 1024 comes from the ieee64 standard.
|
|
// The 63 comes from the fact that we use a 64-bit word.
|
|
//
|
|
// Computing floor(log(5**power)/log(2)) could be
|
|
// slow. Instead we use a fast function.
|
|
//
|
|
// For power in (-400,350), we have that
|
|
// (((152170 + 65536) * power ) >> 16);
|
|
// is equal to
|
|
// floor(log(5**power)/log(2)) + power when power >= 0
|
|
// and it is equal to
|
|
// ceil(log(5**-power)/log(2)) + power when power < 0
|
|
//
|
|
// The 65536 is (1<<16) and corresponds to
|
|
// (65536 * power) >> 16 ---> power
|
|
//
|
|
// ((152170 * power ) >> 16) is equal to
|
|
// floor(log(5**power)/log(2))
|
|
//
|
|
// Note that this is not magic: 152170/(1<<16) is
|
|
// approximatively equal to log(5)/log(2).
|
|
// The 1<<16 value is a power of two; we could use a
|
|
// larger power of 2 if we wanted to.
|
|
//
|
|
int64_t exponent = (((152170 + 65536) * power) >> 16) + 1024 + 63;
|
|
|
|
|
|
// We want the most significant bit of i to be 1. Shift if needed.
|
|
int lz = leading_zeroes(i);
|
|
i <<= lz;
|
|
|
|
|
|
// We are going to need to do some 64-bit arithmetic to get a precise product.
|
|
// We use a table lookup approach.
|
|
// It is safe because
|
|
// power >= smallest_power
|
|
// and power <= largest_power
|
|
// We recover the mantissa of the power, it has a leading 1. It is always
|
|
// rounded down.
|
|
//
|
|
// We want the most significant 64 bits of the product. We know
|
|
// this will be non-zero because the most significant bit of i is
|
|
// 1.
|
|
const uint32_t index = 2 * uint32_t(power - simdjson::internal::smallest_power);
|
|
// Optimization: It may be that materializing the index as a variable might confuse some compilers and prevent effective complex-addressing loads. (Done for code clarity.)
|
|
//
|
|
// The full_multiplication function computes the 128-bit product of two 64-bit words
|
|
// with a returned value of type value128 with a "low component" corresponding to the
|
|
// 64-bit least significant bits of the product and with a "high component" corresponding
|
|
// to the 64-bit most significant bits of the product.
|
|
simdjson::internal::value128 firstproduct = full_multiplication(i, simdjson::internal::power_of_five_128[index]);
|
|
// Both i and power_of_five_128[index] have their most significant bit set to 1 which
|
|
// implies that the either the most or the second most significant bit of the product
|
|
// is 1. We pack values in this manner for efficiency reasons: it maximizes the use
|
|
// we make of the product. It also makes it easy to reason about the product: there
|
|
// is 0 or 1 leading zero in the product.
|
|
|
|
// Unless the least significant 9 bits of the high (64-bit) part of the full
|
|
// product are all 1s, then we know that the most significant 55 bits are
|
|
// exact and no further work is needed. Having 55 bits is necessary because
|
|
// we need 53 bits for the mantissa but we have to have one rounding bit and
|
|
// we can waste a bit if the most significant bit of the product is zero.
|
|
if((firstproduct.high & 0x1FF) == 0x1FF) {
|
|
// We want to compute i * 5^q, but only care about the top 55 bits at most.
|
|
// Consider the scenario where q>=0. Then 5^q may not fit in 64-bits. Doing
|
|
// the full computation is wasteful. So we do what is called a "truncated
|
|
// multiplication".
|
|
// We take the most significant 64-bits, and we put them in
|
|
// power_of_five_128[index]. Usually, that's good enough to approximate i * 5^q
|
|
// to the desired approximation using one multiplication. Sometimes it does not suffice.
|
|
// Then we store the next most significant 64 bits in power_of_five_128[index + 1], and
|
|
// then we get a better approximation to i * 5^q.
|
|
//
|
|
// That's for when q>=0. The logic for q<0 is somewhat similar but it is somewhat
|
|
// more complicated.
|
|
//
|
|
// There is an extra layer of complexity in that we need more than 55 bits of
|
|
// accuracy in the round-to-even scenario.
|
|
//
|
|
// The full_multiplication function computes the 128-bit product of two 64-bit words
|
|
// with a returned value of type value128 with a "low component" corresponding to the
|
|
// 64-bit least significant bits of the product and with a "high component" corresponding
|
|
// to the 64-bit most significant bits of the product.
|
|
simdjson::internal::value128 secondproduct = full_multiplication(i, simdjson::internal::power_of_five_128[index + 1]);
|
|
firstproduct.low += secondproduct.high;
|
|
if(secondproduct.high > firstproduct.low) { firstproduct.high++; }
|
|
// As it has been proven by Noble Mushtak and Daniel Lemire in "Fast Number Parsing Without
|
|
// Fallback" (https://arxiv.org/abs/2212.06644), at this point we are sure that the product
|
|
// is sufficiently accurate, and more computation is not needed.
|
|
}
|
|
uint64_t lower = firstproduct.low;
|
|
uint64_t upper = firstproduct.high;
|
|
// The final mantissa should be 53 bits with a leading 1.
|
|
// We shift it so that it occupies 54 bits with a leading 1.
|
|
///////
|
|
uint64_t upperbit = upper >> 63;
|
|
uint64_t mantissa = upper >> (upperbit + 9);
|
|
lz += int(1 ^ upperbit);
|
|
|
|
// Here we have mantissa < (1<<54).
|
|
int64_t real_exponent = exponent - lz;
|
|
if (simdjson_unlikely(real_exponent <= 0)) { // we have a subnormal?
|
|
// Here have that real_exponent <= 0 so -real_exponent >= 0
|
|
if(-real_exponent + 1 >= 64) { // if we have more than 64 bits below the minimum exponent, you have a zero for sure.
|
|
d = negative ? -0.0 : 0.0;
|
|
return true;
|
|
}
|
|
// next line is safe because 0 < -real_exponent + 1 < 64
|
|
mantissa >>= -real_exponent + 1;
|
|
// Thankfully, we can't have both "round-to-even" and subnormals because
|
|
// "round-to-even" only occurs for powers close to 0.
|
|
mantissa += (mantissa & 1); // round up
|
|
mantissa >>= 1;
|
|
// There is a weird scenario where we don't have a subnormal but just.
|
|
// Suppose we start with 2.2250738585072013e-308, we end up
|
|
// with 0x3fffffffffffff x 2^-1023-53 which is technically subnormal
|
|
// whereas 0x40000000000000 x 2^-1023-53 is normal. Now, we need to round
|
|
// up 0x3fffffffffffff x 2^-1023-53 and once we do, we are no longer
|
|
// subnormal, but we can only know this after rounding.
|
|
// So we only declare a subnormal if we are smaller than the threshold.
|
|
real_exponent = (mantissa < (uint64_t(1) << 52)) ? 0 : 1;
|
|
d = to_double(mantissa, real_exponent, negative);
|
|
return true;
|
|
}
|
|
// We have to round to even. The "to even" part
|
|
// is only a problem when we are right in between two floats
|
|
// which we guard against.
|
|
// If we have lots of trailing zeros, we may fall right between two
|
|
// floating-point values.
|
|
//
|
|
// The round-to-even cases take the form of a number 2m+1 which is in (2^53,2^54]
|
|
// times a power of two. That is, it is right between a number with binary significand
|
|
// m and another number with binary significand m+1; and it must be the case
|
|
// that it cannot be represented by a float itself.
|
|
//
|
|
// We must have that w * 10 ^q == (2m+1) * 2^p for some power of two 2^p.
|
|
// Recall that 10^q = 5^q * 2^q.
|
|
// When q >= 0, we must have that (2m+1) is divisible by 5^q, so 5^q <= 2^54. We have that
|
|
// 5^23 <= 2^54 and it is the last power of five to qualify, so q <= 23.
|
|
// When q<0, we have w >= (2m+1) x 5^{-q}. We must have that w<2^{64} so
|
|
// (2m+1) x 5^{-q} < 2^{64}. We have that 2m+1>2^{53}. Hence, we must have
|
|
// 2^{53} x 5^{-q} < 2^{64}.
|
|
// Hence we have 5^{-q} < 2^{11} or q >= -4.
|
|
//
|
|
// We require lower <= 1 and not lower == 0 because we could not prove that
|
|
// that lower == 0 is implied; but we could prove that lower <= 1 is a necessary and sufficient test.
|
|
if (simdjson_unlikely((lower <= 1) && (power >= -4) && (power <= 23) && ((mantissa & 3) == 1))) {
|
|
if((mantissa << (upperbit + 64 - 53 - 2)) == upper) {
|
|
mantissa &= ~1; // flip it so that we do not round up
|
|
}
|
|
}
|
|
|
|
mantissa += mantissa & 1;
|
|
mantissa >>= 1;
|
|
|
|
// Here we have mantissa < (1<<53), unless there was an overflow
|
|
if (mantissa >= (1ULL << 53)) {
|
|
//////////
|
|
// This will happen when parsing values such as 7.2057594037927933e+16
|
|
////////
|
|
mantissa = (1ULL << 52);
|
|
real_exponent++;
|
|
}
|
|
mantissa &= ~(1ULL << 52);
|
|
// we have to check that real_exponent is in range, otherwise we bail out
|
|
if (simdjson_unlikely(real_exponent > 2046)) {
|
|
// We have an infinite value!!! We could actually throw an error here if we could.
|
|
return false;
|
|
}
|
|
d = to_double(mantissa, real_exponent, negative);
|
|
return true;
|
|
}
|
|
|
|
// We call a fallback floating-point parser that might be slow. Note
|
|
// it will accept JSON numbers, but the JSON spec. is more restrictive so
|
|
// before you call parse_float_fallback, you need to have validated the input
|
|
// string with the JSON grammar.
|
|
// It will return an error (false) if the parsed number is infinite.
|
|
// The string parsing itself always succeeds. We know that there is at least
|
|
// one digit.
|
|
static bool parse_float_fallback(const uint8_t *ptr, double *outDouble) {
|
|
*outDouble = simdjson::internal::from_chars(reinterpret_cast<const char *>(ptr));
|
|
// We do not accept infinite values.
|
|
|
|
// Detecting finite values in a portable manner is ridiculously hard, ideally
|
|
// we would want to do:
|
|
// return !std::isfinite(*outDouble);
|
|
// but that mysteriously fails under legacy/old libc++ libraries, see
|
|
// https://github.com/simdjson/simdjson/issues/1286
|
|
//
|
|
// Therefore, fall back to this solution (the extra parens are there
|
|
// to handle that max may be a macro on windows).
|
|
return !(*outDouble > (std::numeric_limits<double>::max)() || *outDouble < std::numeric_limits<double>::lowest());
|
|
}
|
|
|
|
// Bounded variant of the slow-path floating-point parser: the number text is
// the range [ptr, end_ptr) and is passed as such to
// simdjson::internal::from_chars. The parsed value is stored in *outDouble.
//
// Returns false when the parsed value lies outside the finite double range
// (i.e. it is infinite), true otherwise.
static bool parse_float_fallback(const uint8_t *ptr, const uint8_t *end_ptr, double *outDouble) {
  const char *const first = reinterpret_cast<const char *>(ptr);
  const char *const last = reinterpret_cast<const char *>(end_ptr);
  const double parsed = simdjson::internal::from_chars(first, last);
  *outDouble = parsed;

  // We would like to write std::isfinite(parsed), but that misbehaves with
  // legacy libc++ versions (https://github.com/simdjson/simdjson/issues/1286).
  // Compare against the finite limits instead. The parentheses around `max`
  // keep a Windows `max` macro from expanding.
  const bool above_range = parsed > (std::numeric_limits<double>::max)();
  const bool below_range = parsed < std::numeric_limits<double>::lowest();
  return !(above_range || below_range);
}
|
|
|
|
// check quickly whether the next 8 chars are made of digits
|
|
// at a glance, it looks better than Mula's
|
|
// http://0x80.pl/articles/swar-digits-validate.html
|
|
simdjson_inline bool is_made_of_eight_digits_fast(const uint8_t *chars) {
|
|
uint64_t val;
|
|
// this can read up to 7 bytes beyond the buffer size, but we require
|
|
// SIMDJSON_PADDING of padding
|
|
static_assert(7 <= SIMDJSON_PADDING, "SIMDJSON_PADDING must be bigger than 7");
|
|
std::memcpy(&val, chars, 8);
|
|
// a branchy method might be faster:
|
|
// return (( val & 0xF0F0F0F0F0F0F0F0 ) == 0x3030303030303030)
|
|
// && (( (val + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0 ) ==
|
|
// 0x3030303030303030);
|
|
return (((val & 0xF0F0F0F0F0F0F0F0) |
|
|
(((val + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0) >> 4)) ==
|
|
0x3333333333333333);
|
|
}
|
|
|
|
template<typename I>
|
|
SIMDJSON_NO_SANITIZE_UNDEFINED // We deliberately allow overflow here and check later
|
|
simdjson_inline bool parse_digit(const uint8_t c, I &i) {
|
|
const uint8_t digit = static_cast<uint8_t>(c - '0');
|
|
if (digit > 9) {
|
|
return false;
|
|
}
|
|
// PERF NOTE: multiplication by 10 is cheaper than arbitrary integer multiplication
|
|
i = 10 * i + digit; // might overflow, we will handle the overflow later
|
|
return true;
|
|
}
|
|
|
|
// Returns true when `c` is one of the ASCII characters '0'..'9'.
simdjson_inline bool is_digit(const uint8_t c) {
  // After the subtraction, digits land in 0..9 and everything else wraps above 9.
  const uint8_t offset_from_zero = static_cast<uint8_t>(c - '0');
  return offset_from_zero < 10;
}
|
|
|
|
// Consume the digits that follow a decimal point.
//
// On entry `p` points just past the '.'. The fractional digits keep being
// accumulated into `i` as if the number were one long integer, and `exponent`
// is set to minus the number of fractional digits consumed. On exit `p` points
// at the first non-digit character.
//
// Returns INVALID_NUMBER(src) when there is no digit after the separator
// ("123." is illegal), SUCCESS otherwise.
simdjson_inline error_code parse_decimal_after_separator(simdjson_unused const uint8_t *const src, const uint8_t *&p, uint64_t &i, int64_t &exponent) {
  const uint8_t *const fraction_begin = p;

#ifdef SIMDJSON_SWAR_NUMBER_PARSING
#if SIMDJSON_SWAR_NUMBER_PARSING
  // Long fractions are frequent enough that it pays to swallow
  // eight digits in one step when possible.
  if (is_made_of_eight_digits_fast(p)) {
    i = i * 100000000 + parse_eight_digits_unrolled(p);
    p += 8;
  }
#endif // SIMDJSON_SWAR_NUMBER_PARSING
#endif // #ifdef SIMDJSON_SWAR_NUMBER_PARSING

  // Peeling the first digit out of the loop makes a small difference on some
  // implementations (e.g. westmere).
  if (parse_digit(*p, i)) { ++p; }
  while (parse_digit(*p, i)) { ++p; }

  const int64_t fraction_length = p - fraction_begin;
  exponent = -fraction_length;
  if (fraction_length == 0) {
    // A decimal separator without digits (123.) is illegal.
    return INVALID_NUMBER(src);
  }
  return SUCCESS;
}
|
|
|
|
// Parse the exponent part of a number and fold it into `exponent`.
//
// On entry `p` points just past the 'e'/'E'. An optional sign is consumed,
// then the run of digits. On exit `p` points at the first character after the
// digits and `exponent` has been increased (or decreased, for a negative
// exponent) by the parsed amount.
//
// Returns INVALID_NUMBER(src) when no digit follows the (optional) sign,
// SUCCESS otherwise. Oversized exponents are clamped rather than rejected.
simdjson_inline error_code parse_exponent(simdjson_unused const uint8_t *const src, const uint8_t *&p, int64_t &exponent) {
  // Sign: -123.456e[-]78 (a '+' is skipped as well)
  const bool negative_exponent = (*p == '-');
  if (negative_exponent || (*p == '+')) { ++p; }

  // Digits: -123.456e-[78]
  const uint8_t *first_digit = p;
  int64_t magnitude = 0;
  while (parse_digit(*p, magnitude)) { ++p; }
  // parse_digit may have overflowed `magnitude` (possibly all the way to
  // INT64_MIN, which cannot be negated), so the overflow must be dealt with
  // before the sign is applied below.

  // Performance note: merging the two simdjson_unlikely checks below into one
  // path sounds appealing, but the compiler may still emit two distinct paths.
  // Something like uint64_t(p - first_digit - 1) >= 18 could work but would
  // trade instructions for a likely branch, an inconclusive gain.

  // No digits at all is an error.
  if (simdjson_unlikely(first_digit == p)) {
    return INVALID_NUMBER(src);
  }

  // More than 18 digits: `magnitude` may have overflowed.
  if (simdjson_unlikely((p - first_digit) > 18)) {
    // Leading zeroes do not count: 1e000000000000000000001 is technically
    // valid and does not overflow.
    while ('0' == *first_digit) { ++first_digit; }
    // 19 significant digits could overflow int64_t and are absurd anyway, so
    // the value is clamped to 999,999,999,999,999,999. That is assuredly large
    // enough: the largest finite double is ~1.8e308 and the smallest subnormal
    // is ~5e-324. There is no reason to fail here, since e.g.
    // 0e999999999999999999999 is a perfectly fine number.
    if ((p - first_digit) > 18) { magnitude = 999999999999999999; }
  }

  // `magnitude` is now a sane non-negative value <= 999,999,999,999,999,999.
  // As long as `exponent` is within [-8223372036854775808, 8223372036854775808]
  // the addition cannot overflow, and `exponent` is bounded in magnitude by
  // the size of the JSON input.
  const int64_t signed_exponent = negative_exponent ? -magnitude : magnitude;
  exponent += signed_exponent;
  return SUCCESS;
}
|
|
|
|
// Decide whether the text at `src` (at most `max_length` bytes) has the shape
// of an integer: an optional '-', then either a lone '0' or a run of digits,
// followed by the end of the buffer or a structural/whitespace character.
//
// The first byte is read unconditionally; bytes at or beyond
// src + max_length are never read after that.
simdjson_inline bool check_if_integer(const uint8_t *const src, size_t max_length) {
  const uint8_t *const limit = src + max_length;
  const uint8_t *cursor = src;

  // We can always read at least one character; skip it when it is the sign.
  if (*cursor == '-') { ++cursor; }
  if (cursor == limit) { return false; }

  if (*cursor == '0') {
    // A leading zero must stand alone: only that single character is consumed.
    ++cursor;
  } else {
    while ((cursor != limit) && is_digit(*cursor)) { ++cursor; }
  }

  // Running out of input is fine; otherwise the next character must be a
  // structural or whitespace character.
  return (cursor == limit) || !jsoncharutils::is_not_structural_or_whitespace(*cursor);
}
|
|
|
|
// Returns `digit_count` minus the number of leading '0' and '.' characters
// found at `start_digits`.
//
// This is used when the accumulated integer may have overflowed, to tell
// genuinely long numbers apart from inputs such as 0.0000somenumber. When a
// '.' is among the skipped characters the result is one lower than the true
// digit count.
simdjson_inline size_t significant_digits(const uint8_t * start_digits, size_t digit_count) {
  size_t skipped = 0;
  while ((start_digits[skipped] == '0') || (start_digits[skipped] == '.')) {
    ++skipped;
  }
  return digit_count - skipped;
}
|
|
|
|
} // unnamed namespace
|
|
|
|
/** @private */
|
|
static error_code slow_float_parsing(simdjson_unused const uint8_t * src, double* answer) {
|
|
if (parse_float_fallback(src, answer)) {
|
|
return SUCCESS;
|
|
}
|
|
return INVALID_NUMBER(src);
|
|
}
|
|
|
|
/** @private */
|
|
template<typename W>
|
|
simdjson_inline error_code write_float(const uint8_t *const src, bool negative, uint64_t i, const uint8_t * start_digits, size_t digit_count, int64_t exponent, W &writer) {
|
|
// If we frequently had to deal with long strings of digits,
|
|
// we could extend our code by using a 128-bit integer instead
|
|
// of a 64-bit integer. However, this is uncommon in practice.
|
|
//
|
|
// 9999999999999999999 < 2**64 so we can accommodate 19 digits.
|
|
// If we have a decimal separator, then digit_count - 1 is the number of digits, but we
|
|
// may not have a decimal separator!
|
|
if (simdjson_unlikely(digit_count > 19 && significant_digits(start_digits, digit_count) > 19)) {
|
|
// Ok, chances are good that we had an overflow!
|
|
// this is almost never going to get called!!!
|
|
// we start anew, going slowly!!!
|
|
// This will happen in the following examples:
|
|
// 10000000000000000000000000000000000000000000e+308
|
|
// 3.1415926535897932384626433832795028841971693993751
|
|
//
|
|
// NOTE: We do not pass a reference to the to slow_float_parsing. If we passed our writer
|
|
// reference to it, it would force it to be stored in memory, preventing the compiler from
|
|
// picking it apart and putting into registers. i.e. if we pass it as reference,
|
|
// it gets slow.
|
|
double d;
|
|
error_code error = slow_float_parsing(src, &d);
|
|
writer.append_double(d);
|
|
return error;
|
|
}
|
|
// NOTE: it's weird that the simdjson_unlikely() only wraps half the if, but it seems to get slower any other
|
|
// way we've tried: https://github.com/simdjson/simdjson/pull/990#discussion_r448497331
|
|
// To future reader: we'd love if someone found a better way, or at least could explain this result!
|
|
if (simdjson_unlikely(exponent < simdjson::internal::smallest_power) || (exponent > simdjson::internal::largest_power)) {
|
|
//
|
|
// Important: smallest_power is such that it leads to a zero value.
|
|
// Observe that 18446744073709551615e-343 == 0, i.e. (2**64 - 1) e -343 is zero
|
|
// so something x 10^-343 goes to zero, but not so with something x 10^-342.
|
|
static_assert(simdjson::internal::smallest_power <= -342, "smallest_power is not small enough");
|
|
//
|
|
if((exponent < simdjson::internal::smallest_power) || (i == 0)) {
|
|
// E.g. Parse "-0.0e-999" into the same value as "-0.0". See https://en.wikipedia.org/wiki/Signed_zero
|
|
WRITE_DOUBLE(negative ? -0.0 : 0.0, src, writer);
|
|
return SUCCESS;
|
|
} else { // (exponent > largest_power) and (i != 0)
|
|
// We have, for sure, an infinite value and simdjson refuses to parse infinite values.
|
|
return INVALID_NUMBER(src);
|
|
}
|
|
}
|
|
double d;
|
|
if (!compute_float_64(exponent, i, negative, d)) {
|
|
// we are almost never going to get here.
|
|
if (!parse_float_fallback(src, &d)) { return INVALID_NUMBER(src); }
|
|
}
|
|
WRITE_DOUBLE(d, src, writer);
|
|
return SUCCESS;
|
|
}
|
|
|
|
// parse the number at src
|
|
// define JSON_TEST_NUMBERS for unit testing
|
|
//
|
|
// It is assumed that the number is followed by a structural ({,},],[) character
|
|
// or a white space character. If that is not the case (e.g., when the JSON
|
|
// document is made of a single number), then it is necessary to copy the
|
|
// content and append a space before calling this function.
|
|
//
|
|
// Our objective is accurate parsing (ULP of 0) at high speed.
|
|
template<typename W>
|
|
simdjson_inline error_code parse_number(const uint8_t *const src, W &writer);
|
|
|
|
// for performance analysis, it is sometimes useful to skip parsing
|
|
#ifdef SIMDJSON_SKIPNUMBERPARSING
|
|
|
|
template<typename W>
|
|
simdjson_inline error_code parse_number(const uint8_t *const, W &writer) {
|
|
writer.append_s64(0); // always write zero
|
|
return SUCCESS; // always succeeds
|
|
}
|
|
|
|
// SIMDJSON_SKIPNUMBERPARSING stubs: none of these look at the input; each one
// returns a fixed placeholder.

// Always yields 0.
simdjson_unused simdjson_inline simdjson_result<uint64_t> parse_unsigned(const uint8_t * const src) noexcept {
  return 0;
}

// Always yields 0.
simdjson_unused simdjson_inline simdjson_result<int64_t> parse_integer(const uint8_t * const src) noexcept {
  return 0;
}

// Always yields 0.
simdjson_unused simdjson_inline simdjson_result<double> parse_double(const uint8_t * const src) noexcept {
  return 0;
}

// Always yields 0.
simdjson_unused simdjson_inline simdjson_result<uint64_t> parse_unsigned_in_string(const uint8_t * const src) noexcept {
  return 0;
}

// Always yields 0.
simdjson_unused simdjson_inline simdjson_result<int64_t> parse_integer_in_string(const uint8_t * const src) noexcept {
  return 0;
}

// Always yields 0.
simdjson_unused simdjson_inline simdjson_result<double> parse_double_in_string(const uint8_t * const src) noexcept {
  return 0;
}

// Always reports "not negative".
simdjson_unused simdjson_inline bool is_negative(const uint8_t * src) noexcept {
  return false;
}

// Always reports "not an integer".
simdjson_unused simdjson_inline simdjson_result<bool> is_integer(const uint8_t * src) noexcept {
  return false;
}

// Always reports number_type::signed_integer.
simdjson_unused simdjson_inline simdjson_result<number_type> get_number_type(const uint8_t * src) noexcept {
  return number_type::signed_integer;
}
|
|
#else
|
|
|
|
// parse the number at src
|
|
// define JSON_TEST_NUMBERS for unit testing
|
|
//
|
|
// It is assumed that the number is followed by a structural ({,},],[) character
|
|
// or a white space character. If that is not the case (e.g., when the JSON
|
|
// document is made of a single number), then it is necessary to copy the
|
|
// content and append a space before calling this function.
|
|
//
|
|
// Our objective is accurate parsing (ULP of 0) at high speed.
|
|
template<typename W>
|
|
simdjson_inline error_code parse_number(const uint8_t *const src, W &writer) {
|
|
|
|
//
|
|
// Check for minus sign
|
|
//
|
|
bool negative = (*src == '-');
|
|
const uint8_t *p = src + uint8_t(negative);
|
|
|
|
//
|
|
// Parse the integer part.
|
|
//
|
|
// PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
|
|
const uint8_t *const start_digits = p;
|
|
uint64_t i = 0;
|
|
while (parse_digit(*p, i)) { p++; }
|
|
|
|
// If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
|
|
// Optimization note: size_t is expected to be unsigned.
|
|
size_t digit_count = size_t(p - start_digits);
|
|
if (digit_count == 0 || ('0' == *start_digits && digit_count > 1)) { return INVALID_NUMBER(src); }
|
|
|
|
//
|
|
// Handle floats if there is a . or e (or both)
|
|
//
|
|
int64_t exponent = 0;
|
|
bool is_float = false;
|
|
if ('.' == *p) {
|
|
is_float = true;
|
|
++p;
|
|
SIMDJSON_TRY( parse_decimal_after_separator(src, p, i, exponent) );
|
|
digit_count = int(p - start_digits); // used later to guard against overflows
|
|
}
|
|
if (('e' == *p) || ('E' == *p)) {
|
|
is_float = true;
|
|
++p;
|
|
SIMDJSON_TRY( parse_exponent(src, p, exponent) );
|
|
}
|
|
if (is_float) {
|
|
const bool dirty_end = jsoncharutils::is_not_structural_or_whitespace(*p);
|
|
SIMDJSON_TRY( write_float(src, negative, i, start_digits, digit_count, exponent, writer) );
|
|
if (dirty_end) { return INVALID_NUMBER(src); }
|
|
return SUCCESS;
|
|
}
|
|
|
|
// The longest negative 64-bit number is 19 digits.
|
|
// The longest positive 64-bit number is 20 digits.
|
|
// We do it this way so we don't trigger this branch unless we must.
|
|
size_t longest_digit_count = negative ? 19 : 20;
|
|
if (digit_count > longest_digit_count) { return BIGINT_NUMBER(src); }
|
|
if (digit_count == longest_digit_count) {
|
|
if (negative) {
|
|
// Anything negative above INT64_MAX+1 is invalid
|
|
if (i > uint64_t(INT64_MAX)+1) { return BIGINT_NUMBER(src); }
|
|
WRITE_INTEGER(~i+1, src, writer);
|
|
if (jsoncharutils::is_not_structural_or_whitespace(*p)) { return INVALID_NUMBER(src); }
|
|
return SUCCESS;
|
|
// Positive overflow check:
|
|
// - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the
|
|
// biggest uint64_t.
|
|
// - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX.
|
|
// If we got here, it's a 20 digit number starting with the digit "1".
|
|
// - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller
|
|
// than 1,553,255,926,290,448,384.
|
|
// - That is smaller than the smallest possible 20-digit number the user could write:
|
|
// 10,000,000,000,000,000,000.
|
|
// - Therefore, if the number is positive and lower than that, it's overflow.
|
|
// - The value we are looking at is less than or equal to INT64_MAX.
|
|
//
|
|
} else if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INVALID_NUMBER(src); }
|
|
}
|
|
|
|
// Write unsigned if it does not fit in a signed integer.
|
|
if (i > uint64_t(INT64_MAX)) {
|
|
WRITE_UNSIGNED(i, src, writer);
|
|
} else {
|
|
WRITE_INTEGER(negative ? (~i+1) : i, src, writer);
|
|
}
|
|
if (jsoncharutils::is_not_structural_or_whitespace(*p)) { return INVALID_NUMBER(src); }
|
|
return SUCCESS;
|
|
}
|
|
|
|
// Inlineable functions
|
|
namespace {
|
|
|
|
// This table can be used to characterize the final character of an integer
|
|
// string. For JSON structural character and allowable white space characters,
|
|
// we return SUCCESS. For 'e', '.' and 'E', we return INCORRECT_TYPE. Otherwise
|
|
// we return NUMBER_ERROR.
|
|
// Optimization note: we could easily reduce the size of the table by half (to 128)
|
|
// at the cost of an extra branch.
|
|
// Optimization note: we want the values to use at most 8 bits (not, e.g., 32 bits):
|
|
static_assert(error_code(uint8_t(NUMBER_ERROR))== NUMBER_ERROR, "bad NUMBER_ERROR cast");
|
|
static_assert(error_code(uint8_t(SUCCESS))== SUCCESS, "bad NUMBER_ERROR cast");
|
|
static_assert(error_code(uint8_t(INCORRECT_TYPE))== INCORRECT_TYPE, "bad NUMBER_ERROR cast");
|
|
|
|
// Indexed by the byte that follows the digits of an integer.
// SUCCESS: whitespace and JSON structural characters.
// INCORRECT_TYPE: '.', 'E' and 'e' (the number is a float).
// NUMBER_ERROR: everything else.
// Laid out as 32 rows of 8 entries.
const uint8_t integer_string_finisher[256] = {
    // 0x00-0x07
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    // 0x08-0x0F: '\t' (0x09), '\n' (0x0A), '\r' (0x0D)
    NUMBER_ERROR, SUCCESS, SUCCESS, NUMBER_ERROR, NUMBER_ERROR, SUCCESS, NUMBER_ERROR, NUMBER_ERROR,
    // 0x10-0x17
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    // 0x18-0x1F
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    // 0x20-0x27: ' ' (0x20)
    SUCCESS, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    // 0x28-0x2F: ',' (0x2C), '.' (0x2E)
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, SUCCESS, NUMBER_ERROR, INCORRECT_TYPE, NUMBER_ERROR,
    // 0x30-0x37
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    // 0x38-0x3F: ':' (0x3A)
    NUMBER_ERROR, NUMBER_ERROR, SUCCESS, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    // 0x40-0x47: 'E' (0x45)
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, INCORRECT_TYPE, NUMBER_ERROR, NUMBER_ERROR,
    // 0x48-0x4F
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    // 0x50-0x57
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    // 0x58-0x5F: '[' (0x5B), ']' (0x5D)
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, SUCCESS, NUMBER_ERROR, SUCCESS, NUMBER_ERROR, NUMBER_ERROR,
    // 0x60-0x67: 'e' (0x65)
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, INCORRECT_TYPE, NUMBER_ERROR, NUMBER_ERROR,
    // 0x68-0x6F
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    // 0x70-0x77
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    // 0x78-0x7F: '{' (0x7B), '}' (0x7D)
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, SUCCESS, NUMBER_ERROR, SUCCESS, NUMBER_ERROR, NUMBER_ERROR,
    // 0x80-0x87
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    // 0x88-0x8F
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    // 0x90-0x97
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    // 0x98-0x9F
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    // 0xA0-0xA7
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    // 0xA8-0xAF
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    // 0xB0-0xB7
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    // 0xB8-0xBF
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    // 0xC0-0xC7
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    // 0xC8-0xCF
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    // 0xD0-0xD7
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    // 0xD8-0xDF
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    // 0xE0-0xE7
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    // 0xE8-0xEF
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    // 0xF0-0xF7
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR,
    // 0xF8-0xFF
    NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR, NUMBER_ERROR};
|
|
|
|
// Parse any number from 0 to 18,446,744,073,709,551,615
|
|
simdjson_unused simdjson_inline simdjson_result<uint64_t> parse_unsigned(const uint8_t * const src) noexcept {
|
|
const uint8_t *p = src;
|
|
//
|
|
// Parse the integer part.
|
|
//
|
|
// PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
|
|
const uint8_t *const start_digits = p;
|
|
uint64_t i = 0;
|
|
while (parse_digit(*p, i)) { p++; }
|
|
|
|
// If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
|
|
// Optimization note: size_t is expected to be unsigned.
|
|
size_t digit_count = size_t(p - start_digits);
|
|
// The longest positive 64-bit number is 20 digits.
|
|
// We do it this way so we don't trigger this branch unless we must.
|
|
// Optimization note: the compiler can probably merge
|
|
// ((digit_count == 0) || (digit_count > 20))
|
|
// into a single branch since digit_count is unsigned.
|
|
if ((digit_count == 0) || (digit_count > 20)) { return INCORRECT_TYPE; }
|
|
// Here digit_count > 0.
|
|
if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; }
|
|
// We can do the following...
|
|
// if (!jsoncharutils::is_structural_or_whitespace(*p)) {
|
|
// return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR;
|
|
// }
|
|
// as a single table lookup:
|
|
if (integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); }
|
|
|
|
if (digit_count == 20) {
|
|
// Positive overflow check:
|
|
// - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the
|
|
// biggest uint64_t.
|
|
// - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX.
|
|
// If we got here, it's a 20 digit number starting with the digit "1".
|
|
// - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller
|
|
// than 1,553,255,926,290,448,384.
|
|
// - That is smaller than the smallest possible 20-digit number the user could write:
|
|
// 10,000,000,000,000,000,000.
|
|
// - Therefore, if the number is positive and lower than that, it's overflow.
|
|
// - The value we are looking at is less than or equal to INT64_MAX.
|
|
//
|
|
if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INCORRECT_TYPE; }
|
|
}
|
|
|
|
return i;
|
|
}
|
|
|
|
|
|
// Parse any number from 0 to 18,446,744,073,709,551,615
|
|
// Never read at src_end or beyond
|
|
simdjson_unused simdjson_inline simdjson_result<uint64_t> parse_unsigned(const uint8_t * const src, const uint8_t * const src_end) noexcept {
|
|
const uint8_t *p = src;
|
|
//
|
|
// Parse the integer part.
|
|
//
|
|
// PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
|
|
const uint8_t *const start_digits = p;
|
|
uint64_t i = 0;
|
|
while ((p != src_end) && parse_digit(*p, i)) { p++; }
|
|
|
|
// If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
|
|
// Optimization note: size_t is expected to be unsigned.
|
|
size_t digit_count = size_t(p - start_digits);
|
|
// The longest positive 64-bit number is 20 digits.
|
|
// We do it this way so we don't trigger this branch unless we must.
|
|
// Optimization note: the compiler can probably merge
|
|
// ((digit_count == 0) || (digit_count > 20))
|
|
// into a single branch since digit_count is unsigned.
|
|
if ((digit_count == 0) || (digit_count > 20)) { return INCORRECT_TYPE; }
|
|
// Here digit_count > 0.
|
|
if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; }
|
|
// We can do the following...
|
|
// if (!jsoncharutils::is_structural_or_whitespace(*p)) {
|
|
// return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR;
|
|
// }
|
|
// as a single table lookup:
|
|
if ((p != src_end) && integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); }
|
|
|
|
if (digit_count == 20) {
|
|
// Positive overflow check:
|
|
// - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the
|
|
// biggest uint64_t.
|
|
// - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX.
|
|
// If we got here, it's a 20 digit number starting with the digit "1".
|
|
// - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller
|
|
// than 1,553,255,926,290,448,384.
|
|
// - That is smaller than the smallest possible 20-digit number the user could write:
|
|
// 10,000,000,000,000,000,000.
|
|
// - Therefore, if the number is positive and lower than that, it's overflow.
|
|
// - The value we are looking at is less than or equal to INT64_MAX.
|
|
//
|
|
if (src[0] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INCORRECT_TYPE; }
|
|
}
|
|
|
|
return i;
|
|
}
|
|
|
|
// Parse any number from 0 to 18,446,744,073,709,551,615
|
|
simdjson_unused simdjson_inline simdjson_result<uint64_t> parse_unsigned_in_string(const uint8_t * const src) noexcept {
|
|
const uint8_t *p = src + 1;
|
|
//
|
|
// Parse the integer part.
|
|
//
|
|
// PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
|
|
const uint8_t *const start_digits = p;
|
|
uint64_t i = 0;
|
|
while (parse_digit(*p, i)) { p++; }
|
|
|
|
// If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
|
|
// Optimization note: size_t is expected to be unsigned.
|
|
size_t digit_count = size_t(p - start_digits);
|
|
// The longest positive 64-bit number is 20 digits.
|
|
// We do it this way so we don't trigger this branch unless we must.
|
|
// Optimization note: the compiler can probably merge
|
|
// ((digit_count == 0) || (digit_count > 20))
|
|
// into a single branch since digit_count is unsigned.
|
|
if ((digit_count == 0) || (digit_count > 20)) { return INCORRECT_TYPE; }
|
|
// Here digit_count > 0.
|
|
if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; }
|
|
// We can do the following...
|
|
// if (!jsoncharutils::is_structural_or_whitespace(*p)) {
|
|
// return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR;
|
|
// }
|
|
// as a single table lookup:
|
|
if (*p != '"') { return NUMBER_ERROR; }
|
|
|
|
if (digit_count == 20) {
|
|
// Positive overflow check:
|
|
// - A 20 digit number starting with 2-9 is overflow, because 18,446,744,073,709,551,615 is the
|
|
// biggest uint64_t.
|
|
// - A 20 digit number starting with 1 is overflow if it is less than INT64_MAX.
|
|
// If we got here, it's a 20 digit number starting with the digit "1".
|
|
// - If a 20 digit number starting with 1 overflowed (i*10+digit), the result will be smaller
|
|
// than 1,553,255,926,290,448,384.
|
|
// - That is smaller than the smallest possible 20-digit number the user could write:
|
|
// 10,000,000,000,000,000,000.
|
|
// - Therefore, if the number is positive and lower than that, it's overflow.
|
|
// - The value we are looking at is less than or equal to INT64_MAX.
|
|
//
|
|
// Note: we use src[1] and not src[0] because src[0] is the quote character in this
|
|
// instance.
|
|
if (src[1] != uint8_t('1') || i <= uint64_t(INT64_MAX)) { return INCORRECT_TYPE; }
|
|
}
|
|
|
|
return i;
|
|
}
|
|
|
|
// Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
|
|
simdjson_unused simdjson_inline simdjson_result<int64_t> parse_integer(const uint8_t *src) noexcept {
|
|
//
|
|
// Check for minus sign
|
|
//
|
|
bool negative = (*src == '-');
|
|
const uint8_t *p = src + uint8_t(negative);
|
|
|
|
//
|
|
// Parse the integer part.
|
|
//
|
|
// PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
|
|
const uint8_t *const start_digits = p;
|
|
uint64_t i = 0;
|
|
while (parse_digit(*p, i)) { p++; }
|
|
|
|
// If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
|
|
// Optimization note: size_t is expected to be unsigned.
|
|
size_t digit_count = size_t(p - start_digits);
|
|
// We go from
|
|
// -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
|
|
// so we can never represent numbers that have more than 19 digits.
|
|
size_t longest_digit_count = 19;
|
|
// Optimization note: the compiler can probably merge
|
|
// ((digit_count == 0) || (digit_count > longest_digit_count))
|
|
// into a single branch since digit_count is unsigned.
|
|
if ((digit_count == 0) || (digit_count > longest_digit_count)) { return INCORRECT_TYPE; }
|
|
// Here digit_count > 0.
|
|
if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; }
|
|
// We can do the following...
|
|
// if (!jsoncharutils::is_structural_or_whitespace(*p)) {
|
|
// return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR;
|
|
// }
|
|
// as a single table lookup:
|
|
if(integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); }
|
|
// Negative numbers have can go down to - INT64_MAX - 1 whereas positive numbers are limited to INT64_MAX.
|
|
// Performance note: This check is only needed when digit_count == longest_digit_count but it is
|
|
// so cheap that we might as well always make it.
|
|
if(i > uint64_t(INT64_MAX) + uint64_t(negative)) { return INCORRECT_TYPE; }
|
|
return negative ? (~i+1) : i;
|
|
}
|
|
|
|
// Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
|
|
// Never read at src_end or beyond
|
|
simdjson_unused simdjson_inline simdjson_result<int64_t> parse_integer(const uint8_t * const src, const uint8_t * const src_end) noexcept {
|
|
//
|
|
// Check for minus sign
|
|
//
|
|
if(src == src_end) { return NUMBER_ERROR; }
|
|
bool negative = (*src == '-');
|
|
const uint8_t *p = src + uint8_t(negative);
|
|
|
|
//
|
|
// Parse the integer part.
|
|
//
|
|
// PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
|
|
const uint8_t *const start_digits = p;
|
|
uint64_t i = 0;
|
|
while ((p != src_end) && parse_digit(*p, i)) { p++; }
|
|
|
|
// If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
|
|
// Optimization note: size_t is expected to be unsigned.
|
|
size_t digit_count = size_t(p - start_digits);
|
|
// We go from
|
|
// -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
|
|
// so we can never represent numbers that have more than 19 digits.
|
|
size_t longest_digit_count = 19;
|
|
// Optimization note: the compiler can probably merge
|
|
// ((digit_count == 0) || (digit_count > longest_digit_count))
|
|
// into a single branch since digit_count is unsigned.
|
|
if ((digit_count == 0) || (digit_count > longest_digit_count)) { return INCORRECT_TYPE; }
|
|
// Here digit_count > 0.
|
|
if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; }
|
|
// We can do the following...
|
|
// if (!jsoncharutils::is_structural_or_whitespace(*p)) {
|
|
// return (*p == '.' || *p == 'e' || *p == 'E') ? INCORRECT_TYPE : NUMBER_ERROR;
|
|
// }
|
|
// as a single table lookup:
|
|
if((p != src_end) && integer_string_finisher[*p] != SUCCESS) { return error_code(integer_string_finisher[*p]); }
|
|
// Negative numbers have can go down to - INT64_MAX - 1 whereas positive numbers are limited to INT64_MAX.
|
|
// Performance note: This check is only needed when digit_count == longest_digit_count but it is
|
|
// so cheap that we might as well always make it.
|
|
if(i > uint64_t(INT64_MAX) + uint64_t(negative)) { return INCORRECT_TYPE; }
|
|
return negative ? (~i+1) : i;
|
|
}
|
|
|
|
// Parse any number from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
|
|
simdjson_unused simdjson_inline simdjson_result<int64_t> parse_integer_in_string(const uint8_t *src) noexcept {
|
|
//
|
|
// Check for minus sign
|
|
//
|
|
bool negative = (*(src + 1) == '-');
|
|
src += uint8_t(negative) + 1;
|
|
|
|
//
|
|
// Parse the integer part.
|
|
//
|
|
// PERF NOTE: we don't use is_made_of_eight_digits_fast because large integers like 123456789 are rare
|
|
const uint8_t *const start_digits = src;
|
|
uint64_t i = 0;
|
|
while (parse_digit(*src, i)) { src++; }
|
|
|
|
// If there were no digits, or if the integer starts with 0 and has more than one digit, it's an error.
|
|
// Optimization note: size_t is expected to be unsigned.
|
|
size_t digit_count = size_t(src - start_digits);
|
|
// We go from
|
|
// -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
|
|
// so we can never represent numbers that have more than 19 digits.
|
|
size_t longest_digit_count = 19;
|
|
// Optimization note: the compiler can probably merge
|
|
// ((digit_count == 0) || (digit_count > longest_digit_count))
|
|
// into a single branch since digit_count is unsigned.
|
|
if ((digit_count == 0) || (digit_count > longest_digit_count)) { return INCORRECT_TYPE; }
|
|
// Here digit_count > 0.
|
|
if (('0' == *start_digits) && (digit_count > 1)) { return NUMBER_ERROR; }
|
|
// We can do the following...
|
|
// if (!jsoncharutils::is_structural_or_whitespace(*src)) {
|
|
// return (*src == '.' || *src == 'e' || *src == 'E') ? INCORRECT_TYPE : NUMBER_ERROR;
|
|
// }
|
|
// as a single table lookup:
|
|
if(*src != '"') { return NUMBER_ERROR; }
|
|
// Negative numbers have can go down to - INT64_MAX - 1 whereas positive numbers are limited to INT64_MAX.
|
|
// Performance note: This check is only needed when digit_count == longest_digit_count but it is
|
|
// so cheap that we might as well always make it.
|
|
if(i > uint64_t(INT64_MAX) + uint64_t(negative)) { return INCORRECT_TYPE; }
|
|
return negative ? (~i+1) : i;
|
|
}
|
|
|
|
// Parses a JSON number starting at src as a double. The input is scanned until
// a non-number byte is found, so it must be terminated by such a byte.
//
// Returns INCORRECT_TYPE when there are no integer digits; NUMBER_ERROR for a
// leading zero followed by more digits, a '.' without digits, a missing or
// over-long (20+ digits) exponent, a terminator that is neither structural nor
// whitespace, or when the fallback parser rejects the text.
simdjson_unused simdjson_inline simdjson_result<double> parse_double(const uint8_t * src) noexcept {
  // Step over an optional minus sign; `src` now addresses the first digit.
  const bool negative = (src[0] == '-');
  if (negative) { src++; }

  //
  // Integer part. Every digit (integer and fractional) is accumulated into one
  // 64-bit mantissa.
  //
  uint64_t mantissa = 0;
  const uint8_t *cursor = src;
  cursor += parse_digit(*cursor, mantissa);
  const bool leading_zero = (mantissa == 0);
  while (parse_digit(*cursor, mantissa)) { cursor++; }
  if (cursor == src) { return INCORRECT_TYPE; }                       // no integer digits
  if (leading_zero && (cursor - src != 1)) { return NUMBER_ERROR; }   // 0123: zero must be solo

  //
  // Fractional part. `overflow` records that the mantissa may have wrapped.
  //
  int64_t exponent = 0;
  bool overflow;
  if (simdjson_likely(*cursor == '.')) {
    const uint8_t *const fraction_begin = ++cursor;
    if (!parse_digit(*cursor, mantissa)) { return NUMBER_ERROR; }     // no decimal digits
    do { cursor++; } while (parse_digit(*cursor, mantissa));
    // Each fractional digit shifts the decimal exponent down by one.
    exponent = fraction_begin - cursor;

    // More than 19 digits (not counting the '.') may have overflowed.
    overflow = (cursor - src) > 20;
    if (simdjson_unlikely(overflow && leading_zero)) {
      // Leading zeros of 0.00000... contribute nothing to the mantissa: skip
      // them and recount the remaining digits.
      const uint8_t *significant = src + 2;
      while (*significant == '0') { significant++; }
      overflow = (cursor - significant) > 19;
    }
  } else {
    overflow = (cursor - src) > 19;
  }

  //
  // Exponent part.
  //
  if ((*cursor == 'e') || (*cursor == 'E')) {
    cursor++;
    const bool exp_neg = (*cursor == '-');
    cursor += exp_neg || (*cursor == '+');

    uint64_t exp = 0;
    const uint8_t *const exp_begin = cursor;
    while (parse_digit(*cursor, exp)) { cursor++; }
    // no exp digits, or 20+ exp digits
    const auto exp_digits = cursor - exp_begin;
    if ((exp_digits == 0) || (exp_digits > 19)) { return NUMBER_ERROR; }

    // Unsigned wrap-around is used deliberately to apply the sign.
    const uint64_t signed_exp = exp_neg ? (0 - exp) : exp;
    exponent += signed_exp;
  }

  if (jsoncharutils::is_not_structural_or_whitespace(*cursor)) { return NUMBER_ERROR; }

  // Exponents outside the supported power range also force the slow path.
  overflow = overflow || (exponent < simdjson::internal::smallest_power) || (exponent > simdjson::internal::largest_power);

  //
  // Assemble the double directly when possible; otherwise re-parse the text
  // (starting at the sign, if any) with the fallback routine.
  //
  double d;
  if (simdjson_likely(!overflow)) {
    if (compute_float_64(exponent, mantissa, negative, d)) { return d; }
  }
  if (!parse_float_fallback(src - uint8_t(negative), &d)) {
    return NUMBER_ERROR;
  }
  return d;
}
|
|
|
|
// Reports whether the number text at src begins with a minus sign.
simdjson_unused simdjson_inline bool is_negative(const uint8_t * src) noexcept {
  return src[0] == '-';
}
|
|
|
|
// Reports whether the number text at src is written as an integer: an optional
// '-' followed by one or more digits and then a structural or whitespace byte.
// Returns NUMBER_ERROR when there are no digits at all, and false when the
// digits are followed by anything else (for example '.', 'e' or 'E').
simdjson_unused simdjson_inline simdjson_result<bool> is_integer(const uint8_t * src) noexcept {
  // Skip the optional sign.
  const uint8_t *const digits = (src[0] == '-') ? src + 1 : src;

  // Walk over the run of ASCII digits.
  const uint8_t *cursor = digits;
  while ((*cursor >= '0') && (*cursor <= '9')) { ++cursor; }

  if (cursor == digits) { return NUMBER_ERROR; }
  return jsoncharutils::is_structural_or_whitespace(*cursor) ? true : false;
}
|
|
|
|
// Classifies the number text at src without computing its value.
//
// The text is an optional '-' followed by digits. If the digits are followed by
// a structural or whitespace byte, the number is an integer and is classified
// by comparing its decimal digits against the int64/uint64 limits:
//   - signed_integer   : fits in int64_t,
//   - unsigned_integer : positive, in [9223372036854775808, 18446744073709551616),
//   - big_integer      : anything larger in magnitude.
// Any other terminator (hopefully '.', 'e' or 'E') yields floating_point_number.
// Returns NUMBER_ERROR when there are no digits.
simdjson_unused simdjson_inline simdjson_result<number_type> get_number_type(const uint8_t * src) noexcept {
  // 2**63 and 2**64 in decimal; equal-length digit strings compare
  // lexicographically in the same order as their numeric values.
  const char *const two_to_sixtythree = "9223372036854775808";
  const char *const two_to_sixtyfour = "18446744073709551616";

  const bool negative = (src[0] == '-');
  const uint8_t *const digits = src + uint8_t(negative);

  // Walk over the run of ASCII digits.
  const uint8_t *cursor = digits;
  while ((*cursor >= '0') && (*cursor <= '9')) { ++cursor; }
  const size_t digit_count = size_t(cursor - digits);
  if (digit_count == 0) { return NUMBER_ERROR; }

  // Not terminated like an integer: hopefully we have 'e' or 'E' or '.'.
  if (!jsoncharutils::is_structural_or_whitespace(*cursor)) {
    return number_type::floating_point_number;
  }

  // We have an integer. More than 20 digits never fits in 64 bits.
  if (simdjson_unlikely(digit_count > 20)) {
    return number_type::big_integer;
  }

  if (negative) {
    // A valid negative number is either a signed integer or too big: it is too
    // big with 20 digits, or with 19 digits whose magnitude exceeds 2**63.
    if (simdjson_unlikely(digit_count > 19)) { return number_type::big_integer; }
    if (simdjson_unlikely((digit_count == 19) && (memcmp(digits, two_to_sixtythree, 19) > 0))) {
      return number_type::big_integer;
    }
    return number_type::signed_integer;
  }

  if (digit_count == 20) {
    // 20 digits: big when >= 2**64, otherwise necessarily beyond INT64_MAX.
    return (memcmp(digits, two_to_sixtyfour, 20) >= 0) ? number_type::big_integer
                                                       : number_type::unsigned_integer;
  }

  // Values larger than or equal to 9223372036854775808 are unsigned integers;
  // everything else is a signed integer.
  if ((digit_count == 19) && (memcmp(digits, two_to_sixtythree, 19) >= 0)) {
    return number_type::unsigned_integer;
  }
  return number_type::signed_integer;
}
|
|
|
|
// Never read at src_end or beyond
|
|
simdjson_unused simdjson_inline simdjson_result<double> parse_double(const uint8_t * src, const uint8_t * const src_end) noexcept {
|
|
if(src == src_end) { return NUMBER_ERROR; }
|
|
//
|
|
// Check for minus sign
|
|
//
|
|
bool negative = (*src == '-');
|
|
src += uint8_t(negative);
|
|
|
|
//
|
|
// Parse the integer part.
|
|
//
|
|
uint64_t i = 0;
|
|
const uint8_t *p = src;
|
|
if(p == src_end) { return NUMBER_ERROR; }
|
|
p += parse_digit(*p, i);
|
|
bool leading_zero = (i == 0);
|
|
while ((p != src_end) && parse_digit(*p, i)) { p++; }
|
|
// no integer digits, or 0123 (zero must be solo)
|
|
if ( p == src ) { return INCORRECT_TYPE; }
|
|
if ( (leading_zero && p != src+1)) { return NUMBER_ERROR; }
|
|
|
|
//
|
|
// Parse the decimal part.
|
|
//
|
|
int64_t exponent = 0;
|
|
bool overflow;
|
|
if (simdjson_likely((p != src_end) && (*p == '.'))) {
|
|
p++;
|
|
const uint8_t *start_decimal_digits = p;
|
|
if ((p == src_end) || !parse_digit(*p, i)) { return NUMBER_ERROR; } // no decimal digits
|
|
p++;
|
|
while ((p != src_end) && parse_digit(*p, i)) { p++; }
|
|
exponent = -(p - start_decimal_digits);
|
|
|
|
// Overflow check. More than 19 digits (minus the decimal) may be overflow.
|
|
overflow = p-src-1 > 19;
|
|
if (simdjson_unlikely(overflow && leading_zero)) {
|
|
// Skip leading 0.00000 and see if it still overflows
|
|
const uint8_t *start_digits = src + 2;
|
|
while (*start_digits == '0') { start_digits++; }
|
|
overflow = start_digits-src > 19;
|
|
}
|
|
} else {
|
|
overflow = p-src > 19;
|
|
}
|
|
|
|
//
|
|
// Parse the exponent
|
|
//
|
|
if ((p != src_end) && (*p == 'e' || *p == 'E')) {
|
|
p++;
|
|
if(p == src_end) { return NUMBER_ERROR; }
|
|
bool exp_neg = *p == '-';
|
|
p += exp_neg || *p == '+';
|
|
|
|
uint64_t exp = 0;
|
|
const uint8_t *start_exp_digits = p;
|
|
while ((p != src_end) && parse_digit(*p, exp)) { p++; }
|
|
// no exp digits, or 20+ exp digits
|
|
if (p-start_exp_digits == 0 || p-start_exp_digits > 19) { return NUMBER_ERROR; }
|
|
|
|
exponent += exp_neg ? 0-exp : exp;
|
|
}
|
|
|
|
if ((p != src_end) && jsoncharutils::is_not_structural_or_whitespace(*p)) { return NUMBER_ERROR; }
|
|
|
|
overflow = overflow || exponent < simdjson::internal::smallest_power || exponent > simdjson::internal::largest_power;
|
|
|
|
//
|
|
// Assemble (or slow-parse) the float
|
|
//
|
|
double d;
|
|
if (simdjson_likely(!overflow)) {
|
|
if (compute_float_64(exponent, i, negative, d)) { return d; }
|
|
}
|
|
if (!parse_float_fallback(src - uint8_t(negative), src_end, &d)) {
|
|
return NUMBER_ERROR;
|
|
}
|
|
return d;
|
|
}
|
|
|
|
// Parses a JSON number wrapped in a JSON string as a double. `src` points at
// the opening quote; the number starts at src[1] and must be terminated by a
// closing '"'.
//
// Returns INCORRECT_TYPE when there are no integer digits; NUMBER_ERROR for a
// leading zero followed by more digits, a '.' without digits, a missing or
// over-long (20+ digits) exponent, a terminator other than '"', or when the
// fallback parser rejects the text.
simdjson_unused simdjson_inline simdjson_result<double> parse_double_in_string(const uint8_t * src) noexcept {
  // Step over the opening quote and an optional minus sign; `src` now addresses
  // the first digit.
  const bool negative = (src[1] == '-');
  src += 1 + uint8_t(negative);

  //
  // Integer part. Every digit (integer and fractional) is accumulated into one
  // 64-bit mantissa.
  //
  uint64_t mantissa = 0;
  const uint8_t *cursor = src;
  cursor += parse_digit(*cursor, mantissa);
  const bool leading_zero = (mantissa == 0);
  while (parse_digit(*cursor, mantissa)) { cursor++; }
  if (cursor == src) { return INCORRECT_TYPE; }                       // no integer digits
  if (leading_zero && (cursor - src != 1)) { return NUMBER_ERROR; }   // 0123: zero must be solo

  //
  // Fractional part. `overflow` records that the mantissa may have wrapped.
  //
  int64_t exponent = 0;
  bool overflow;
  if (simdjson_likely(*cursor == '.')) {
    const uint8_t *const fraction_begin = ++cursor;
    if (!parse_digit(*cursor, mantissa)) { return NUMBER_ERROR; }     // no decimal digits
    do { cursor++; } while (parse_digit(*cursor, mantissa));
    // Each fractional digit shifts the decimal exponent down by one.
    exponent = fraction_begin - cursor;

    // More than 19 digits (not counting the '.') may have overflowed.
    overflow = (cursor - src) > 20;
    if (simdjson_unlikely(overflow && leading_zero)) {
      // Leading zeros of 0.00000... contribute nothing to the mantissa: skip
      // them and recount the remaining digits.
      const uint8_t *significant = src + 2;
      while (*significant == '0') { significant++; }
      overflow = (cursor - significant) > 19;
    }
  } else {
    overflow = (cursor - src) > 19;
  }

  //
  // Exponent part.
  //
  if ((*cursor == 'e') || (*cursor == 'E')) {
    cursor++;
    const bool exp_neg = (*cursor == '-');
    cursor += exp_neg || (*cursor == '+');

    uint64_t exp = 0;
    const uint8_t *const exp_begin = cursor;
    while (parse_digit(*cursor, exp)) { cursor++; }
    // no exp digits, or 20+ exp digits
    const auto exp_digits = cursor - exp_begin;
    if ((exp_digits == 0) || (exp_digits > 19)) { return NUMBER_ERROR; }

    // Unsigned wrap-around is used deliberately to apply the sign.
    const uint64_t signed_exp = exp_neg ? (0 - exp) : exp;
    exponent += signed_exp;
  }

  // Inside a string the only acceptable terminator is the closing quote.
  if (*cursor != '"') { return NUMBER_ERROR; }

  // Exponents outside the supported power range also force the slow path.
  overflow = overflow || (exponent < simdjson::internal::smallest_power) || (exponent > simdjson::internal::largest_power);

  //
  // Assemble the double directly when possible; otherwise re-parse the text
  // (starting at the sign, if any) with the fallback routine.
  //
  double d;
  if (simdjson_likely(!overflow)) {
    if (compute_float_64(exponent, mantissa, negative, d)) { return d; }
  }
  if (!parse_float_fallback(src - uint8_t(negative), &d)) {
    return NUMBER_ERROR;
  }
  return d;
}
|
|
|
|
} // unnamed namespace
|
|
#endif // SIMDJSON_SKIPNUMBERPARSING
|
|
|
|
} // namespace numberparsing
|
|
|
|
inline std::ostream& operator<<(std::ostream& out, number_type type) noexcept {
|
|
switch (type) {
|
|
case number_type::signed_integer: out << "integer in [-9223372036854775808,9223372036854775808)"; break;
|
|
case number_type::unsigned_integer: out << "unsigned integer in [9223372036854775808,18446744073709551616)"; break;
|
|
case number_type::floating_point_number: out << "floating-point number (binary64)"; break;
|
|
case number_type::big_integer: out << "big integer"; break;
|
|
default: SIMDJSON_UNREACHABLE();
|
|
}
|
|
return out;
|
|
}
|
|
|
|
} // namespace fallback
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_GENERIC_NUMBERPARSING_H
|
|
/* end file simdjson/generic/numberparsing.h for fallback */
|
|
|
|
/* including simdjson/generic/implementation_simdjson_result_base-inl.h for fallback: #include "simdjson/generic/implementation_simdjson_result_base-inl.h" */
|
|
/* begin file simdjson/generic/implementation_simdjson_result_base-inl.h for fallback */
|
|
#ifndef SIMDJSON_GENERIC_IMPLEMENTATION_SIMDJSON_RESULT_BASE_INL_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_GENERIC_IMPLEMENTATION_SIMDJSON_RESULT_BASE_INL_H */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/generic/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/generic/implementation_simdjson_result_base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace fallback {
|
|
|
|
//
|
|
// internal::implementation_simdjson_result_base<T> inline implementation
|
|
//
|
|
|
|
template<typename T>
|
|
simdjson_inline void implementation_simdjson_result_base<T>::tie(T &value, error_code &error) && noexcept {
|
|
error = this->second;
|
|
if (!error) {
|
|
value = std::forward<implementation_simdjson_result_base<T>>(*this).first;
|
|
}
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_warn_unused simdjson_inline error_code implementation_simdjson_result_base<T>::get(T &value) && noexcept {
|
|
error_code error;
|
|
std::forward<implementation_simdjson_result_base<T>>(*this).tie(value, error);
|
|
return error;
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_inline error_code implementation_simdjson_result_base<T>::error() const noexcept {
|
|
return this->second;
|
|
}
|
|
|
|
#if SIMDJSON_EXCEPTIONS
|
|
|
|
template<typename T>
|
|
simdjson_inline T& implementation_simdjson_result_base<T>::value() & noexcept(false) {
|
|
if (error()) { throw simdjson_error(error()); }
|
|
return this->first;
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_inline T&& implementation_simdjson_result_base<T>::value() && noexcept(false) {
|
|
return std::forward<implementation_simdjson_result_base<T>>(*this).take_value();
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_inline T&& implementation_simdjson_result_base<T>::take_value() && noexcept(false) {
|
|
if (error()) { throw simdjson_error(error()); }
|
|
return std::forward<T>(this->first);
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_inline implementation_simdjson_result_base<T>::operator T&&() && noexcept(false) {
|
|
return std::forward<implementation_simdjson_result_base<T>>(*this).take_value();
|
|
}
|
|
|
|
#endif // SIMDJSON_EXCEPTIONS
|
|
|
|
template<typename T>
|
|
simdjson_inline const T& implementation_simdjson_result_base<T>::value_unsafe() const& noexcept {
|
|
return this->first;
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_inline T& implementation_simdjson_result_base<T>::value_unsafe() & noexcept {
|
|
return this->first;
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_inline T&& implementation_simdjson_result_base<T>::value_unsafe() && noexcept {
|
|
return std::forward<T>(this->first);
|
|
}
|
|
|
|
template<typename T>
|
|
simdjson_inline implementation_simdjson_result_base<T>::implementation_simdjson_result_base(T &&value, error_code error) noexcept
|
|
: first{std::forward<T>(value)}, second{error} {}
|
|
template<typename T>
|
|
simdjson_inline implementation_simdjson_result_base<T>::implementation_simdjson_result_base(error_code error) noexcept
|
|
: implementation_simdjson_result_base(T{}, error) {}
|
|
template<typename T>
|
|
simdjson_inline implementation_simdjson_result_base<T>::implementation_simdjson_result_base(T &&value) noexcept
|
|
: implementation_simdjson_result_base(std::forward<T>(value), SUCCESS) {}
|
|
|
|
} // namespace fallback
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_GENERIC_IMPLEMENTATION_SIMDJSON_RESULT_BASE_INL_H
|
|
/* end file simdjson/generic/implementation_simdjson_result_base-inl.h for fallback */
|
|
/* end file simdjson/generic/amalgamated.h for fallback */
|
|
/* including simdjson/fallback/end.h: #include "simdjson/fallback/end.h" */
|
|
/* begin file simdjson/fallback/end.h */
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/fallback/base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
/* undefining SIMDJSON_IMPLEMENTATION from "fallback" */
|
|
#undef SIMDJSON_IMPLEMENTATION
|
|
/* end file simdjson/fallback/end.h */
|
|
|
|
#endif // SIMDJSON_FALLBACK_H
|
|
/* end file simdjson/fallback.h */
|
|
/* including simdjson/fallback/implementation.h: #include <simdjson/fallback/implementation.h> */
|
|
/* begin file simdjson/fallback/implementation.h */
|
|
#ifndef SIMDJSON_FALLBACK_IMPLEMENTATION_H
|
|
#define SIMDJSON_FALLBACK_IMPLEMENTATION_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/fallback/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/implementation.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace fallback {
|
|
|
|
/**
|
|
* @private
|
|
*/
|
|
class implementation final : public simdjson::implementation {
|
|
public:
|
|
simdjson_inline implementation() : simdjson::implementation(
|
|
"fallback",
|
|
"Generic fallback implementation",
|
|
0
|
|
) {}
|
|
simdjson_warn_unused error_code create_dom_parser_implementation(
|
|
size_t capacity,
|
|
size_t max_length,
|
|
std::unique_ptr<simdjson::internal::dom_parser_implementation>& dst
|
|
) const noexcept final;
|
|
simdjson_warn_unused error_code minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept final;
|
|
simdjson_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final;
|
|
};
|
|
|
|
} // namespace fallback
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_FALLBACK_IMPLEMENTATION_H
|
|
/* end file simdjson/fallback/implementation.h */
|
|
|
|
/* including simdjson/fallback/begin.h: #include <simdjson/fallback/begin.h> */
|
|
/* begin file simdjson/fallback/begin.h */
|
|
/* defining SIMDJSON_IMPLEMENTATION to "fallback" */
|
|
#define SIMDJSON_IMPLEMENTATION fallback
|
|
/* including simdjson/fallback/base.h: #include "simdjson/fallback/base.h" */
|
|
/* begin file simdjson/fallback/base.h */
|
|
#ifndef SIMDJSON_FALLBACK_BASE_H
|
|
#define SIMDJSON_FALLBACK_BASE_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
/**
|
|
* Fallback implementation (runs on any machine).
|
|
*/
|
|
namespace fallback {
|
|
|
|
class implementation;
|
|
|
|
} // namespace fallback
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_FALLBACK_BASE_H
|
|
/* end file simdjson/fallback/base.h */
|
|
/* including simdjson/fallback/bitmanipulation.h: #include "simdjson/fallback/bitmanipulation.h" */
|
|
/* begin file simdjson/fallback/bitmanipulation.h */
|
|
#ifndef SIMDJSON_FALLBACK_BITMANIPULATION_H
|
|
#define SIMDJSON_FALLBACK_BITMANIPULATION_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/fallback/base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace fallback {
|
|
namespace {
|
|
|
|
#if defined(_MSC_VER) && !defined(_M_ARM64) && !defined(_M_X64)
|
|
// 64-bit forward bit scan built from the 32-bit _BitScanForward intrinsic, for
// MSVC targets that are neither ARM64 nor x64 (see the enclosing #if).
// Writes the index of the lowest set bit of `x` to *ret and returns non-zero
// when `x` has any bit set. When `x` is zero, *ret is not meaningful.
static inline unsigned char _BitScanForward64(unsigned long* ret, uint64_t x) {
  const unsigned long low_half = static_cast<unsigned long>(x);
  const unsigned long high_half = static_cast<unsigned long>(x >> 32);
  unsigned long index;
  if (low_half) {
    // The lowest set bit lives in the low 32 bits.
    _BitScanForward(&index, low_half);
    *ret = index;
  } else {
    // Otherwise it is in the high 32 bits, offset by 32.
    _BitScanForward(&index, high_half);
    *ret = 32 + index;
  }
  return x != 0;
}
|
|
// 64-bit reverse bit scan built from the 32-bit _BitScanReverse intrinsic, for
// MSVC targets that are neither ARM64 nor x64 (see the enclosing #if).
// Writes the index of the highest set bit of `x` to *ret and returns non-zero
// when `x` has any bit set. When `x` is zero, *ret is not meaningful.
static unsigned char _BitScanReverse64(unsigned long* ret, uint64_t x) {
  const unsigned long low_half = static_cast<unsigned long>(x);
  const unsigned long high_half = static_cast<unsigned long>(x >> 32);
  unsigned long index;
  if (high_half) {
    // The highest set bit lives in the high 32 bits, offset by 32.
    _BitScanReverse(&index, high_half);
    *ret = index + 32;
  } else {
    // Otherwise it is in the low 32 bits.
    _BitScanReverse(&index, low_half);
    *ret = index;
  }
  return x != 0;
}
|
|
#endif
|
|
|
|
/* result might be undefined when input_num is zero */
|
|
simdjson_inline int leading_zeroes(uint64_t input_num) {
|
|
#ifdef _MSC_VER
|
|
unsigned long leading_zero = 0;
|
|
// Search the mask data from most significant bit (MSB)
|
|
// to least significant bit (LSB) for a set bit (1).
|
|
if (_BitScanReverse64(&leading_zero, input_num))
|
|
return (int)(63 - leading_zero);
|
|
else
|
|
return 64;
|
|
#else
|
|
return __builtin_clzll(input_num);
|
|
#endif// _MSC_VER
|
|
}
|
|
|
|
} // unnamed namespace
|
|
} // namespace fallback
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_FALLBACK_BITMANIPULATION_H
|
|
/* end file simdjson/fallback/bitmanipulation.h */
|
|
/* including simdjson/fallback/stringparsing_defs.h: #include "simdjson/fallback/stringparsing_defs.h" */
|
|
/* begin file simdjson/fallback/stringparsing_defs.h */
|
|
#ifndef SIMDJSON_FALLBACK_STRINGPARSING_DEFS_H
|
|
#define SIMDJSON_FALLBACK_STRINGPARSING_DEFS_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/fallback/base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace fallback {
|
|
namespace {
|
|
|
|
// Holds backslashes and quotes locations.
|
|
struct backslash_and_quote {
|
|
public:
|
|
static constexpr uint32_t BYTES_PROCESSED = 1;
|
|
simdjson_inline static backslash_and_quote copy_and_find(const uint8_t *src, uint8_t *dst);
|
|
|
|
simdjson_inline bool has_quote_first() { return c == '"'; }
|
|
simdjson_inline bool has_backslash() { return c == '\\'; }
|
|
simdjson_inline int quote_index() { return c == '"' ? 0 : 1; }
|
|
simdjson_inline int backslash_index() { return c == '\\' ? 0 : 1; }
|
|
|
|
uint8_t c;
|
|
}; // struct backslash_and_quote
|
|
|
|
// Copies the byte at src to dst and returns it wrapped in a
// backslash_and_quote so that the caller can test it.
simdjson_inline backslash_and_quote backslash_and_quote::copy_and_find(const uint8_t *src, uint8_t *dst) {
  // The store is unconditional: unwanted bytes can be overwritten later.
  const uint8_t byte = *src;
  *dst = byte;
  return { byte };
}
|
|
|
|
} // unnamed namespace
|
|
} // namespace fallback
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_FALLBACK_STRINGPARSING_DEFS_H
|
|
/* end file simdjson/fallback/stringparsing_defs.h */
|
|
/* including simdjson/fallback/numberparsing_defs.h: #include "simdjson/fallback/numberparsing_defs.h" */
|
|
/* begin file simdjson/fallback/numberparsing_defs.h */
|
|
#ifndef SIMDJSON_FALLBACK_NUMBERPARSING_DEFS_H
|
|
#define SIMDJSON_FALLBACK_NUMBERPARSING_DEFS_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/fallback/base.h" */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/internal/numberparsing_tables.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
#include <cstring>
|
|
|
|
#ifdef JSON_TEST_NUMBERS // for unit testing
|
|
// Unit-testing hooks: these are only declared when JSON_TEST_NUMBERS is defined
// (see the enclosing #ifdef). Their definitions are not part of this section.
// NOTE(review): presumably `buf` points at the number text being parsed and
// `result` is the parsed value -- confirm against the test harness.

// Hook for a number that failed to parse.
void found_invalid_number(const uint8_t *buf);
|
|
// Hook for a number parsed as a signed 64-bit integer.
void found_integer(int64_t result, const uint8_t *buf);
|
|
// Hook for a number parsed as an unsigned 64-bit integer.
void found_unsigned_integer(uint64_t result, const uint8_t *buf);
|
|
// Hook for a number parsed as a double.
void found_float(double result, const uint8_t *buf);
|
|
#endif
|
|
|
|
namespace simdjson {
|
|
namespace fallback {
|
|
namespace numberparsing {
|
|
|
|
// credit: https://johnnylee-sde.github.io/Fast-numeric-string-to-int/
|
|
/** @private */
|
|
static simdjson_inline uint32_t parse_eight_digits_unrolled(const char *chars) {
|
|
uint64_t val;
|
|
memcpy(&val, chars, sizeof(uint64_t));
|
|
val = (val & 0x0F0F0F0F0F0F0F0F) * 2561 >> 8;
|
|
val = (val & 0x00FF00FF00FF00FF) * 6553601 >> 16;
|
|
return uint32_t((val & 0x0000FFFF0000FFFF) * 42949672960001 >> 32);
|
|
}
|
|
|
|
/** @private */
|
|
static simdjson_inline uint32_t parse_eight_digits_unrolled(const uint8_t *chars) {
|
|
return parse_eight_digits_unrolled(reinterpret_cast<const char *>(chars));
|
|
}
|
|
|
|
#if SIMDJSON_IS_32BITS // _umul128 for x86, arm
|
|
// this is a slow emulation routine for 32-bit
|
|
//
|
|
// Widening 32x32 -> 64-bit unsigned multiply (emulation helper for 32-bit builds).
static simdjson_inline uint64_t __emulu(uint32_t x, uint32_t y) {
  const uint64_t wide_y = static_cast<uint64_t>(y);
  return x * wide_y;
}
|
|
/**
 * Emulated 64x64 -> 128-bit unsigned multiply built from four 32x32 products.
 *
 * Writing ab = a*2^32 + b and cd = c*2^32 + d, the partial products a*d, b*d,
 * b*c and a*c are combined with explicit carry tracking.
 *
 * @param ab first factor
 * @param cd second factor
 * @param hi receives the upper 64 bits of the product
 * @return the lower 64 bits of the product
 */
static simdjson_inline uint64_t _umul128(uint64_t ab, uint64_t cd, uint64_t *hi) {
  const uint32_t a = static_cast<uint32_t>(ab >> 32);
  const uint32_t b = static_cast<uint32_t>(ab);
  const uint32_t c = static_cast<uint32_t>(cd >> 32);
  const uint32_t d = static_cast<uint32_t>(cd);

  const uint64_t ad = __emulu(a, d);
  const uint64_t bd = __emulu(b, d);
  const uint64_t adbc = ad + __emulu(b, c);
  // The sum of the two middle products wrapped around if it is now below ad.
  const uint64_t adbc_carry = (adbc < ad) ? 1 : 0;
  const uint64_t lo = bd + (adbc << 32);
  // Adding the shifted middle term to bd wrapped around if lo is below bd.
  const uint64_t lo_carry = (lo < bd) ? 1 : 0;

  *hi = __emulu(a, c) + (adbc >> 32) + (adbc_carry << 32) + lo_carry;
  return lo;
}
|
|
#endif
|
|
|
|
/** @private */
|
|
simdjson_inline internal::value128 full_multiplication(uint64_t value1, uint64_t value2) {
|
|
internal::value128 answer;
|
|
#if SIMDJSON_REGULAR_VISUAL_STUDIO || SIMDJSON_IS_32BITS
|
|
#if SIMDJSON_IS_ARM64
|
|
// ARM64 has native support for 64-bit multiplications, no need to emultate
|
|
answer.high = __umulh(value1, value2);
|
|
answer.low = value1 * value2;
|
|
#else
|
|
answer.low = _umul128(value1, value2, &answer.high); // _umul128 not available on ARM64
|
|
#endif // SIMDJSON_IS_ARM64
|
|
#else // SIMDJSON_REGULAR_VISUAL_STUDIO || SIMDJSON_IS_32BITS
|
|
__uint128_t r = (static_cast<__uint128_t>(value1)) * value2;
|
|
answer.low = uint64_t(r);
|
|
answer.high = uint64_t(r >> 64);
|
|
#endif
|
|
return answer;
|
|
}
|
|
|
|
} // namespace numberparsing
|
|
} // namespace fallback
|
|
} // namespace simdjson
|
|
|
|
#define SIMDJSON_SWAR_NUMBER_PARSING 1
|
|
|
|
#endif // SIMDJSON_FALLBACK_NUMBERPARSING_DEFS_H
|
|
/* end file simdjson/fallback/numberparsing_defs.h */
|
|
/* end file simdjson/fallback/begin.h */
|
|
/* including generic/stage1/find_next_document_index.h for fallback: #include <generic/stage1/find_next_document_index.h> */
|
|
/* begin file generic/stage1/find_next_document_index.h for fallback */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_FIND_NEXT_DOCUMENT_INDEX_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_FIND_NEXT_DOCUMENT_INDEX_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage1/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/generic/dom_parser_implementation.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace fallback {
|
|
namespace {
|
|
namespace stage1 {
|
|
|
|
/**
|
|
* This algorithm is used to quickly identify the last structural position that
|
|
* makes up a complete document.
|
|
*
|
|
* It does this by going backwards and finding the last *document boundary* (a
|
|
* place where one value follows another without a comma between them). If the
|
|
* last document (the characters after the boundary) has an equal number of
|
|
* start and end brackets, it is considered complete.
|
|
*
|
|
* Simply put, we iterate over the structural characters, starting from
|
|
* the end. We consider that we found the end of a JSON document when the
|
|
* first element of the pair is NOT one of these characters: '{' '[' ':' ','
|
|
* and when the second element is NOT one of these characters: '}' ']' ':' ','.
|
|
*
|
|
* This simple comparison works most of the time, but it does not cover cases
|
|
* where the batch's structural indexes contain a perfect amount of documents.
|
|
* In such a case, we do not have access to the structural index which follows
|
|
* the last document, therefore, we do not have access to the second element in
|
|
* the pair, and that means we cannot identify the last document. To fix this
|
|
* issue, we keep a count of the open and closed curly/square braces we found
|
|
* while searching for the pair. When we find a pair AND the count of open and
|
|
* closed curly/square braces is the same, we know that we just passed a
|
|
* complete document, therefore the last json buffer location is the end of the
|
|
* batch.
|
|
*/
|
|
/**
 * Scan the structural indexes backwards for the last document boundary.
 *
 * @return parser.n_structural_indexes when the trailing document has balanced
 *         brackets, the structural position of the boundary when the trailing
 *         document is incomplete, or 0 when there are no structurals or no
 *         complete document could be identified.
 */
simdjson_inline uint32_t find_next_document_index(dom_parser_implementation &parser) {
  if (parser.n_structural_indexes == 0) { return 0; }

  // Net bracket depth seen so far (openers add one, closers subtract one).
  int array_depth = 0;
  int object_depth = 0;

  for (auto pos = parser.n_structural_indexes - 1; pos > 0; pos--) {
    const auto current = parser.buf[parser.structural_indexes[pos]];

    // Separators and closers can never be the first token of a document.
    if (current == ':' || current == ',') { continue; }
    if (current == '}') { object_depth--; continue; }
    if (current == ']') { array_depth--; continue; }

    if (current == '{') {
      object_depth++;
    } else if (current == '[') {
      array_depth++;
    }

    // A token preceded by an opener or a separator continues the same document.
    const auto previous = parser.buf[parser.structural_indexes[pos - 1]];
    if (previous == '{' || previous == '[' || previous == ':' || previous == ',') {
      continue;
    }

    // Boundary found. Balanced brackets mean the last document is complete,
    // so the next document will appear after everything we have.
    if (array_depth == 0 && object_depth == 0) {
      return parser.n_structural_indexes;
    }
    // Otherwise the last document is incomplete; report the boundary position.
    return pos;
  }

  // No boundary found: fold in the very first structural and check the balance.
  switch (parser.buf[parser.structural_indexes[0]]) {
    case '{': object_depth++; break;
    case '}': object_depth--; break;
    case '[': array_depth++; break;
    case ']': array_depth--; break;
  }
  if (array_depth == 0 && object_depth == 0) {
    // We have a complete document.
    return parser.n_structural_indexes;
  }
  return 0;
}
|
|
|
|
} // namespace stage1
|
|
} // unnamed namespace
|
|
} // namespace fallback
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE1_FIND_NEXT_DOCUMENT_INDEX_H
|
|
/* end file generic/stage1/find_next_document_index.h for fallback */
|
|
/* including generic/stage2/stringparsing.h for fallback: #include <generic/stage2/stringparsing.h> */
|
|
/* begin file generic/stage2/stringparsing.h for fallback */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE2_STRINGPARSING_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE2_STRINGPARSING_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/generic/jsoncharutils.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
// This file contains the common code every implementation uses
|
|
// It is intended to be included multiple times and compiled multiple times
|
|
|
|
namespace simdjson {
|
|
namespace fallback {
|
|
namespace {
|
|
/// @private
|
|
namespace stringparsing {
|
|
|
|
// begin copypasta
|
|
// Lookup table for single-character escapes, indexed by the byte that follows
// the backslash. A zero entry means "not a valid simple escape".
//   " \ /  map to themselves
//   b f n r t  map to backspace, formfeed, newline, carriage return, tab
// 'u' is deliberately absent: unicode escapes are handled separately.
static const uint8_t escape_map[256] = {
  0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0,   // 0x0.
  0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0,   // 0x1.
  0, 0, '"',  0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    '/', // 0x2.
  0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0,   // 0x3.

  0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0,   // 0x4.
  0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, '\\', 0, 0,    0,   // 0x5.
  0, 0, '\b', 0, 0,    0, '\f', 0, 0, 0, 0, 0, 0,    0, '\n', 0,   // 0x6.
  0, 0, '\r', 0, '\t', 0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0,   // 0x7.

  0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0,   // 0x8.
  0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0,   // 0x9.
  0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0,   // 0xa.
  0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0,   // 0xb.

  0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0,   // 0xc.
  0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0,   // 0xd.
  0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0,   // 0xe.
  0, 0, 0,    0, 0,    0, 0,    0, 0, 0, 0, 0, 0,    0, 0,    0,   // 0xf.
};
|
|
|
|
// handle a unicode codepoint
|
|
// write appropriate values into dest
|
|
// src will advance 6 bytes or 12 bytes
|
|
// dest will advance a variable amount (return via pointer)
|
|
// return true if the unicode codepoint was valid
|
|
// We work in little-endian then swap at write time
|
|
simdjson_warn_unused
|
|
simdjson_inline bool handle_unicode_codepoint(const uint8_t **src_ptr,
|
|
uint8_t **dst_ptr, bool allow_replacement) {
|
|
// Use the default Unicode Character 'REPLACEMENT CHARACTER' (U+FFFD)
|
|
constexpr uint32_t substitution_code_point = 0xfffd;
|
|
// jsoncharutils::hex_to_u32_nocheck fills high 16 bits of the return value with 1s if the
|
|
// conversion is not valid; we defer the check for this to inside the
|
|
// multilingual plane check.
|
|
uint32_t code_point = jsoncharutils::hex_to_u32_nocheck(*src_ptr + 2);
|
|
*src_ptr += 6;
|
|
|
|
// If we found a high surrogate, we must
|
|
// check for low surrogate for characters
|
|
// outside the Basic
|
|
// Multilingual Plane.
|
|
if (code_point >= 0xd800 && code_point < 0xdc00) {
|
|
const uint8_t *src_data = *src_ptr;
|
|
/* Compiler optimizations convert this to a single 16-bit load and compare on most platforms */
|
|
if (((src_data[0] << 8) | src_data[1]) != ((static_cast<uint8_t> ('\\') << 8) | static_cast<uint8_t> ('u'))) {
|
|
if(!allow_replacement) { return false; }
|
|
code_point = substitution_code_point;
|
|
} else {
|
|
uint32_t code_point_2 = jsoncharutils::hex_to_u32_nocheck(src_data + 2);
|
|
|
|
// We have already checked that the high surrogate is valid and
|
|
// (code_point - 0xd800) < 1024.
|
|
//
|
|
// Check that code_point_2 is in the range 0xdc00..0xdfff
|
|
// and that code_point_2 was parsed from valid hex.
|
|
uint32_t low_bit = code_point_2 - 0xdc00;
|
|
if (low_bit >> 10) {
|
|
if(!allow_replacement) { return false; }
|
|
code_point = substitution_code_point;
|
|
} else {
|
|
code_point = (((code_point - 0xd800) << 10) | low_bit) + 0x10000;
|
|
*src_ptr += 6;
|
|
}
|
|
|
|
}
|
|
} else if (code_point >= 0xdc00 && code_point <= 0xdfff) {
|
|
// If we encounter a low surrogate (not preceded by a high surrogate)
|
|
// then we have an error.
|
|
if(!allow_replacement) { return false; }
|
|
code_point = substitution_code_point;
|
|
}
|
|
size_t offset = jsoncharutils::codepoint_to_utf8(code_point, *dst_ptr);
|
|
*dst_ptr += offset;
|
|
return offset > 0;
|
|
}
|
|
|
|
|
|
// handle a unicode codepoint using the wobbly convention
|
|
// https://simonsapin.github.io/wtf-8/
|
|
// write appropriate values into dest
|
|
// src will advance 6 bytes or 12 bytes
|
|
// dest will advance a variable amount (return via pointer)
|
|
// return true if the unicode codepoint was valid
|
|
// We work in little-endian then swap at write time
|
|
simdjson_warn_unused
|
|
simdjson_inline bool handle_unicode_codepoint_wobbly(const uint8_t **src_ptr,
|
|
uint8_t **dst_ptr) {
|
|
// It is not ideal that this function is nearly identical to handle_unicode_codepoint.
|
|
//
|
|
// jsoncharutils::hex_to_u32_nocheck fills high 16 bits of the return value with 1s if the
|
|
// conversion is not valid; we defer the check for this to inside the
|
|
// multilingual plane check.
|
|
uint32_t code_point = jsoncharutils::hex_to_u32_nocheck(*src_ptr + 2);
|
|
*src_ptr += 6;
|
|
// If we found a high surrogate, we must
|
|
// check for low surrogate for characters
|
|
// outside the Basic
|
|
// Multilingual Plane.
|
|
if (code_point >= 0xd800 && code_point < 0xdc00) {
|
|
const uint8_t *src_data = *src_ptr;
|
|
/* Compiler optimizations convert this to a single 16-bit load and compare on most platforms */
|
|
if (((src_data[0] << 8) | src_data[1]) == ((static_cast<uint8_t> ('\\') << 8) | static_cast<uint8_t> ('u'))) {
|
|
uint32_t code_point_2 = jsoncharutils::hex_to_u32_nocheck(src_data + 2);
|
|
uint32_t low_bit = code_point_2 - 0xdc00;
|
|
if ((low_bit >> 10) == 0) {
|
|
code_point =
|
|
(((code_point - 0xd800) << 10) | low_bit) + 0x10000;
|
|
*src_ptr += 6;
|
|
}
|
|
}
|
|
}
|
|
|
|
size_t offset = jsoncharutils::codepoint_to_utf8(code_point, *dst_ptr);
|
|
*dst_ptr += offset;
|
|
return offset > 0;
|
|
}
|
|
|
|
|
|
/**
|
|
* Unescape a valid UTF-8 string from src to dst, stopping at a final unescaped quote. There
|
|
* must be an unescaped quote terminating the string. It returns the final output
|
|
* position as pointer. In case of error (e.g., the string has bad escaped codes),
|
|
* then null_nullptrptr is returned. It is assumed that the output buffer is large
|
|
* enough. E.g., if src points at 'joe"', then dst needs to have four free bytes +
|
|
* SIMDJSON_PADDING bytes.
|
|
*/
|
|
simdjson_warn_unused simdjson_inline uint8_t *parse_string(const uint8_t *src, uint8_t *dst, bool allow_replacement) {
|
|
while (1) {
|
|
// Copy the next n bytes, and find the backslash and quote in them.
|
|
auto bs_quote = backslash_and_quote::copy_and_find(src, dst);
|
|
// If the next thing is the end quote, copy and return
|
|
if (bs_quote.has_quote_first()) {
|
|
// we encountered quotes first. Move dst to point to quotes and exit
|
|
return dst + bs_quote.quote_index();
|
|
}
|
|
if (bs_quote.has_backslash()) {
|
|
/* find out where the backspace is */
|
|
auto bs_dist = bs_quote.backslash_index();
|
|
uint8_t escape_char = src[bs_dist + 1];
|
|
/* we encountered backslash first. Handle backslash */
|
|
if (escape_char == 'u') {
|
|
/* move src/dst up to the start; they will be further adjusted
|
|
within the unicode codepoint handling code. */
|
|
src += bs_dist;
|
|
dst += bs_dist;
|
|
if (!handle_unicode_codepoint(&src, &dst, allow_replacement)) {
|
|
return nullptr;
|
|
}
|
|
} else {
|
|
/* simple 1:1 conversion. Will eat bs_dist+2 characters in input and
|
|
* write bs_dist+1 characters to output
|
|
* note this may reach beyond the part of the buffer we've actually
|
|
* seen. I think this is ok */
|
|
uint8_t escape_result = escape_map[escape_char];
|
|
if (escape_result == 0u) {
|
|
return nullptr; /* bogus escape value is an error */
|
|
}
|
|
dst[bs_dist] = escape_result;
|
|
src += bs_dist + 2;
|
|
dst += bs_dist + 1;
|
|
}
|
|
} else {
|
|
/* they are the same. Since they can't co-occur, it means we
|
|
* encountered neither. */
|
|
src += backslash_and_quote::BYTES_PROCESSED;
|
|
dst += backslash_and_quote::BYTES_PROCESSED;
|
|
}
|
|
}
|
|
}
|
|
|
|
/**
 * Same contract as parse_string, except that \u escapes are decoded with
 * handle_unicode_codepoint_wobbly.
 *
 * @param src first byte after the opening quote
 * @param dst where the unescaped bytes are written
 * @return pointer to the final output position, or nullptr when the string
 *         contains a bad escape sequence
 */
simdjson_warn_unused simdjson_inline uint8_t *parse_wobbly_string(const uint8_t *src, uint8_t *dst) {
  // This intentionally mirrors parse_string.
  for (;;) {
    // Copy the next chunk and locate the first backslash / quote in it.
    auto found = backslash_and_quote::copy_and_find(src, dst);

    if (found.has_quote_first()) {
      // The closing quote comes first: point dst at it and stop.
      return dst + found.quote_index();
    }

    if (!found.has_backslash()) {
      // Neither a quote nor a backslash in this chunk; keep going.
      src += backslash_and_quote::BYTES_PROCESSED;
      dst += backslash_and_quote::BYTES_PROCESSED;
      continue;
    }

    // A backslash comes first: find where it is and what follows it.
    auto backslash_at = found.backslash_index();
    const uint8_t escape_char = src[backslash_at + 1];

    if (escape_char == 'u') {
      // Move src/dst up to the backslash; the unicode handler adjusts them further.
      src += backslash_at;
      dst += backslash_at;
      if (!handle_unicode_codepoint_wobbly(&src, &dst)) {
        return nullptr;
      }
      continue;
    }

    // Simple 1:1 escape: consumes backslash_at+2 input bytes and produces
    // backslash_at+1 output bytes. This may reach beyond the part of the
    // buffer examined so far.
    const uint8_t unescaped = escape_map[escape_char];
    if (unescaped == 0u) {
      return nullptr; /* bogus escape value is an error */
    }
    dst[backslash_at] = unescaped;
    src += backslash_at + 2;
    dst += backslash_at + 1;
  }
}
|
|
|
|
} // namespace stringparsing
|
|
} // unnamed namespace
|
|
} // namespace fallback
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE2_STRINGPARSING_H
|
|
/* end file generic/stage2/stringparsing.h for fallback */
|
|
/* including generic/stage2/logger.h for fallback: #include <generic/stage2/logger.h> */
|
|
/* begin file generic/stage2/logger.h for fallback */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE2_LOGGER_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE2_LOGGER_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/base.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
#include <cstring>
|
|
|
|
|
|
// This is for an internal-only stage 2 specific logger.
|
|
// Set LOG_ENABLED = true to log what stage 2 is doing!
|
|
namespace simdjson {
|
|
namespace fallback {
|
|
namespace {
|
|
namespace logger {
|
|
|
|
static constexpr const char * DASHES = "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------";
|
|
|
|
// Logging is compiled in only when SIMDJSON_VERBOSE_LOGGING is set.
#if SIMDJSON_VERBOSE_LOGGING
static constexpr bool LOG_ENABLED = true;
#else
static constexpr bool LOG_ENABLED = false;
#endif

// Column widths (in characters) of the log table.
static constexpr int LOG_EVENT_LEN = 20;        // "Event" column
static constexpr int LOG_BUFFER_LEN = 30;       // "Buffer" column
static constexpr int LOG_SMALL_BUFFER_LEN = 10; // "Next" column
static constexpr int LOG_INDEX_LEN = 5;         // index column

// Current nesting level used to indent log lines. Not threadsafe. Log only.
static int log_depth;
|
|
|
|
// Helper to turn unprintable or newline characters into spaces
|
|
static simdjson_inline char printable_char(char c) {
|
|
if (c >= 0x20) {
|
|
return c;
|
|
} else {
|
|
return ' ';
|
|
}
|
|
}
|
|
|
|
// Print the header and set up log_start
|
|
static simdjson_inline void log_start() {
|
|
if (LOG_ENABLED) {
|
|
log_depth = 0;
|
|
printf("\n");
|
|
printf("| %-*s | %-*s | %-*s | %-*s | Detail |\n", LOG_EVENT_LEN, "Event", LOG_BUFFER_LEN, "Buffer", LOG_SMALL_BUFFER_LEN, "Next", 5, "Next#");
|
|
printf("|%.*s|%.*s|%.*s|%.*s|--------|\n", LOG_EVENT_LEN+2, DASHES, LOG_BUFFER_LEN+2, DASHES, LOG_SMALL_BUFFER_LEN+2, DASHES, 5+2, DASHES);
|
|
}
|
|
}
|
|
|
|
// Print a free-form message on its own line. Does nothing unless LOG_ENABLED is true.
simdjson_unused static simdjson_inline void log_string(const char *message) {
  if (!LOG_ENABLED) { return; }
  printf("%s\n", message);
}
|
|
|
|
// Logs a single line from the stage 2 DOM parser
|
|
template<typename S>
|
|
static simdjson_inline void log_line(S &structurals, const char *title_prefix, const char *title, const char *detail) {
|
|
if (LOG_ENABLED) {
|
|
printf("| %*s%s%-*s ", log_depth*2, "", title_prefix, LOG_EVENT_LEN - log_depth*2 - int(strlen(title_prefix)), title);
|
|
auto current_index = structurals.at_beginning() ? nullptr : structurals.next_structural-1;
|
|
auto next_index = structurals.next_structural;
|
|
auto current = current_index ? &structurals.buf[*current_index] : reinterpret_cast<const uint8_t*>(" ");
|
|
auto next = &structurals.buf[*next_index];
|
|
{
|
|
// Print the next N characters in the buffer.
|
|
printf("| ");
|
|
// Otherwise, print the characters starting from the buffer position.
|
|
// Print spaces for unprintable or newline characters.
|
|
for (int i=0;i<LOG_BUFFER_LEN;i++) {
|
|
printf("%c", printable_char(current[i]));
|
|
}
|
|
printf(" ");
|
|
// Print the next N characters in the buffer.
|
|
printf("| ");
|
|
// Otherwise, print the characters starting from the buffer position.
|
|
// Print spaces for unprintable or newline characters.
|
|
for (int i=0;i<LOG_SMALL_BUFFER_LEN;i++) {
|
|
printf("%c", printable_char(next[i]));
|
|
}
|
|
printf(" ");
|
|
}
|
|
if (current_index) {
|
|
printf("| %*u ", LOG_INDEX_LEN, *current_index);
|
|
} else {
|
|
printf("| %-*s ", LOG_INDEX_LEN, "");
|
|
}
|
|
// printf("| %*u ", LOG_INDEX_LEN, structurals.next_tape_index());
|
|
printf("| %-s ", detail);
|
|
printf("|\n");
|
|
}
|
|
}
|
|
|
|
} // namespace logger
|
|
} // unnamed namespace
|
|
} // namespace fallback
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE2_LOGGER_H
|
|
/* end file generic/stage2/logger.h for fallback */
|
|
/* including generic/stage2/json_iterator.h for fallback: #include <generic/stage2/json_iterator.h> */
|
|
/* begin file generic/stage2/json_iterator.h for fallback */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE2_JSON_ITERATOR_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE2_JSON_ITERATOR_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/logger.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/generic/dom_parser_implementation.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
namespace simdjson {
|
|
namespace fallback {
|
|
namespace {
|
|
namespace stage2 {
|
|
|
|
class json_iterator {
|
|
public:
|
|
const uint8_t* const buf;
|
|
uint32_t *next_structural;
|
|
dom_parser_implementation &dom_parser;
|
|
uint32_t depth{0};
|
|
|
|
/**
|
|
* Walk the JSON document.
|
|
*
|
|
* The visitor receives callbacks when values are encountered. All callbacks pass the iterator as
|
|
* the first parameter; some callbacks have other parameters as well:
|
|
*
|
|
* - visit_document_start() - at the beginning.
|
|
* - visit_document_end() - at the end (if things were successful).
|
|
*
|
|
* - visit_array_start() - at the start `[` of a non-empty array.
|
|
* - visit_array_end() - at the end `]` of a non-empty array.
|
|
* - visit_empty_array() - when an empty array is encountered.
|
|
*
|
|
* - visit_object_end() - at the start `]` of a non-empty object.
|
|
* - visit_object_start() - at the end `]` of a non-empty object.
|
|
* - visit_empty_object() - when an empty object is encountered.
|
|
* - visit_key(const uint8_t *key) - when a key in an object field is encountered. key is
|
|
* guaranteed to point at the first quote of the string (`"key"`).
|
|
* - visit_primitive(const uint8_t *value) - when a value is a string, number, boolean or null.
|
|
* - visit_root_primitive(iter, uint8_t *value) - when the top-level value is a string, number, boolean or null.
|
|
*
|
|
* - increment_count(iter) - each time a value is found in an array or object.
|
|
*/
|
|
template<bool STREAMING, typename V>
|
|
simdjson_warn_unused simdjson_inline error_code walk_document(V &visitor) noexcept;
|
|
|
|
/**
|
|
* Create an iterator capable of walking a JSON document.
|
|
*
|
|
* The document must have already passed through stage 1.
|
|
*/
|
|
simdjson_inline json_iterator(dom_parser_implementation &_dom_parser, size_t start_structural_index);
|
|
|
|
/**
|
|
* Look at the next token.
|
|
*
|
|
* Tokens can be strings, numbers, booleans, null, or operators (`[{]},:`)).
|
|
*
|
|
* They may include invalid JSON as well (such as `1.2.3` or `ture`).
|
|
*/
|
|
simdjson_inline const uint8_t *peek() const noexcept;
|
|
/**
|
|
* Advance to the next token.
|
|
*
|
|
* Tokens can be strings, numbers, booleans, null, or operators (`[{]},:`)).
|
|
*
|
|
* They may include invalid JSON as well (such as `1.2.3` or `ture`).
|
|
*/
|
|
simdjson_inline const uint8_t *advance() noexcept;
|
|
/**
|
|
* Get the remaining length of the document, from the start of the current token.
|
|
*/
|
|
simdjson_inline size_t remaining_len() const noexcept;
|
|
/**
|
|
* Check if we are at the end of the document.
|
|
*
|
|
* If this is true, there are no more tokens.
|
|
*/
|
|
simdjson_inline bool at_eof() const noexcept;
|
|
/**
|
|
* Check if we are at the beginning of the document.
|
|
*/
|
|
simdjson_inline bool at_beginning() const noexcept;
|
|
simdjson_inline uint8_t last_structural() const noexcept;
|
|
|
|
/**
|
|
* Log that a value has been found.
|
|
*
|
|
* Set LOG_ENABLED=true in logger.h to see logging.
|
|
*/
|
|
simdjson_inline void log_value(const char *type) const noexcept;
|
|
/**
|
|
* Log the start of a multipart value.
|
|
*
|
|
* Set LOG_ENABLED=true in logger.h to see logging.
|
|
*/
|
|
simdjson_inline void log_start_value(const char *type) const noexcept;
|
|
/**
|
|
* Log the end of a multipart value.
|
|
*
|
|
* Set LOG_ENABLED=true in logger.h to see logging.
|
|
*/
|
|
simdjson_inline void log_end_value(const char *type) const noexcept;
|
|
/**
|
|
* Log an error.
|
|
*
|
|
* Set LOG_ENABLED=true in logger.h to see logging.
|
|
*/
|
|
simdjson_inline void log_error(const char *error) const noexcept;
|
|
|
|
template<typename V>
|
|
simdjson_warn_unused simdjson_inline error_code visit_root_primitive(V &visitor, const uint8_t *value) noexcept;
|
|
template<typename V>
|
|
simdjson_warn_unused simdjson_inline error_code visit_primitive(V &visitor, const uint8_t *value) noexcept;
|
|
};
|
|
|
|
template<bool STREAMING, typename V>
|
|
simdjson_warn_unused simdjson_inline error_code json_iterator::walk_document(V &visitor) noexcept {
|
|
logger::log_start();
|
|
|
|
//
|
|
// Start the document
|
|
//
|
|
if (at_eof()) { return EMPTY; }
|
|
log_start_value("document");
|
|
SIMDJSON_TRY( visitor.visit_document_start(*this) );
|
|
|
|
//
|
|
// Read first value
|
|
//
|
|
{
|
|
auto value = advance();
|
|
|
|
// Make sure the outer object or array is closed before continuing; otherwise, there are ways we
|
|
// could get into memory corruption. See https://github.com/simdjson/simdjson/issues/906
|
|
if (!STREAMING) {
|
|
switch (*value) {
|
|
case '{': if (last_structural() != '}') { log_value("starting brace unmatched"); return TAPE_ERROR; }; break;
|
|
case '[': if (last_structural() != ']') { log_value("starting bracket unmatched"); return TAPE_ERROR; }; break;
|
|
}
|
|
}
|
|
|
|
switch (*value) {
|
|
case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin;
|
|
case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin;
|
|
default: SIMDJSON_TRY( visitor.visit_root_primitive(*this, value) ); break;
|
|
}
|
|
}
|
|
goto document_end;
|
|
|
|
//
|
|
// Object parser states
|
|
//
|
|
object_begin:
|
|
log_start_value("object");
|
|
depth++;
|
|
if (depth >= dom_parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; }
|
|
dom_parser.is_array[depth] = false;
|
|
SIMDJSON_TRY( visitor.visit_object_start(*this) );
|
|
|
|
{
|
|
auto key = advance();
|
|
if (*key != '"') { log_error("Object does not start with a key"); return TAPE_ERROR; }
|
|
SIMDJSON_TRY( visitor.increment_count(*this) );
|
|
SIMDJSON_TRY( visitor.visit_key(*this, key) );
|
|
}
|
|
|
|
object_field:
|
|
if (simdjson_unlikely( *advance() != ':' )) { log_error("Missing colon after key in object"); return TAPE_ERROR; }
|
|
{
|
|
auto value = advance();
|
|
switch (*value) {
|
|
case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin;
|
|
case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin;
|
|
default: SIMDJSON_TRY( visitor.visit_primitive(*this, value) ); break;
|
|
}
|
|
}
|
|
|
|
object_continue:
|
|
switch (*advance()) {
|
|
case ',':
|
|
SIMDJSON_TRY( visitor.increment_count(*this) );
|
|
{
|
|
auto key = advance();
|
|
if (simdjson_unlikely( *key != '"' )) { log_error("Key string missing at beginning of field in object"); return TAPE_ERROR; }
|
|
SIMDJSON_TRY( visitor.visit_key(*this, key) );
|
|
}
|
|
goto object_field;
|
|
case '}': log_end_value("object"); SIMDJSON_TRY( visitor.visit_object_end(*this) ); goto scope_end;
|
|
default: log_error("No comma between object fields"); return TAPE_ERROR;
|
|
}
|
|
|
|
scope_end:
|
|
depth--;
|
|
if (depth == 0) { goto document_end; }
|
|
if (dom_parser.is_array[depth]) { goto array_continue; }
|
|
goto object_continue;
|
|
|
|
//
|
|
// Array parser states
|
|
//
|
|
array_begin:
|
|
log_start_value("array");
|
|
depth++;
|
|
if (depth >= dom_parser.max_depth()) { log_error("Exceeded max depth!"); return DEPTH_ERROR; }
|
|
dom_parser.is_array[depth] = true;
|
|
SIMDJSON_TRY( visitor.visit_array_start(*this) );
|
|
SIMDJSON_TRY( visitor.increment_count(*this) );
|
|
|
|
array_value:
|
|
{
|
|
auto value = advance();
|
|
switch (*value) {
|
|
case '{': if (*peek() == '}') { advance(); log_value("empty object"); SIMDJSON_TRY( visitor.visit_empty_object(*this) ); break; } goto object_begin;
|
|
case '[': if (*peek() == ']') { advance(); log_value("empty array"); SIMDJSON_TRY( visitor.visit_empty_array(*this) ); break; } goto array_begin;
|
|
default: SIMDJSON_TRY( visitor.visit_primitive(*this, value) ); break;
|
|
}
|
|
}
|
|
|
|
array_continue:
|
|
switch (*advance()) {
|
|
case ',': SIMDJSON_TRY( visitor.increment_count(*this) ); goto array_value;
|
|
case ']': log_end_value("array"); SIMDJSON_TRY( visitor.visit_array_end(*this) ); goto scope_end;
|
|
default: log_error("Missing comma between array values"); return TAPE_ERROR;
|
|
}
|
|
|
|
document_end:
|
|
log_end_value("document");
|
|
SIMDJSON_TRY( visitor.visit_document_end(*this) );
|
|
|
|
dom_parser.next_structural_index = uint32_t(next_structural - &dom_parser.structural_indexes[0]);
|
|
|
|
// If we didn't make it to the end, it's an error
|
|
if ( !STREAMING && dom_parser.next_structural_index != dom_parser.n_structural_indexes ) {
|
|
log_error("More than one JSON value at the root of the document, or extra characters at the end of the JSON!");
|
|
return TAPE_ERROR;
|
|
}
|
|
|
|
return SUCCESS;
|
|
|
|
} // walk_document()
|
|
|
|
// Bind the iterator to a parser and position it at the given structural index.
simdjson_inline json_iterator::json_iterator(dom_parser_implementation &_dom_parser, size_t start_structural_index)
  : buf(_dom_parser.buf),
    next_structural(&_dom_parser.structural_indexes[start_structural_index]),
    dom_parser(_dom_parser)
{}
|
|
|
|
// Pointer to the next token in the buffer; the iterator does not move.
simdjson_inline const uint8_t *json_iterator::peek() const noexcept {
  const uint32_t offset = *next_structural;
  return &buf[offset];
}
|
|
// Pointer to the next token in the buffer; the iterator then moves past it.
simdjson_inline const uint8_t *json_iterator::advance() noexcept {
  const uint32_t offset = *next_structural;
  ++next_structural;
  return &buf[offset];
}
|
|
// Bytes between the most recently consumed token and the end of the document.
simdjson_inline size_t json_iterator::remaining_len() const noexcept {
  return dom_parser.len - next_structural[-1];
}
|
|
|
|
// True once every structural index has been consumed.
simdjson_inline bool json_iterator::at_eof() const noexcept {
  const auto end_of_structurals = &dom_parser.structural_indexes[dom_parser.n_structural_indexes];
  return next_structural == end_of_structurals;
}
|
|
// True while no structural index has been consumed yet.
simdjson_inline bool json_iterator::at_beginning() const noexcept {
  const auto first_structural = dom_parser.structural_indexes.get();
  return next_structural == first_structural;
}
|
|
// The buffer byte located at the final structural index.
simdjson_inline uint8_t json_iterator::last_structural() const noexcept {
  const auto last_offset = dom_parser.structural_indexes[dom_parser.n_structural_indexes - 1];
  return buf[last_offset];
}
|
|
|
|
// Log a single value: no title prefix, no detail text.
simdjson_inline void json_iterator::log_value(const char *type) const noexcept {
  const char *no_prefix = "";
  const char *no_detail = "";
  logger::log_line(*this, no_prefix, type, no_detail);
}
|
|
|
|
// Log the opening of a multipart value with a "+" prefix, then indent
// subsequent log lines by one level.
simdjson_inline void json_iterator::log_start_value(const char *type) const noexcept {
  logger::log_line(*this, "+", type, "");
  if (!logger::LOG_ENABLED) { return; }
  logger::log_depth++;
}
|
|
|
|
// Un-indent by one level, then log the close of a multipart value with a "-" prefix.
simdjson_inline void json_iterator::log_end_value(const char *type) const noexcept {
  if (logger::LOG_ENABLED) {
    logger::log_depth--;
  }
  logger::log_line(*this, "-", type, "");
}
|
|
|
|
// Log an "ERROR" line carrying the given message in the detail column.
simdjson_inline void json_iterator::log_error(const char *error) const noexcept {
  const char *no_prefix = "";
  logger::log_line(*this, no_prefix, "ERROR", error);
}
|
|
|
|
template<typename V>
|
|
simdjson_warn_unused simdjson_inline error_code json_iterator::visit_root_primitive(V &visitor, const uint8_t *value) noexcept {
|
|
switch (*value) {
|
|
case '"': return visitor.visit_root_string(*this, value);
|
|
case 't': return visitor.visit_root_true_atom(*this, value);
|
|
case 'f': return visitor.visit_root_false_atom(*this, value);
|
|
case 'n': return visitor.visit_root_null_atom(*this, value);
|
|
case '-':
|
|
case '0': case '1': case '2': case '3': case '4':
|
|
case '5': case '6': case '7': case '8': case '9':
|
|
return visitor.visit_root_number(*this, value);
|
|
default:
|
|
log_error("Document starts with a non-value character");
|
|
return TAPE_ERROR;
|
|
}
|
|
}
|
|
template<typename V>
|
|
simdjson_warn_unused simdjson_inline error_code json_iterator::visit_primitive(V &visitor, const uint8_t *value) noexcept {
|
|
// Use the fact that most scalars are going to be either strings or numbers.
|
|
if(*value == '"') {
|
|
return visitor.visit_string(*this, value);
|
|
} else if (((*value - '0') < 10) || (*value == '-')) {
|
|
return visitor.visit_number(*this, value);
|
|
}
|
|
// true, false, null are uncommon.
|
|
switch (*value) {
|
|
case 't': return visitor.visit_true_atom(*this, value);
|
|
case 'f': return visitor.visit_false_atom(*this, value);
|
|
case 'n': return visitor.visit_null_atom(*this, value);
|
|
default:
|
|
log_error("Non-value found when value was expected!");
|
|
return TAPE_ERROR;
|
|
}
|
|
}
|
|
|
|
} // namespace stage2
|
|
} // unnamed namespace
|
|
} // namespace fallback
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE2_JSON_ITERATOR_H
|
|
/* end file generic/stage2/json_iterator.h for fallback */
|
|
/* including generic/stage2/tape_writer.h for fallback: #include <generic/stage2/tape_writer.h> */
|
|
/* begin file generic/stage2/tape_writer.h for fallback */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE2_TAPE_WRITER_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE2_TAPE_WRITER_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/internal/tape_type.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
#include <cstring>
|
|
|
|
namespace simdjson {
|
|
namespace fallback {
|
|
namespace {
|
|
namespace stage2 {
|
|
|
|
struct tape_writer {
|
|
/** The next place to write to tape */
|
|
uint64_t *next_tape_loc;
|
|
|
|
/** Write a signed 64-bit value to tape. */
|
|
simdjson_inline void append_s64(int64_t value) noexcept;
|
|
|
|
/** Write an unsigned 64-bit value to tape. */
|
|
simdjson_inline void append_u64(uint64_t value) noexcept;
|
|
|
|
/** Write a double value to tape. */
|
|
simdjson_inline void append_double(double value) noexcept;
|
|
|
|
/**
|
|
* Append a tape entry (an 8-bit type,and 56 bits worth of value).
|
|
*/
|
|
simdjson_inline void append(uint64_t val, internal::tape_type t) noexcept;
|
|
|
|
/**
|
|
* Skip the current tape entry without writing.
|
|
*
|
|
* Used to skip the start of the container, since we'll come back later to fill it in when the
|
|
* container ends.
|
|
*/
|
|
simdjson_inline void skip() noexcept;
|
|
|
|
/**
|
|
* Skip the number of tape entries necessary to write a large u64 or i64.
|
|
*/
|
|
simdjson_inline void skip_large_integer() noexcept;
|
|
|
|
/**
|
|
* Skip the number of tape entries necessary to write a double.
|
|
*/
|
|
simdjson_inline void skip_double() noexcept;
|
|
|
|
/**
|
|
* Write a value to a known location on tape.
|
|
*
|
|
* Used to go back and write out the start of a container after the container ends.
|
|
*/
|
|
simdjson_inline static void write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept;
|
|
|
|
private:
|
|
/**
|
|
* Append both the tape entry, and a supplementary value following it. Used for types that need
|
|
* all 64 bits, such as double and uint64_t.
|
|
*/
|
|
template<typename T>
|
|
simdjson_inline void append2(uint64_t val, T val2, internal::tape_type t) noexcept;
|
|
}; // struct tape_writer
|
|
|
|
/** Writes an INT64 entry with a zero payload, followed by the 64 bits of `value`. */
simdjson_inline void tape_writer::append_s64(int64_t value) noexcept {
  const uint64_t no_payload = 0;
  append2(no_payload, value, internal::tape_type::INT64);
}
|
|
|
|
/** Writes a UINT64 entry with a zero payload, followed by `value` in the next slot. */
simdjson_inline void tape_writer::append_u64(uint64_t value) noexcept {
  // Same two-slot layout as append_s64/append_double, so go through the shared helper.
  append2(0, value, internal::tape_type::UINT64);
}
|
|
|
|
/** Write a double value to tape. */
|
|
simdjson_inline void tape_writer::append_double(double value) noexcept {
|
|
append2(0, value, internal::tape_type::DOUBLE);
|
|
}
|
|
|
|
/** Advances the cursor by one slot without writing. */
simdjson_inline void tape_writer::skip() noexcept {
  ++next_tape_loc;
}
|
|
|
|
/** Advances the cursor by the two slots used for a large integer, without writing. */
simdjson_inline void tape_writer::skip_large_integer() noexcept {
  const int slots_per_large_integer = 2;
  next_tape_loc += slots_per_large_integer;
}
|
|
|
|
/** Advances the cursor by the two slots used for a double, without writing. */
simdjson_inline void tape_writer::skip_double() noexcept {
  const int slots_per_double = 2;
  next_tape_loc += slots_per_double;
}
|
|
|
|
/** Stores `val` combined with the type `t` (shifted into the top byte) and advances one slot. */
simdjson_inline void tape_writer::append(uint64_t val, internal::tape_type t) noexcept {
  const uint64_t type_bits = uint64_t(char(t)) << 56;
  *next_tape_loc = val | type_bits;
  ++next_tape_loc;
}
|
|
|
|
template<typename T>
|
|
simdjson_inline void tape_writer::append2(uint64_t val, T val2, internal::tape_type t) noexcept {
|
|
append(val, t);
|
|
static_assert(sizeof(val2) == sizeof(*next_tape_loc), "Type is not 64 bits!");
|
|
memcpy(next_tape_loc, &val2, sizeof(val2));
|
|
next_tape_loc++;
|
|
}
|
|
|
|
/** Overwrites `tape_loc` with `val` combined with the type `t` shifted into the top byte. */
simdjson_inline void tape_writer::write(uint64_t &tape_loc, uint64_t val, internal::tape_type t) noexcept {
  const uint64_t type_bits = uint64_t(char(t)) << 56;
  tape_loc = val | type_bits;
}
|
|
|
|
} // namespace stage2
|
|
} // unnamed namespace
|
|
} // namespace fallback
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE2_TAPE_WRITER_H
|
|
/* end file generic/stage2/tape_writer.h for fallback */
|
|
/* including generic/stage2/tape_builder.h for fallback: #include <generic/stage2/tape_builder.h> */
|
|
/* begin file generic/stage2/tape_builder.h for fallback */
|
|
#ifndef SIMDJSON_SRC_GENERIC_STAGE2_TAPE_BUILDER_H
|
|
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE2_TAPE_BUILDER_H */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/base.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/json_iterator.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/stringparsing.h> */
|
|
/* amalgamation skipped (editor-only): #include <generic/stage2/tape_writer.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/dom/document.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/generic/atomparsing.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/generic/dom_parser_implementation.h> */
|
|
/* amalgamation skipped (editor-only): #include <simdjson/generic/numberparsing.h> */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
|
|
namespace simdjson {
|
|
namespace fallback {
|
|
namespace {
|
|
namespace stage2 {
|
|
|
|
struct tape_builder {
|
|
template<bool STREAMING>
|
|
simdjson_warn_unused static simdjson_inline error_code parse_document(
|
|
dom_parser_implementation &dom_parser,
|
|
dom::document &doc) noexcept;
|
|
|
|
/** Called when a non-empty document starts. */
|
|
simdjson_warn_unused simdjson_inline error_code visit_document_start(json_iterator &iter) noexcept;
|
|
/** Called when a non-empty document ends without error. */
|
|
simdjson_warn_unused simdjson_inline error_code visit_document_end(json_iterator &iter) noexcept;
|
|
|
|
/** Called when a non-empty array starts. */
|
|
simdjson_warn_unused simdjson_inline error_code visit_array_start(json_iterator &iter) noexcept;
|
|
/** Called when a non-empty array ends. */
|
|
simdjson_warn_unused simdjson_inline error_code visit_array_end(json_iterator &iter) noexcept;
|
|
/** Called when an empty array is found. */
|
|
simdjson_warn_unused simdjson_inline error_code visit_empty_array(json_iterator &iter) noexcept;
|
|
|
|
/** Called when a non-empty object starts. */
|
|
simdjson_warn_unused simdjson_inline error_code visit_object_start(json_iterator &iter) noexcept;
|
|
/**
|
|
* Called when a key in a field is encountered.
|
|
*
|
|
* primitive, visit_object_start, visit_empty_object, visit_array_start, or visit_empty_array
|
|
* will be called after this with the field value.
|
|
*/
|
|
simdjson_warn_unused simdjson_inline error_code visit_key(json_iterator &iter, const uint8_t *key) noexcept;
|
|
/** Called when a non-empty object ends. */
|
|
simdjson_warn_unused simdjson_inline error_code visit_object_end(json_iterator &iter) noexcept;
|
|
/** Called when an empty object is found. */
|
|
simdjson_warn_unused simdjson_inline error_code visit_empty_object(json_iterator &iter) noexcept;
|
|
|
|
/**
|
|
* Called when a string, number, boolean or null is found.
|
|
*/
|
|
simdjson_warn_unused simdjson_inline error_code visit_primitive(json_iterator &iter, const uint8_t *value) noexcept;
|
|
/**
|
|
* Called when a string, number, boolean or null is found at the top level of a document (i.e.
|
|
* when there is no array or object and the entire document is a single string, number, boolean or
|
|
* null.
|
|
*
|
|
* This is separate from primitive() because simdjson's normal primitive parsing routines assume
|
|
* there is at least one more token after the value, which is only true in an array or object.
|
|
*/
|
|
simdjson_warn_unused simdjson_inline error_code visit_root_primitive(json_iterator &iter, const uint8_t *value) noexcept;
|
|
|
|
simdjson_warn_unused simdjson_inline error_code visit_string(json_iterator &iter, const uint8_t *value, bool key = false) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code visit_number(json_iterator &iter, const uint8_t *value) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code visit_true_atom(json_iterator &iter, const uint8_t *value) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code visit_false_atom(json_iterator &iter, const uint8_t *value) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code visit_null_atom(json_iterator &iter, const uint8_t *value) noexcept;
|
|
|
|
simdjson_warn_unused simdjson_inline error_code visit_root_string(json_iterator &iter, const uint8_t *value) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code visit_root_number(json_iterator &iter, const uint8_t *value) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code visit_root_true_atom(json_iterator &iter, const uint8_t *value) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code visit_root_false_atom(json_iterator &iter, const uint8_t *value) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code visit_root_null_atom(json_iterator &iter, const uint8_t *value) noexcept;
|
|
|
|
/** Called each time a new field or element in an array or object is found. */
|
|
simdjson_warn_unused simdjson_inline error_code increment_count(json_iterator &iter) noexcept;
|
|
|
|
/** Next location to write to tape */
|
|
tape_writer tape;
|
|
private:
|
|
/** Next write location in the string buf for stage 2 parsing */
|
|
uint8_t *current_string_buf_loc;
|
|
|
|
simdjson_inline tape_builder(dom::document &doc) noexcept;
|
|
|
|
simdjson_inline uint32_t next_tape_index(json_iterator &iter) const noexcept;
|
|
simdjson_inline void start_container(json_iterator &iter) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code end_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept;
|
|
simdjson_warn_unused simdjson_inline error_code empty_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept;
|
|
simdjson_inline uint8_t *on_start_string(json_iterator &iter) noexcept;
|
|
simdjson_inline void on_end_string(uint8_t *dst) noexcept;
|
|
}; // struct tape_builder
|
|
|
|
template<bool STREAMING>
|
|
simdjson_warn_unused simdjson_inline error_code tape_builder::parse_document(
|
|
dom_parser_implementation &dom_parser,
|
|
dom::document &doc) noexcept {
|
|
dom_parser.doc = &doc;
|
|
json_iterator iter(dom_parser, STREAMING ? dom_parser.next_structural_index : 0);
|
|
tape_builder builder(doc);
|
|
return iter.walk_document<STREAMING>(builder);
|
|
}
|
|
|
|
/** Lets the iterator pick the visit_root_* handler of this builder that matches `value`. */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_primitive(json_iterator &iter, const uint8_t *value) noexcept {
  tape_builder &self = *this;
  return iter.visit_root_primitive(self, value);
}
|
|
/** Lets the iterator pick the visit_* scalar handler of this builder that matches `value`. */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_primitive(json_iterator &iter, const uint8_t *value) noexcept {
  tape_builder &self = *this;
  return iter.visit_primitive(self, value);
}
|
|
/** Writes the START_OBJECT/END_OBJECT pair for an object without fields. */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_empty_object(json_iterator &iter) noexcept {
  const internal::tape_type opening = internal::tape_type::START_OBJECT;
  const internal::tape_type closing = internal::tape_type::END_OBJECT;
  return empty_container(iter, opening, closing);
}
|
|
/** Writes the START_ARRAY/END_ARRAY pair for an array without elements. */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_empty_array(json_iterator &iter) noexcept {
  const internal::tape_type opening = internal::tape_type::START_ARRAY;
  const internal::tape_type closing = internal::tape_type::END_ARRAY;
  return empty_container(iter, opening, closing);
}
|
|
|
|
/** Opens the document as a container (reserving its tape slot); always returns SUCCESS. */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_document_start(json_iterator &iter) noexcept {
  start_container(iter);
  return SUCCESS;
}
|
|
/** Opens an object as a container (reserving its tape slot); always returns SUCCESS. */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_object_start(json_iterator &iter) noexcept {
  start_container(iter);
  return SUCCESS;
}
|
|
/** Opens an array as a container (reserving its tape slot); always returns SUCCESS. */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_array_start(json_iterator &iter) noexcept {
  start_container(iter);
  return SUCCESS;
}
|
|
|
|
/** Closes the current object: writes END_OBJECT and back-fills its START_OBJECT entry. */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_object_end(json_iterator &iter) noexcept {
  const internal::tape_type opening = internal::tape_type::START_OBJECT;
  const internal::tape_type closing = internal::tape_type::END_OBJECT;
  return end_container(iter, opening, closing);
}
|
|
/** Closes the current array: writes END_ARRAY and back-fills its START_ARRAY entry. */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_array_end(json_iterator &iter) noexcept {
  const internal::tape_type opening = internal::tape_type::START_ARRAY;
  const internal::tape_type closing = internal::tape_type::END_ARRAY;
  return end_container(iter, opening, closing);
}
|
|
/**
 * Finishes the tape: appends a closing ROOT entry that points at slot 0, then fills slot 0
 * with a ROOT entry that points just past that closing entry.
 */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_document_end(json_iterator &iter) noexcept {
  constexpr uint32_t root_slot_index = 0;
  tape.append(root_slot_index, internal::tape_type::ROOT);
  // Must be read after the append above so that it already accounts for the closing entry.
  const uint32_t past_the_end = next_tape_index(iter);
  auto &root_slot = iter.dom_parser.doc->tape[root_slot_index];
  tape_writer::write(root_slot, past_the_end, internal::tape_type::ROOT);
  return SUCCESS;
}
|
|
/** Handles an object key by writing it as a string with the `key` flag set. */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_key(json_iterator &iter, const uint8_t *key) noexcept {
  const bool is_key = true;
  return visit_string(iter, key, is_key);
}
|
|
|
|
/** Adds one to the element/field counter of the container open at iter.depth. */
simdjson_warn_unused simdjson_inline error_code tape_builder::increment_count(json_iterator &iter) noexcept {
  auto &container = iter.dom_parser.open_containers[iter.depth];
  container.count++;
  return SUCCESS;
}
|
|
|
|
/** Starts writing at the first slot of doc's tape and the first byte of doc's string buffer. */
simdjson_inline tape_builder::tape_builder(dom::document &doc) noexcept
  : tape{doc.tape.get()},
    current_string_buf_loc{doc.string_buf.get()}
{}
|
|
|
|
/**
 * Writes a STRING tape entry and unescapes the string that starts at `value` (the opening
 * quote) into the string buffer.
 *
 * @param key only affects the label used for logging ("key" or "string").
 * @return SUCCESS, or STRING_ERROR when parse_string reports failure.
 */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_string(json_iterator &iter, const uint8_t *value, bool key) noexcept {
  iter.log_value(key ? "key" : "string");
  uint8_t *const write_begin = on_start_string(iter);
  // Skip the opening quote. Replacement of invalid escape sequences is not allowed (false).
  uint8_t *const write_end = stringparsing::parse_string(value + 1, write_begin, false);
  if (write_end != nullptr) {
    on_end_string(write_end);
    return SUCCESS;
  }
  iter.log_error("Invalid escape in string");
  return STRING_ERROR;
}
|
|
|
|
/** A string at the document root is handled like any other (non-key) string. */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_string(json_iterator &iter, const uint8_t *value) noexcept {
  const bool is_key = false;
  return visit_string(iter, value, is_key);
}
|
|
|
|
/** Logs the value, then has numberparsing::parse_number parse it and write to the tape. */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_number(json_iterator &iter, const uint8_t *value) noexcept {
  iter.log_value("number");
  const error_code parse_result = numberparsing::parse_number(value, tape);
  return parse_result;
}
|
|
|
|
/**
 * Parses a number that is the whole document, working on a space-padded copy of the input.
 *
 * @return MEMALLOC when the copy cannot be allocated, otherwise the result of visit_number.
 */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_number(json_iterator &iter, const uint8_t *value) noexcept {
  //
  // The number is copied so that it is guaranteed to be followed by spaces.
  // This is not about padding the input, which should already be padded up
  // to len + SIMDJSON_PADDING. However, at this stage we have no control over
  // how that padding was done. What if the input was padded with nulls?
  // An extra null character after the input is quite common (C string).
  // We do not want to allow 9\0 (where \0 is the null character) inside a JSON
  // document, but the string "9\0" by itself is fine. So, knowing that there is
  // just one input element, we copy it and pad the copy with spaces.
  // The copy is relatively expensive, but it will almost never happen in
  // practice unless you are in the strange scenario where you have many JSON
  // documents made of single atoms.
  //
  const size_t value_len = iter.remaining_len();
  std::unique_ptr<uint8_t[]> padded(new (std::nothrow) uint8_t[value_len + SIMDJSON_PADDING]);
  if (!padded) { return MEMALLOC; }
  std::memcpy(padded.get(), value, value_len);
  std::memset(padded.get() + value_len, ' ', SIMDJSON_PADDING);
  return visit_number(iter, padded.get());
}
|
|
|
|
/** Validates a `true` atom inside a container and appends a TRUE_VALUE entry on success. */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_true_atom(json_iterator &iter, const uint8_t *value) noexcept {
  iter.log_value("true");
  const bool well_formed = atomparsing::is_valid_true_atom(value);
  if (well_formed) {
    tape.append(0, internal::tape_type::TRUE_VALUE);
    return SUCCESS;
  }
  return T_ATOM_ERROR;
}
|
|
|
|
/**
 * Validates a `true` atom at the document root, passing the remaining input length to the
 * validator, and appends a TRUE_VALUE entry on success.
 */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_true_atom(json_iterator &iter, const uint8_t *value) noexcept {
  iter.log_value("true");
  const bool well_formed = atomparsing::is_valid_true_atom(value, iter.remaining_len());
  if (well_formed) {
    tape.append(0, internal::tape_type::TRUE_VALUE);
    return SUCCESS;
  }
  return T_ATOM_ERROR;
}
|
|
|
|
/** Validates a `false` atom inside a container and appends a FALSE_VALUE entry on success. */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_false_atom(json_iterator &iter, const uint8_t *value) noexcept {
  iter.log_value("false");
  const bool well_formed = atomparsing::is_valid_false_atom(value);
  if (well_formed) {
    tape.append(0, internal::tape_type::FALSE_VALUE);
    return SUCCESS;
  }
  return F_ATOM_ERROR;
}
|
|
|
|
/**
 * Validates a `false` atom at the document root, passing the remaining input length to the
 * validator, and appends a FALSE_VALUE entry on success.
 */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_false_atom(json_iterator &iter, const uint8_t *value) noexcept {
  iter.log_value("false");
  const bool well_formed = atomparsing::is_valid_false_atom(value, iter.remaining_len());
  if (well_formed) {
    tape.append(0, internal::tape_type::FALSE_VALUE);
    return SUCCESS;
  }
  return F_ATOM_ERROR;
}
|
|
|
|
/** Validates a `null` atom inside a container and appends a NULL_VALUE entry on success. */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_null_atom(json_iterator &iter, const uint8_t *value) noexcept {
  iter.log_value("null");
  const bool well_formed = atomparsing::is_valid_null_atom(value);
  if (well_formed) {
    tape.append(0, internal::tape_type::NULL_VALUE);
    return SUCCESS;
  }
  return N_ATOM_ERROR;
}
|
|
|
|
/**
 * Validates a `null` atom at the document root, passing the remaining input length to the
 * validator, and appends a NULL_VALUE entry on success.
 */
simdjson_warn_unused simdjson_inline error_code tape_builder::visit_root_null_atom(json_iterator &iter, const uint8_t *value) noexcept {
  iter.log_value("null");
  const bool well_formed = atomparsing::is_valid_null_atom(value, iter.remaining_len());
  if (well_formed) {
    tape.append(0, internal::tape_type::NULL_VALUE);
    return SUCCESS;
  }
  return N_ATOM_ERROR;
}
|
|
|
|
// private:
|
|
|
|
/** Index (in slots, from the start of the document's tape) of the next slot to be written. */
simdjson_inline uint32_t tape_builder::next_tape_index(json_iterator &iter) const noexcept {
  const auto *const tape_begin = iter.dom_parser.doc->tape.get();
  return uint32_t(tape.next_tape_loc - tape_begin);
}
|
|
|
|
/**
 * Writes the two entries of an empty container: a `start` entry pointing two slots ahead
 * (just past the closing entry) and an `end` entry pointing back at the start slot.
 */
simdjson_warn_unused simdjson_inline error_code tape_builder::empty_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept {
  const uint32_t opening_index = next_tape_index(iter);
  const uint32_t after_closing = opening_index + 2;
  tape.append(after_closing, start);
  tape.append(opening_index, end);
  return SUCCESS;
}
|
|
|
|
/**
 * Records where the container at iter.depth begins on the tape, resets its element count,
 * and leaves its opening slot unwritten.
 */
simdjson_inline void tape_builder::start_container(json_iterator &iter) noexcept {
  auto &container = iter.dom_parser.open_containers[iter.depth];
  container.tape_index = next_tape_index(iter);
  container.count = 0;
  // The opening entry is only written once the container ends; reserve its slot for now.
  tape.skip();
}
|
|
|
|
/**
 * Closes the container open at iter.depth: appends the `end` entry (pointing at the opening
 * slot) and then fills the opening slot with the `start` entry.
 */
simdjson_warn_unused simdjson_inline error_code tape_builder::end_container(json_iterator &iter, internal::tape_type start, internal::tape_type end) noexcept {
  auto &container = iter.dom_parser.open_containers[iter.depth];

  // Closing entry: its payload is the index of the opening slot.
  const uint32_t start_tape_index = container.tape_index;
  tape.append(start_tape_index, end);

  // Opening entry: index just past the closing entry in the low bits, element count from
  // bit 32 upward. The count is saturated at 24 bits; by convention a stored count of
  // 0xffffff means "0xffffff or more".
  uint32_t cntsat = container.count;
  if (cntsat > 0xFFFFFF) { cntsat = 0xFFFFFF; }
  const uint64_t payload = next_tape_index(iter) | (uint64_t(cntsat) << 32);
  tape_writer::write(iter.dom_parser.doc->tape[start_tape_index], payload, start);
  return SUCCESS;
}
|
|
|
|
/**
 * Appends a STRING entry whose payload is the current offset into the document's string
 * buffer, and returns where the string's characters should be written: just after the
 * space reserved for the 32-bit length prefix.
 */
simdjson_inline uint8_t *tape_builder::on_start_string(json_iterator &iter) noexcept {
  const auto string_offset = current_string_buf_loc - iter.dom_parser.doc->string_buf.get();
  tape.append(string_offset, internal::tape_type::STRING);
  uint8_t *const characters = current_string_buf_loc + sizeof(uint32_t);
  return characters;
}
|
|
|
|
/**
 * Completes the string whose characters end at `dst`: stores its 32-bit length in front of
 * the characters, null-terminates it, and moves the write position past the terminator.
 */
simdjson_inline void tape_builder::on_end_string(uint8_t *dst) noexcept {
  uint8_t *const characters = current_string_buf_loc + sizeof(uint32_t);
  // TODO check for overflow in case someone has a crazy string (>=4GB?)
  // But only add the overflow check when the document itself exceeds 4GB
  // Currently unneeded because we refuse to parse docs larger or equal to 4GB.
  const uint32_t str_length = uint32_t(dst - characters);
  memcpy(current_string_buf_loc, &str_length, sizeof(uint32_t));
  // The terminating zero is a convenience for callers that expect C strings; it costs one byte.
  *dst = 0;
  current_string_buf_loc = dst + 1;
}
|
|
|
|
} // namespace stage2
|
|
} // unnamed namespace
|
|
} // namespace fallback
|
|
} // namespace simdjson
|
|
|
|
#endif // SIMDJSON_SRC_GENERIC_STAGE2_TAPE_BUILDER_H
|
|
/* end file generic/stage2/tape_builder.h for fallback */
|
|
|
|
//
|
|
// Stage 1
|
|
//
|
|
|
|
namespace simdjson {
|
|
namespace fallback {
|
|
|
|
/**
 * Allocates a fallback dom_parser_implementation into `dst` and configures its capacity and
 * maximum depth.
 *
 * @return MEMALLOC when allocation fails, the first error reported by set_capacity or
 *         set_max_depth, or SUCCESS.
 */
simdjson_warn_unused error_code implementation::create_dom_parser_implementation(
    size_t capacity,
    size_t max_depth,
    std::unique_ptr<internal::dom_parser_implementation>& dst
) const noexcept {
  dst.reset( new (std::nothrow) fallback::dom_parser_implementation() );
  if (!dst) { return MEMALLOC; }

  const auto capacity_error = dst->set_capacity(capacity);
  if (capacity_error) { return capacity_error; }

  const auto depth_error = dst->set_max_depth(max_depth);
  if (depth_error) { return depth_error; }

  return SUCCESS;
}
|
|
|
|
namespace {
|
|
namespace stage1 {
|
|
|
|
class structural_scanner {
|
|
public:
|
|
|
|
simdjson_inline structural_scanner(dom_parser_implementation &_parser, stage1_mode _partial)
|
|
: buf{_parser.buf},
|
|
next_structural_index{_parser.structural_indexes.get()},
|
|
parser{_parser},
|
|
len{static_cast<uint32_t>(_parser.len)},
|
|
partial{_partial} {
|
|
}
|
|
|
|
simdjson_inline void add_structural() {
|
|
*next_structural_index = idx;
|
|
next_structural_index++;
|
|
}
|
|
|
|
simdjson_inline bool is_continuation(uint8_t c) {
|
|
return (c & 0xc0) == 0x80;
|
|
}
|
|
|
|
simdjson_inline void validate_utf8_character() {
|
|
// Continuation
|
|
if (simdjson_unlikely((buf[idx] & 0x40) == 0)) {
|
|
// extra continuation
|
|
error = UTF8_ERROR;
|
|
idx++;
|
|
return;
|
|
}
|
|
|
|
// 2-byte
|
|
if ((buf[idx] & 0x20) == 0) {
|
|
// missing continuation
|
|
if (simdjson_unlikely(idx+1 > len || !is_continuation(buf[idx+1]))) {
|
|
if (idx+1 > len && is_streaming(partial)) { idx = len; return; }
|
|
error = UTF8_ERROR;
|
|
idx++;
|
|
return;
|
|
}
|
|
// overlong: 1100000_ 10______
|
|
if (buf[idx] <= 0xc1) { error = UTF8_ERROR; }
|
|
idx += 2;
|
|
return;
|
|
}
|
|
|
|
// 3-byte
|
|
if ((buf[idx] & 0x10) == 0) {
|
|
// missing continuation
|
|
if (simdjson_unlikely(idx+2 > len || !is_continuation(buf[idx+1]) || !is_continuation(buf[idx+2]))) {
|
|
if (idx+2 > len && is_streaming(partial)) { idx = len; return; }
|
|
error = UTF8_ERROR;
|
|
idx++;
|
|
return;
|
|
}
|
|
// overlong: 11100000 100_____ ________
|
|
if (buf[idx] == 0xe0 && buf[idx+1] <= 0x9f) { error = UTF8_ERROR; }
|
|
// surrogates: U+D800-U+DFFF 11101101 101_____
|
|
if (buf[idx] == 0xed && buf[idx+1] >= 0xa0) { error = UTF8_ERROR; }
|
|
idx += 3;
|
|
return;
|
|
}
|
|
|
|
// 4-byte
|
|
// missing continuation
|
|
if (simdjson_unlikely(idx+3 > len || !is_continuation(buf[idx+1]) || !is_continuation(buf[idx+2]) || !is_continuation(buf[idx+3]))) {
|
|
if (idx+2 > len && is_streaming(partial)) { idx = len; return; }
|
|
error = UTF8_ERROR;
|
|
idx++;
|
|
return;
|
|
}
|
|
// overlong: 11110000 1000____ ________ ________
|
|
if (buf[idx] == 0xf0 && buf[idx+1] <= 0x8f) { error = UTF8_ERROR; }
|
|
// too large: > U+10FFFF:
|
|
// 11110100 (1001|101_)____
|
|
// 1111(1___|011_|0101) 10______
|
|
// also includes 5, 6, 7 and 8 byte characters:
|
|
// 11111___
|
|
if (buf[idx] == 0xf4 && buf[idx+1] >= 0x90) { error = UTF8_ERROR; }
|
|
if (buf[idx] >= 0xf5) { error = UTF8_ERROR; }
|
|
idx += 4;
|
|
}
|
|
|
|
// Returns true if the string is unclosed.
|
|
simdjson_inline bool validate_string() {
|
|
idx++; // skip first quote
|
|
while (idx < len && buf[idx] != '"') {
|
|
if (buf[idx] == '\\') {
|
|
idx += 2;
|
|
} else if (simdjson_unlikely(buf[idx] & 0x80)) {
|
|
validate_utf8_character();
|
|
} else {
|
|
if (buf[idx] < 0x20) { error = UNESCAPED_CHARS; }
|
|
idx++;
|
|
}
|
|
}
|
|
if (idx >= len) { return true; }
|
|
return false;
|
|
}
|
|
|
|
simdjson_inline bool is_whitespace_or_operator(uint8_t c) {
|
|
switch (c) {
|
|
case '{': case '}': case '[': case ']': case ',': case ':':
|
|
case ' ': case '\r': case '\n': case '\t':
|
|
return true;
|
|
default:
|
|
return false;
|
|
}
|
|
}
|
|
|
|
//
|
|
// Parse the entire input in STEP_SIZE-byte chunks.
|
|
//
|
|
simdjson_inline error_code scan() {
|
|
bool unclosed_string = false;
|
|
for (;idx<len;idx++) {
|
|
switch (buf[idx]) {
|
|
// String
|
|
case '"':
|
|
add_structural();
|
|
unclosed_string |= validate_string();
|
|
break;
|
|
// Operator
|
|
case '{': case '}': case '[': case ']': case ',': case ':':
|
|
add_structural();
|
|
break;
|
|
// Whitespace
|
|
case ' ': case '\r': case '\n': case '\t':
|
|
break;
|
|
// Primitive or invalid character (invalid characters will be checked in stage 2)
|
|
default:
|
|
// Anything else, add the structural and go until we find the next one
|
|
add_structural();
|
|
while (idx+1<len && !is_whitespace_or_operator(buf[idx+1])) {
|
|
idx++;
|
|
};
|
|
break;
|
|
}
|
|
}
|
|
// We pad beyond.
|
|
// https://github.com/simdjson/simdjson/issues/906
|
|
// See json_structural_indexer.h for an explanation.
|
|
*next_structural_index = len; // assumed later in partial == stage1_mode::streaming_final
|
|
next_structural_index[1] = len;
|
|
next_structural_index[2] = 0;
|
|
parser.n_structural_indexes = uint32_t(next_structural_index - parser.structural_indexes.get());
|
|
if (simdjson_unlikely(parser.n_structural_indexes == 0)) { return EMPTY; }
|
|
parser.next_structural_index = 0;
|
|
if (partial == stage1_mode::streaming_partial) {
|
|
if(unclosed_string) {
|
|
parser.n_structural_indexes--;
|
|
if (simdjson_unlikely(parser.n_structural_indexes == 0)) { return CAPACITY; }
|
|
}
|
|
// We truncate the input to the end of the last complete document (or zero).
|
|
auto new_structural_indexes = find_next_document_index(parser);
|
|
if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) {
|
|
if(parser.structural_indexes[0] == 0) {
|
|
// If the buffer is partial and we started at index 0 but the document is
|
|
// incomplete, it's too big to parse.
|
|
return CAPACITY;
|
|
} else {
|
|
// It is possible that the document could be parsed, we just had a lot
|
|
// of white space.
|
|
parser.n_structural_indexes = 0;
|
|
return EMPTY;
|
|
}
|
|
}
|
|
parser.n_structural_indexes = new_structural_indexes;
|
|
} else if(partial == stage1_mode::streaming_final) {
|
|
if(unclosed_string) { parser.n_structural_indexes--; }
|
|
// We truncate the input to the end of the last complete document (or zero).
|
|
// Because partial == stage1_mode::streaming_final, it means that we may
|
|
// silently ignore trailing garbage. Though it sounds bad, we do it
|
|
// deliberately because many people who have streams of JSON documents
|
|
// will truncate them for processing. E.g., imagine that you are uncompressing
|
|
// the data from a size file or receiving it in chunks from the network. You
|
|
// may not know where exactly the last document will be. Meanwhile the
|
|
// document_stream instances allow people to know the JSON documents they are
|
|
// parsing (see the iterator.source() method).
|
|
parser.n_structural_indexes = find_next_document_index(parser);
|
|
// We store the initial n_structural_indexes so that the client can see
|
|
// whether we used truncation. If initial_n_structural_indexes == parser.n_structural_indexes,
|
|
// then this will query parser.structural_indexes[parser.n_structural_indexes] which is len,
|
|
// otherwise, it will copy some prior index.
|
|
parser.structural_indexes[parser.n_structural_indexes + 1] = parser.structural_indexes[parser.n_structural_indexes];
|
|
// This next line is critical, do not change it unless you understand what you are
|
|
// doing.
|
|
parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len);
|
|
if (parser.n_structural_indexes == 0) { return EMPTY; }
|
|
} else if(unclosed_string) { error = UNCLOSED_STRING; }
|
|
return error;
|
|
}
|
|
|
|
private:
|
|
const uint8_t *buf;
|
|
uint32_t *next_structural_index;
|
|
dom_parser_implementation &parser;
|
|
uint32_t len;
|
|
uint32_t idx{0};
|
|
error_code error{SUCCESS};
|
|
stage1_mode partial;
|
|
}; // structural_scanner
|
|
|
|
} // namespace stage1
|
|
} // unnamed namespace
|
|
|
|
/**
 * Runs stage 1 on the given input: remembers the buffer and its length on this parser, then
 * scans it with a structural_scanner in the requested mode and returns the scan result.
 */
simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, stage1_mode partial) noexcept {
  buf = _buf;
  len = _len;
  return stage1::structural_scanner(*this, partial).scan();
}
|
|
|
|
// Lookup table for the minifier: three flags per input byte value, so the
// flags for byte c live at jump_table[3 * c + 0 .. 3 * c + 2].
//
//   [0] 1 only for '"' (0x22): the byte toggles the "inside a string" state.
//   [1] 0 only for '\\' (0x5c): the byte escapes the byte that follows it.
//   [2] 0 only for '\t', '\n', '\r' and ' ': the byte is dropped when it
//       appears outside of a string; every other byte is always kept.
//
// Each row below holds the triples for eight consecutive byte values.
static uint8_t jump_table[256 * 3] = {
  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  // 0x00-0x07
  0, 1, 1,  0, 1, 0,  0, 1, 0,  0, 1, 1,  0, 1, 1,  0, 1, 0,  0, 1, 1,  0, 1, 1,  // 0x08-0x0f: '\t' '\n' '\r'
  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  // 0x10-0x17
  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  // 0x18-0x1f
  0, 1, 0,  0, 1, 1,  1, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  // 0x20-0x27: ' ' and '"'
  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  // 0x28-0x2f
  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  // 0x30-0x37
  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  // 0x38-0x3f
  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  // 0x40-0x47
  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  // 0x48-0x4f
  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  // 0x50-0x57
  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 0, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  // 0x58-0x5f: '\\'
  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  // 0x60-0x67
  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  // 0x68-0x6f
  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  // 0x70-0x77
  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  // 0x78-0x7f
  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  // 0x80-0x87
  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  // 0x88-0x8f
  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  // 0x90-0x97
  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  // 0x98-0x9f
  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  // 0xa0-0xa7
  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  // 0xa8-0xaf
  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  // 0xb0-0xb7
  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  // 0xb8-0xbf
  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  // 0xc0-0xc7
  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  // 0xc8-0xcf
  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  // 0xd0-0xd7
  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  // 0xd8-0xdf
  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  // 0xe0-0xe7
  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  // 0xe8-0xef
  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  // 0xf0-0xf7
  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  0, 1, 1,  // 0xf8-0xff
};
|
|
|
|
/**
 * Copies buf[0..len) into dst, dropping the bytes that jump_table marks as
 * removable whenever they occur outside of a quoted string.
 *
 * The loop is branch-free: every input byte is stored at the current output
 * position, and the position only advances when the byte has to be kept. As a
 * consequence any index of dst below len may be written.
 *
 * @param buf     input bytes
 * @param len     number of input bytes
 * @param dst     output buffer
 * @param dst_len receives the number of bytes kept in dst
 * @return UNCLOSED_STRING if the input ends while still inside a string,
 *         SUCCESS otherwise
 */
simdjson_warn_unused error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept {
  // The write cursor is kept in a local instead of dst_len itself: going
  // through the reference inside the loop was avoided on purpose, for fear of
  // aliasing.
  size_t out = 0;
  uint8_t in_string = 0;  // 1 while between an opening and a closing quote
  uint8_t unescaped = 1;  // low bit is 0 when the previous byte was an active backslash

  for (size_t in = 0; in < len; in++) {
    const unsigned char ch = buf[in];
    const uint8_t *flags = jump_table + 3 * ch;

    // A quote flips the string state unless it is escaped.
    in_string = in_string ^ (flags[0] & unescaped);
    // Store unconditionally, then keep the byte only if it is not removable
    // or if we are inside a string.
    dst[out] = ch;
    out += flags[2] | in_string;

    // Low bit becomes 0 only when this byte is a backslash that was not
    // itself escaped; otherwise it becomes 1.
    unescaped = uint8_t(~unescaped) | (flags[1]);
  }
  dst_len = out;
  return in_string ? UNCLOSED_STRING : SUCCESS;
}
|
|
|
|
// credit: based on code from Google Fuchsia (Apache Licensed)
|
|
/**
 * Scalar UTF-8 validator.
 *
 * Walks buf[0..len) one encoded character at a time. For every multi-byte
 * sequence it checks that the sequence fits in the input, that each trailing
 * byte has the 10xxxxxx form, and that the decoded code point lies in the
 * range allowed for that sequence length (which rejects overlong encodings,
 * UTF-16 surrogates and values above 0x10ffff).
 *
 * @param buf input bytes
 * @param len number of input bytes
 * @return true if the whole input is valid UTF-8, false otherwise
 */
simdjson_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
  const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
  uint64_t pos = 0;
  while (pos < len) {
    // Fast path: when at least 16 bytes remain, load them as two 64-bit words
    // and skip all 16 at once if none of them has its high bit set (pure ASCII).
    if (pos + 16 <= len) {
      uint64_t lo;
      uint64_t hi;
      memcpy(&lo, data + pos, sizeof(uint64_t));
      memcpy(&hi, data + pos + sizeof(uint64_t), sizeof(uint64_t));
      if (((lo | hi) & 0x8080808080808080) == 0) {
        pos += 16;
        continue;
      }
    }

    const unsigned char lead = data[pos];
    if (lead < 0x80) {
      // Single ASCII byte.
      pos++;
      continue;
    }

    uint64_t next_pos;
    uint32_t code_point;
    if ((lead & 0xe0) == 0xc0) {
      // 110xxxxx: two-byte sequence.
      next_pos = pos + 2;
      if (next_pos > len) { return false; }
      if ((data[pos + 1] & 0xc0) != 0x80) { return false; }
      code_point = (lead & 0x1f) << 6 | (data[pos + 1] & 0x3f);
      if (code_point < 0x80 || 0x7ff < code_point) { return false; }
    } else if ((lead & 0xf0) == 0xe0) {
      // 1110xxxx: three-byte sequence.
      next_pos = pos + 3;
      if (next_pos > len) { return false; }
      if ((data[pos + 1] & 0xc0) != 0x80) { return false; }
      if ((data[pos + 2] & 0xc0) != 0x80) { return false; }
      code_point = (lead & 0x0f) << 12 |
                   (data[pos + 1] & 0x3f) << 6 |
                   (data[pos + 2] & 0x3f);
      // Besides the range check, 0xd800..0xdfff (surrogates) is not allowed.
      if (code_point < 0x800 || 0xffff < code_point ||
          (0xd7ff < code_point && code_point < 0xe000)) {
        return false;
      }
    } else if ((lead & 0xf8) == 0xf0) {
      // 11110xxx: four-byte sequence.
      next_pos = pos + 4;
      if (next_pos > len) { return false; }
      if ((data[pos + 1] & 0xc0) != 0x80) { return false; }
      if ((data[pos + 2] & 0xc0) != 0x80) { return false; }
      if ((data[pos + 3] & 0xc0) != 0x80) { return false; }
      code_point =
          (lead & 0x07) << 18 | (data[pos + 1] & 0x3f) << 12 |
          (data[pos + 2] & 0x3f) << 6 | (data[pos + 3] & 0x3f);
      if (code_point <= 0xffff || 0x10ffff < code_point) { return false; }
    } else {
      // Anything else cannot start a character (for instance a stray
      // continuation byte).
      return false;
    }
    pos = next_pos;
  }
  return true;
}
|
|
|
|
} // namespace fallback
|
|
} // namespace simdjson
|
|
|
|
//
|
|
// Stage 2
|
|
//
|
|
|
|
namespace simdjson {
|
|
namespace fallback {
|
|
|
|
/**
 * Stage 2 entry point of the fallback implementation.
 *
 * Delegates to stage2::tape_builder::parse_document with the template argument
 * set to false (presumably the "streaming" switch; confirm against
 * tape_builder), passing this parser and the target document.
 *
 * @param _doc document handed to the tape builder
 * @return the error_code reported by the tape builder
 */
simdjson_warn_unused error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept {
  return stage2::tape_builder::parse_document<false>(*this, _doc);
}
|
|
|
|
/**
 * Variant of stage2() that calls stage2::tape_builder::parse_document with the
 * template argument set to true (presumably selecting the streaming /
 * next-document behaviour; confirm against tape_builder).
 *
 * @param _doc document handed to the tape builder
 * @return the error_code reported by the tape builder
 */
simdjson_warn_unused error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept {
  return stage2::tape_builder::parse_document<true>(*this, _doc);
}
|
|
|
|
/**
 * Thin forwarder to the fallback kernel's string parser.
 *
 * @param src              source bytes of the string, passed through unchanged
 * @param dst              destination buffer, passed through unchanged
 * @param replacement_char flag passed through unchanged to
 *                         fallback::stringparsing::parse_string
 * @return the pointer returned by fallback::stringparsing::parse_string
 */
simdjson_warn_unused uint8_t *dom_parser_implementation::parse_string(const uint8_t *src, uint8_t *dst, bool replacement_char) const noexcept {
  return fallback::stringparsing::parse_string(src, dst, replacement_char);
}
|
|
|
|
/**
 * Thin forwarder to the fallback kernel's "wobbly" string parser.
 *
 * @param src source bytes of the string, passed through unchanged
 * @param dst destination buffer, passed through unchanged
 * @return the pointer returned by fallback::stringparsing::parse_wobbly_string
 */
simdjson_warn_unused uint8_t *dom_parser_implementation::parse_wobbly_string(const uint8_t *src, uint8_t *dst) const noexcept {
  return fallback::stringparsing::parse_wobbly_string(src, dst);
}
|
|
|
|
/**
 * Parses one document by running both stages back to back.
 *
 * Stage 1 is run in stage1_mode::regular over the given buffer. If it reports
 * an error, that error is returned immediately and stage 2 is not run.
 *
 * @param _buf start of the JSON input
 * @param _len number of input bytes
 * @param _doc document passed to stage 2
 * @return the stage 1 error if there is one, otherwise the result of stage 2
 */
simdjson_warn_unused error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept {
  const auto stage1_error = stage1(_buf, _len, stage1_mode::regular);
  if (stage1_error) { return stage1_error; }
  return stage2(_doc);
}
|
|
|
|
} // namespace fallback
|
|
} // namespace simdjson
|
|
|
|
/* including simdjson/fallback/end.h: #include <simdjson/fallback/end.h> */
|
|
/* begin file simdjson/fallback/end.h */
|
|
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
|
|
/* amalgamation skipped (editor-only): #include "simdjson/fallback/base.h" */
|
|
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
|
|
|
|
/* undefining SIMDJSON_IMPLEMENTATION from "fallback" */
|
|
#undef SIMDJSON_IMPLEMENTATION
|
|
/* end file simdjson/fallback/end.h */
|
|
|
|
#endif // SIMDJSON_SRC_FALLBACK_CPP
|
|
/* end file fallback.cpp */
|
|
#endif
|
|
/* undefining SIMDJSON_CONDITIONAL_INCLUDE */
|
|
#undef SIMDJSON_CONDITIONAL_INCLUDE
|
|
|
|
SIMDJSON_POP_DISABLE_UNUSED_WARNINGS
|
|
|
|
/* end file simdjson.cpp */
|