axmol/thirdparty/xsxml/no-recursive/xsxml.hpp

1678 lines
50 KiB
C++
Raw Normal View History

2020-11-16 14:47:43 +08:00
//////////////////////////////////////////////////////////////////////////////////////////
// The embedded xml SAX parser, extract from pugixml DOM parser
// please see: https://github.com/zeux/pugixml
//////////////////////////////////////////////////////////////////////////////////////////
/*
The MIT License (MIT)
Copyright (c) 2019 halx99
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
#ifndef SIMDSOFT__XSXML_HPP
#define SIMDSOFT__XSXML_HPP
#pragma once
#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <functional>
#include <string>
#define XSXML__DECL inline
namespace xsxml
{
// Character type of the buffers handled by the parser (plain char).
typedef char char_t;
// Parsing status, returned as part of xml_parse_result object
// NOTE(review): several values below describe file loading / DOM operations (load_file(),
// xml_node::append_buffer) that are not present in the visible part of this SAX parser; they
// appear to be kept from the pugixml original -- confirm before relying on them.
enum xml_parse_status
{
status_ok = 0, // No error
status_file_not_found, // File was not found during load_file()
status_io_error, // Error reading from file/stream
status_out_of_memory, // Could not allocate memory
status_internal_error, // Internal error occurred
status_unrecognized_tag, // Parser could not determine tag type
status_bad_pi, // Parsing error occurred while parsing document declaration/processing instruction
status_bad_comment, // Parsing error occurred while parsing comment
status_bad_cdata, // Parsing error occurred while parsing CDATA section
status_bad_doctype, // Parsing error occurred while parsing document type declaration
status_bad_pcdata, // Parsing error occurred while parsing PCDATA section
status_bad_start_element, // Parsing error occurred while parsing start element tag
status_bad_attribute, // Parsing error occurred while parsing element attribute
status_bad_end_element, // Parsing error occurred while parsing end element tag
status_end_element_mismatch, // There was a mismatch of start-end tags (closing tag had incorrect
// name, some tag was not closed or there was an excessive closing
// tag)
status_append_invalid_root, // Unable to append nodes since root type is not node_element or
// node_document (exclusive to xml_node::append_buffer)
status_no_document_element // Parsing resulted in a document without element nodes
};
// Parsing options
// Each constant is a single bit (or a combination of bits) of the option mask handed to the
// parser. The converter selectors (get_strconv_pcdata / get_strconv_attribute) depend on the
// exact bit positions of parse_escapes, parse_eol, parse_wconv_attribute, parse_wnorm_attribute
// and parse_trim_pcdata, so those values must not be renumbered.
// NOTE(review): the descriptions below mention a "DOM tree" because they come from pugixml; in
// this SAX parser the corresponding nodes are reported through callbacks or ignored.
// Minimal parsing mode (equivalent to turning all other flags off).
// Only elements and PCDATA sections are added to the DOM tree, no text conversions are performed.
const unsigned int parse_minimal = 0x0000;
// This flag determines if processing instructions (node_pi) are added to the DOM tree. This flag is
// off by default.
const unsigned int parse_pi = 0x0001;
// This flag determines if comments (node_comment) are added to the DOM tree. This flag is off by
// default.
const unsigned int parse_comments = 0x0002;
// This flag determines if CDATA sections (node_cdata) are added to the DOM tree. This flag is on by
// default.
const unsigned int parse_cdata = 0x0004;
// This flag determines if plain character data (node_pcdata) that consist only of whitespace are
// added to the DOM tree. This flag is off by default; turning it on usually results in slower
// parsing and more memory consumption.
const unsigned int parse_ws_pcdata = 0x0008;
// This flag determines if character and entity references are expanded during parsing. This flag is
// on by default.
const unsigned int parse_escapes = 0x0010;
// This flag determines if EOL characters are normalized (converted to #xA) during parsing. This
// flag is on by default.
const unsigned int parse_eol = 0x0020;
// This flag determines if attribute values are normalized using CDATA normalization rules during
// parsing. This flag is on by default.
const unsigned int parse_wconv_attribute = 0x0040;
// This flag determines if attribute values are normalized using NMTOKENS normalization rules during
// parsing. This flag is off by default.
const unsigned int parse_wnorm_attribute = 0x0080;
// This flag determines if document declaration (node_declaration) is added to the DOM tree. This
// flag is off by default.
const unsigned int parse_declaration = 0x0100;
// This flag determines if document type declaration (node_doctype) is added to the DOM tree. This
// flag is off by default.
const unsigned int parse_doctype = 0x0200;
// This flag determines if plain character data (node_pcdata) that is the only child of the parent
// node and that consists only of whitespace is added to the DOM tree. This flag is off by default;
// turning it on may result in slower parsing and more memory consumption.
const unsigned int parse_ws_pcdata_single = 0x0400;
// This flag determines if leading and trailing whitespace is to be removed from plain character
// data. This flag is off by default.
const unsigned int parse_trim_pcdata = 0x0800;
// This flag determines if plain character data that does not have a parent node is added to the DOM
// tree, and if an empty document is a valid document. This flag is off by default.
const unsigned int parse_fragment = 0x1000;
// This flag determines if plain character data is to be stored in the parent element's value. This
// significantly changes the structure of the document; this flag is only recommended for parsing
// documents with many PCDATA nodes in memory-constrained environments. This flag is off by default.
const unsigned int parse_embed_pcdata = 0x2000;
// The default parsing mode.
// Elements, PCDATA and CDATA sections are added to the DOM tree, character/reference entities are
// expanded, End-of-Line characters are normalized, attribute values are normalized using CDATA
// normalization rules.
const unsigned int parse_default = parse_cdata | parse_escapes | parse_wconv_attribute | parse_eol;
// The full parsing mode.
// Nodes of all types are added to the DOM tree, character/reference entities are expanded,
// End-of-Line characters are normalized, attribute values are normalized using CDATA normalization
// rules.
const unsigned int parse_full =
parse_default | parse_pi | parse_comments | parse_declaration | parse_doctype;
// Maximum element nesting depth tracked by the parser.
// Do not make this too large: parse_tree keeps a fixed-size array with this many element-name
// entries as a local variable, so a big value can overflow the stack.
const unsigned int parse_max_deep = 512;
// Attribute value converter: receives the first character of the value and the quote character
// that ends it; the implementations in this file return the position just past the closing quote,
// or null when the terminator is reached first.
typedef char_t* (*strconv_attribute_t)(char_t*, char_t);
// PCDATA converter: receives the first character of the text; the implementation in this file
// returns the position just past the terminating '<', or the position of the null terminator.
typedef char_t* (*strconv_pcdata_t)(char_t*);
// Character class flags stored in chartype_table. Each value is a distinct bit so that several
// classes can be combined in one table entry and tested with a single bitwise AND.
enum chartype_t
{
ct_parse_pcdata = 1, // \0, &, \r, <
ct_parse_attr = 2, // \0, &, \r, ', "
ct_parse_attr_ws = 4, // \0, &, \r, ', ", \n, tab
ct_space = 8, // \r, \n, space, tab
ct_parse_cdata = 16, // \0, ], >, \r
ct_parse_comment = 32, // \0, -, >, \r
ct_symbol = 64, // Any symbol > 127, a-z, A-Z, 0-9, _, :, -, .
ct_start_symbol = 128 // Any symbol > 127, a-z, A-Z, _, :
};
// Classification table indexed by the character's value as unsigned char (see
// XSXML__IS_CHARTYPE); every entry is the bitwise OR of the chartype_t flags for that character.
static const unsigned char chartype_table[256] = {
55, 0, 0, 0, 0, 0, 0, 0, 0, 12, 12, 0, 0, 63, 0, 0, // 0-15
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 16-31
8, 0, 6, 0, 0, 0, 7, 6, 0, 0, 0, 0, 0, 96, 64, 0, // 32-47
64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 192, 0, 1, 0, 48, 0, // 48-63
0, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, // 64-79
192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 0, 0, 16, 0, 192, // 80-95
0, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, // 96-111
192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 0, 0, 0, 0, 0, // 112-127
192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, // 128+
192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192,
192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192,
192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192,
192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192,
192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192,
192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192};
// Extended character class flags stored in chartypex_table (tested via XSXML__IS_CHARTYPEX).
// NOTE(review): no use of XSXML__IS_CHARTYPEX is visible in this part of the file; the table may
// only be needed by code outside this view.
enum chartypex_t
{
ctx_special_pcdata = 1, // Any symbol >= 0 and < 32 (except \t, \r, \n), &, <, >
ctx_special_attr = 2, // Any symbol >= 0 and < 32 (except \t), &, <, >, "
ctx_start_symbol = 4, // Any symbol > 127, a-z, A-Z, _
ctx_digit = 8, // 0-9
ctx_symbol = 16 // Any symbol > 127, a-z, A-Z, 0-9, _, -, .
};
// Classification table indexed by the character's value as unsigned char; every entry is the
// bitwise OR of the chartypex_t flags for that character.
static const unsigned char chartypex_table[256] = {
3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 2, 3, 3, 2, 3, 3, // 0-15
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 16-31
0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 16, 16, 0, // 32-47
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 3, 0, 3, 0, // 48-63
0, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, // 64-79
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 0, 0, 0, 0, 20, // 80-95
0, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, // 96-111
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 0, 0, 0, 0, 0, // 112-127
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, // 128+
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
// Branch weight controls
// On GCC-compatible compilers the condition is marked as expected to be false; elsewhere the
// macro expands to the plain condition.
#if defined(__GNUC__)
# define XSXML__UNLIKELY(cond) __builtin_expect(cond, 0)
#else
# define XSXML__UNLIKELY(cond) (cond)
#endif
// Character class tests: non-zero when the table entry for character c has any of the bits in ct.
// c is converted to unsigned char first so that bytes above 127 index the table correctly.
#define XSXML__IS_CHARTYPE_IMPL(c, ct, table) (table[static_cast<unsigned char>(c)] & (ct))
#define XSXML__IS_CHARTYPE(c, ct) XSXML__IS_CHARTYPE_IMPL(c, ct, chartype_table)
#define XSXML__IS_CHARTYPEX(c, ct) XSXML__IS_CHARTYPE_IMPL(c, ct, chartypex_table)
// Parser utilities
// Most of the macros below are not hygienic: they read and modify variables of the enclosing
// function by name (s = current position, ch = saved character, endch = character that was
// replaced by the buffer's null terminator, optmsk = option mask).
// True when c equals e, or when c is the null terminator and the replaced end character was e.
#define XSXML__ENDSWITH(c, e) ((c) == (e) || ((c) == 0 && endch == (e)))
// Advance s past any whitespace characters.
#define XSXML__SKIPWS() \
{ \
while (XSXML__IS_CHARTYPE(*s, ct_space)) \
++s; \
}
// Non-zero when option bit(s) OPT are set in optmsk.
#define XSXML__OPTSET(OPT) (optmsk & (OPT))
// NOTE(review): PUSHNODE/POPNODE refer to cursor, alloc and append_new_node, none of which are
// defined in the visible part of this file; they only appear in commented-out DOM code.
#define XSXML__PUSHNODE(TYPE) \
{ \
cursor = append_new_node(cursor, alloc, TYPE); \
if (!cursor) \
XSXML__THROW_ERROR(status_out_of_memory, s); \
}
#define XSXML__POPNODE() \
{ \
cursor = cursor->parent; \
}
// Advance s until condition X holds or the null terminator is reached.
#define XSXML__SCANFOR(X) \
{ \
while (*s != 0 && !(X)) \
++s; \
}
// Advance s while condition X holds (X must become false at the null terminator).
#define XSXML__SCANWHILE(X) \
{ \
while (X) \
++s; \
}
// Same as XSXML__SCANWHILE, unrolled four characters per iteration. X must be written in terms
// of the local variable ss, which holds the character currently being examined.
#define XSXML__SCANWHILE_UNROLL(X) \
{ \
for (;;) \
{ \
char_t ss = s[0]; \
if (XSXML__UNLIKELY(!(X))) \
{ \
break; \
} \
ss = s[1]; \
if (XSXML__UNLIKELY(!(X))) \
{ \
s += 1; \
break; \
} \
ss = s[2]; \
if (XSXML__UNLIKELY(!(X))) \
{ \
s += 2; \
break; \
} \
ss = s[3]; \
if (XSXML__UNLIKELY(!(X))) \
{ \
s += 3; \
break; \
} \
s += 4; \
} \
}
// End the current segment: save the character at s in ch, overwrite it with a null terminator
// and step past it.
#define XSXML__ENDSEG() \
{ \
ch = *s; \
*s = 0; \
++s; \
}
// Record error position m and status err in the enclosing parser object and return a null
// pointer from the enclosing function.
#define XSXML__THROW_ERROR(err, m) \
return error_offset = m, error_status = err, static_cast<char_t*>(0)
// Report error err at position m if s has reached the null terminator.
#define XSXML__CHECK_ERROR(err, m) \
{ \
if (*s == 0) \
XSXML__THROW_ERROR(err, m); \
}
// Simple static assertion
// A false condition yields an array of negative size, which fails to compile.
#define XSXML__STATIC_ASSERT(cond) \
{ \
static const char condition_failed[(cond) ? 1 : -1] = {0}; \
(void)condition_failed[0]; \
}
// Parsing result
// Pairs a status code with the input offset it refers to. Converts to true only for status_ok.
struct xml_parse_result
{
// Parsing status (see xml_parse_status)
xml_parse_status status;
// Last parsed offset (in char_t units from start of input data)
ptrdiff_t offset;
// Source document encoding
// xml_encoding encoding;
// Default constructor, initializes object to failed state
xml_parse_result() : status(status_internal_error), offset(0) {}
// Cast to bool operator
// (implicit conversion: the object can be used directly in conditions)
operator bool() const { return status == status_ok; }
// Get error description
// (declared only; the definition is not in the visible part of this file)
const char* description() const;
};
// Compile-time "disabled" tag: template code tests opt_false::value to compile a feature out.
struct opt_false
{
    enum { value = 0 };
};
// Compile-time "enabled" tag: template code tests opt_true::value to compile a feature in.
struct opt_true
{
    enum { value = 1 };
};
struct gap
{
char_t* end;
size_t size;
gap() : end(0), size(0) {}
// Push new gap, move s count bytes further (skipping the gap).
// Collapse previous gap.
void push(char_t*& s, size_t count)
{
if (end) // there was a gap already; collapse it
{
// Move [old_gap_end, new_gap_start) to [old_gap_start, ...)
assert(s >= end);
memmove(end - size, end, reinterpret_cast<char*>(s) - reinterpret_cast<char*>(end));
}
s += count; // end of current gap
// "merge" two gaps
end = s;
size += count;
}
// Collapse all gaps, return past-the-end pointer
char_t* flush(char_t* s)
{
if (end)
{
// Move [old_gap_end, current_pos) to [old_gap_start, ...)
assert(s >= end);
memmove(end - size, end, reinterpret_cast<char*>(s) - reinterpret_cast<char*>(end));
return s - size;
}
else
return s;
}
};
template <typename opt_trim, typename opt_eol, typename opt_escape> struct strconv_pcdata_impl
{
static char_t* parse(char_t* s)
{
gap g;
char_t* begin = s;
while (true)
{
XSXML__SCANWHILE_UNROLL(!XSXML__IS_CHARTYPE(ss, ct_parse_pcdata));
if (*s == '<') // PCDATA ends here
{
char_t* end = g.flush(s);
if (opt_trim::value)
while (end > begin && XSXML__IS_CHARTYPE(end[-1], ct_space))
--end;
*end = 0;
return s + 1;
}
else if (opt_eol::value && *s == '\r') // Either a single 0x0d or 0x0d 0x0a pair
{
*s++ = '\n'; // replace first one with 0x0a
if (*s == '\n')
g.push(s, 1);
}
else if (opt_escape::value && *s == '&')
{
s = strconv_escape(s, g);
}
else if (*s == 0)
{
char_t* end = g.flush(s);
if (opt_trim::value)
while (end > begin && XSXML__IS_CHARTYPE(end[-1], ct_space))
--end;
*end = 0;
return s;
}
else
++s;
}
}
};
// Selects the PCDATA converter matching the escape / eol / trim bits of the option mask.
XSXML__DECL strconv_pcdata_t get_strconv_pcdata(unsigned int optmask)
{
    XSXML__STATIC_ASSERT(parse_escapes == 0x10 && parse_eol == 0x20 && parse_trim_pcdata == 0x0800);
    typedef opt_false off;
    typedef opt_true on;
    // Indexed by a 3-bit key: bit 0 = escapes, bit 1 = eol, bit 2 = trim.
    // Template arguments are ordered <trim, eol, escape>.
    static const strconv_pcdata_t converters[8] = {
        strconv_pcdata_impl<off, off, off>::parse, strconv_pcdata_impl<off, off, on>::parse,
        strconv_pcdata_impl<off, on, off>::parse,  strconv_pcdata_impl<off, on, on>::parse,
        strconv_pcdata_impl<on, off, off>::parse,  strconv_pcdata_impl<on, off, on>::parse,
        strconv_pcdata_impl<on, on, off>::parse,   strconv_pcdata_impl<on, on, on>::parse};
    const unsigned int escape_eol_bits = (optmask >> 4) & 3;
    const unsigned int trim_bit = (optmask >> 9) & 4;
    return converters[escape_eol_bits | trim_bit];
}
// Encodes code points as UTF-8 into a caller-supplied byte buffer. Every function returns the
// position just past the bytes it wrote; no bounds or validity checks are performed.
struct utf8_writer
{
    typedef uint8_t* value_type;

    // Writes a code point below 0x10000 using one, two or three bytes.
    static value_type low(value_type result, uint32_t ch)
    {
        if (ch < 0x80) // U+0000..U+007F
        {
            result[0] = static_cast<uint8_t>(ch);
            return result + 1;
        }
        if (ch < 0x800) // U+0080..U+07FF
        {
            const uint32_t lead = 0xC0 | (ch >> 6);
            const uint32_t tail = 0x80 | (ch & 0x3F);
            result[0] = static_cast<uint8_t>(lead);
            result[1] = static_cast<uint8_t>(tail);
            return result + 2;
        }
        // U+0800..U+FFFF
        const uint32_t lead = 0xE0 | (ch >> 12);
        const uint32_t mid = 0x80 | ((ch >> 6) & 0x3F);
        const uint32_t tail = 0x80 | (ch & 0x3F);
        result[0] = static_cast<uint8_t>(lead);
        result[1] = static_cast<uint8_t>(mid);
        result[2] = static_cast<uint8_t>(tail);
        return result + 3;
    }

    // Writes a code point of 0x10000 or above using four bytes (U+10000..U+10FFFF).
    static value_type high(value_type result, uint32_t ch)
    {
        uint32_t bits = ch;
        for (int i = 3; i > 0; --i)
        {
            result[i] = static_cast<uint8_t>(0x80 | (bits & 0x3F));
            bits >>= 6;
        }
        result[0] = static_cast<uint8_t>(0xF0 | bits);
        return result + 4;
    }

    // Dispatches to low() or high() depending on the magnitude of the code point.
    static value_type any(value_type result, uint32_t ch)
    {
        if (ch < 0x10000)
            return low(result, ch);
        return high(result, ch);
    }
};
// Expands one character or entity reference in place.
// s must point at the '&' that starts the reference; g is the caller's gap tracker.
// Recognised forms: &#NNN; (decimal), &#xHHH; (hex), &amp; &apos; &gt; &lt; &quot;
// On success the decoded character(s) are written starting at s, the unused remainder of the
// reference is registered as a gap, and the position just past the ';' is returned.
// When the text is not a recognised reference nothing is modified and the position where
// matching stopped is returned (always past the '&'), so the caller continues scanning there.
XSXML__DECL char_t* strconv_escape(char_t* s, gap& g)
{
char_t* stre = s + 1;
switch (*stre)
{
case '#': // &#...
{
// Accumulated code point; unsigned arithmetic, so overlong digit sequences wrap around.
unsigned int ucsc = 0;
if (stre[1] == 'x') // &#x... (hex code)
{
stre += 2;
char_t ch = *stre;
// "&#x;" carries no digits: not a reference
if (ch == ';')
return stre;
for (;;)
{
if (static_cast<unsigned int>(ch - '0') <= 9)
ucsc = 16 * ucsc + (ch - '0');
// (ch | ' ') folds upper-case ASCII letters to lower case
else if (static_cast<unsigned int>((ch | ' ') - 'a') <= 5)
ucsc = 16 * ucsc + ((ch | ' ') - 'a' + 10);
else if (ch == ';')
break;
else // cancel
return stre;
ch = *++stre;
}
++stre;
}
else // &#... (dec code)
{
char_t ch = *++stre;
// "&#;" carries no digits: not a reference
if (ch == ';')
return stre;
for (;;)
{
if (static_cast<unsigned int>(static_cast<unsigned int>(ch) - '0') <= 9)
ucsc = 10 * ucsc + (ch - '0');
else if (ch == ';')
break;
else // cancel
return stre;
ch = *++stre;
}
++stre;
}
// Write the UTF-8 encoding over the start of the reference; s ends up just past it.
s = reinterpret_cast<char_t*>(utf8_writer::any(reinterpret_cast<uint8_t*>(s), ucsc));
// The rest of the reference text, up to stre, is now unused.
g.push(s, stre - s);
return stre;
}
case 'a': // &a
{
++stre;
if (*stre == 'm') // &am
{
if (*++stre == 'p' && *++stre == ';') // &amp;
{
*s++ = '&';
++stre;
g.push(s, stre - s);
return stre;
}
}
else if (*stre == 'p') // &ap
{
if (*++stre == 'o' && *++stre == 's' && *++stre == ';') // &apos;
{
*s++ = '\'';
++stre;
g.push(s, stre - s);
return stre;
}
}
break;
}
case 'g': // &g
{
if (*++stre == 't' && *++stre == ';') // &gt;
{
*s++ = '>';
++stre;
g.push(s, stre - s);
return stre;
}
break;
}
case 'l': // &l
{
if (*++stre == 't' && *++stre == ';') // &lt;
{
*s++ = '<';
++stre;
g.push(s, stre - s);
return stre;
}
break;
}
case 'q': // &q
{
if (*++stre == 'u' && *++stre == 'o' && *++stre == 't' && *++stre == ';') // &quot;
{
*s++ = '"';
++stre;
g.push(s, stre - s);
return stre;
}
break;
}
default:
break;
}
// Not a recognised reference: leave the text untouched.
return stre;
}
// In-place attribute value converters. opt_escape selects at compile time whether character and
// entity references are expanded. All four functions share one contract: s points at the first
// character of the value, end_quote is the quote character that terminates it; the value is
// converted and null-terminated in place; the return value is the position just past the
// closing quote, or null if the buffer's null terminator is reached before the closing quote.
template <typename opt_escape> struct strconv_attribute_impl
{
// Whitespace-normalising conversion: leading and trailing whitespace is removed and every
// inner run of whitespace is replaced by a single space.
static char_t* parse_wnorm(char_t* s, char_t end_quote)
{
gap g;
// trim leading whitespaces
if (XSXML__IS_CHARTYPE(*s, ct_space))
{
char_t* str = s;
do
++str;
while (XSXML__IS_CHARTYPE(*str, ct_space));
g.push(s, str - s);
}
while (true)
{
XSXML__SCANWHILE_UNROLL(!XSXML__IS_CHARTYPE(ss, ct_parse_attr_ws | ct_space));
if (*s == end_quote)
{
char_t* str = g.flush(s);
// Terminate the value, then walk backwards zeroing trailing whitespace.
// NOTE(review): the loop has no lower bound; for an empty value it inspects the
// character in front of the value -- presumably the opening quote, which stops it.
do
*str-- = 0;
while (XSXML__IS_CHARTYPE(*str, ct_space));
return s + 1;
}
else if (XSXML__IS_CHARTYPE(*s, ct_space))
{
// First whitespace character of a run becomes a space, the rest of the run is dropped.
*s++ = ' ';
if (XSXML__IS_CHARTYPE(*s, ct_space))
{
char_t* str = s + 1;
while (XSXML__IS_CHARTYPE(*str, ct_space))
++str;
g.push(s, str - s);
}
}
else if (opt_escape::value && *s == '&')
{
s = strconv_escape(s, g);
}
else if (!*s)
{
return 0;
}
else
++s;
}
}
// Whitespace-converting conversion: every whitespace character becomes a space, with a
// "\r\n" pair producing a single space.
static char_t* parse_wconv(char_t* s, char_t end_quote)
{
gap g;
while (true)
{
XSXML__SCANWHILE_UNROLL(!XSXML__IS_CHARTYPE(ss, ct_parse_attr_ws));
if (*s == end_quote)
{
*g.flush(s) = 0;
return s + 1;
}
else if (XSXML__IS_CHARTYPE(*s, ct_space))
{
if (*s == '\r')
{
*s++ = ' ';
if (*s == '\n')
g.push(s, 1);
}
else
*s++ = ' ';
}
else if (opt_escape::value && *s == '&')
{
s = strconv_escape(s, g);
}
else if (!*s)
{
return 0;
}
else
++s;
}
}
// End-of-line normalising conversion: "\r" and "\r\n" become "\n", other characters are kept.
static char_t* parse_eol(char_t* s, char_t end_quote)
{
gap g;
while (true)
{
XSXML__SCANWHILE_UNROLL(!XSXML__IS_CHARTYPE(ss, ct_parse_attr));
if (*s == end_quote)
{
*g.flush(s) = 0;
return s + 1;
}
else if (*s == '\r')
{
*s++ = '\n';
if (*s == '\n')
g.push(s, 1);
}
else if (opt_escape::value && *s == '&')
{
s = strconv_escape(s, g);
}
else if (!*s)
{
return 0;
}
else
++s;
}
}
// Plain conversion: no whitespace handling; only reference expansion when opt_escape is on.
static char_t* parse_simple(char_t* s, char_t end_quote)
{
gap g;
while (true)
{
XSXML__SCANWHILE_UNROLL(!XSXML__IS_CHARTYPE(ss, ct_parse_attr));
if (*s == end_quote)
{
*g.flush(s) = 0;
return s + 1;
}
else if (opt_escape::value && *s == '&')
{
s = strconv_escape(s, g);
}
else if (!*s)
{
return 0;
}
else
++s;
}
}
};
// Builds an xml_parse_result carrying the given status and input offset (offset defaults to 0).
XSXML__DECL xml_parse_result make_parse_result(xml_parse_status status, ptrdiff_t offset = 0)
{
    xml_parse_result outcome; // default-constructed, then both fields are overwritten
    outcome.offset = offset;
    outcome.status = status;
    return outcome;
}
// Converts the body of a comment in place: "\r" and "\r\n" are normalised to "\n" and the text
// is null-terminated at the closing "--".
// s points at the first character after "<!--"; endch is the character that was replaced by
// the buffer's null terminator (read by XSXML__ENDSWITH).
// Returns the position just past the closing "-->" (past "--" when the final '>' was the
// replaced end character), or null if the terminator is reached without a closing sequence.
XSXML__DECL char_t* strconv_comment(char_t* s, char_t endch)
{
gap g;
while (true)
{
// Skip ahead to the next \0, '-', '>' or \r.
XSXML__SCANWHILE_UNROLL(!XSXML__IS_CHARTYPE(ss, ct_parse_comment));
if (*s == '\r') // Either a single 0x0d or 0x0d 0x0a pair
{
*s++ = '\n'; // replace first one with 0x0a
if (*s == '\n')
g.push(s, 1);
}
else if (s[0] == '-' && s[1] == '-' && XSXML__ENDSWITH(s[2], '>')) // comment ends here
{
*g.flush(s) = 0;
return s + (s[2] == '>' ? 3 : 2);
}
else if (*s == 0)
{
return 0;
}
else
++s;
}
}
// Converts the body of a CDATA section in place: "\r" and "\r\n" are normalised to "\n" and the
// text is null-terminated at the closing "]]".
// s points at the first character after "<![CDATA["; endch is the character that was replaced
// by the buffer's null terminator (read by XSXML__ENDSWITH).
// Returns the position of the second ']' of the closing sequence (the caller steps over the
// remaining "]>"), or null if the terminator is reached without a closing sequence.
XSXML__DECL char_t* strconv_cdata(char_t* s, char_t endch)
{
gap g;
while (true)
{
// Skip ahead to the next \0, ']', '>' or \r.
XSXML__SCANWHILE_UNROLL(!XSXML__IS_CHARTYPE(ss, ct_parse_cdata));
if (*s == '\r') // Either a single 0x0d or 0x0d 0x0a pair
{
*s++ = '\n'; // replace first one with 0x0a
if (*s == '\n')
g.push(s, 1);
}
else if (s[0] == ']' && s[1] == ']' && XSXML__ENDSWITH(s[2], '>')) // CDATA ends here
{
*g.flush(s) = 0;
return s + 1;
}
else if (*s == 0)
{
return 0;
}
else
++s;
}
}
// Selects the attribute value converter matching the escape / eol / wconv / wnorm bits of the
// option mask. wnorm takes precedence over wconv, which takes precedence over eol.
XSXML__DECL strconv_attribute_t get_strconv_attribute(unsigned int optmask)
{
    XSXML__STATIC_ASSERT(parse_escapes == 0x10 && parse_eol == 0x20 &&
                         parse_wconv_attribute == 0x40 && parse_wnorm_attribute == 0x80);
    typedef strconv_attribute_impl<opt_false> raw; // references left untouched
    typedef strconv_attribute_impl<opt_true> esc;  // references expanded
    // Indexed by a 4-bit key: bit 0 = escapes, bit 1 = eol, bit 2 = wconv, bit 3 = wnorm.
    static const strconv_attribute_t converters[16] = {
        raw::parse_simple, esc::parse_simple, // 0, 1
        raw::parse_eol,    esc::parse_eol,    // 2, 3
        raw::parse_wconv,  esc::parse_wconv,  // 4, 5
        raw::parse_wconv,  esc::parse_wconv,  // 6, 7
        raw::parse_wnorm,  esc::parse_wnorm,  // 8, 9
        raw::parse_wnorm,  esc::parse_wnorm,  // 10, 11
        raw::parse_wnorm,  esc::parse_wnorm,  // 12, 13
        raw::parse_wnorm,  esc::parse_wnorm}; // 14, 15
    const unsigned int key = (optmask >> 4) & 15;
    return converters[key];
}
// Returns s advanced past a leading UTF-8 byte order mark (EF BB BF), or s itself when the
// buffer does not start with one. Bytes are compared left to right and the comparison stops at
// the first mismatch.
static char_t* parse_skip_bom(char_t* s)
{
    const bool starts_with_bom = s[0] == '\xef' && s[1] == '\xbb' && s[2] == '\xbf';
    if (starts_with_bom)
        return s + 3;
    return s;
}
// Simple string view
class string_view
{
public:
string_view() : _Mystr(nullptr), _Mysize(0) {}
string_view(char_t* str, size_t size) : _Mystr(str), _Mysize(size) {}
const char* c_str() const { return _Mystr != nullptr ? _Mystr : ""; }
size_t length() const { return _Mysize; }
bool empty() const { return _Mysize == 0; }
private:
char_t* _Mystr;
size_t _Mysize;
};
// The sax3 parse callbacks
// Set of callbacks the parser invokes while it walks the document. Names and values are passed
// as pointer + length pairs that point into the buffer being parsed.
// NOTE(review): the parser calls these without checking that they are set, so every callback
// that can be reached presumably has to be assigned before parsing -- confirm with callers.
struct xml_sax3_parse_cb
{
// Start tag name found. (Takes a non-const char*, unlike the other callbacks.)
std::function<void(char* name, size_t)> xml_start_element_cb;
// One attribute (name, name length, value, value length) of the current start tag.
std::function<void(const char* name, size_t, const char* value, size_t)> xml_attr_cb;
// End of the current start tag's attribute list.
std::function<void()> xml_end_attr_cb;
// Element closed (name, name length).
std::function<void(const char* name, size_t)> xml_end_element_cb;
// Character data (text, length).
std::function<void(const char* text, size_t len)> xml_text_cb;
};
/////////////// xml_sax3_parser ///////////
struct xml_sax3_parser
{
// xml_allocator alloc;
// Position in the buffer of the most recent error (written by XSXML__THROW_ERROR); null if none.
char_t* error_offset;
// Status of the most recent error; status_ok while no error has been reported.
xml_parse_status error_status;
// Callbacks invoked during parsing. The pointer is stored as given and is not released here.
xml_sax3_parse_cb* handler;
// Creates a parser that reports to handler_ and starts in the "no error" state.
// The initializers are listed in member declaration order, which is the order in which they
// actually run (and avoids the compiler's reorder warning).
xml_sax3_parser(xml_sax3_parse_cb* handler_)
    : error_offset(0), error_status(status_ok), handler(handler_)
{}
~xml_sax3_parser()
{
    // *alloc_state = alloc;
}
// DOCTYPE consists of nested sections of the following possible types:
// <!-- ... -->, <? ... ?>, "...", '...'
// <![...]]>
// <!...>
// First group can not contain nested groups
// Second group can contain nested groups of the same type
// Third group can contain all other groups
//
// parse_doctype_primitive skips one section of the first group (quoted string, <? ... ?> or
// <!-- ... -->) starting at s.
// Returns the position just past the section. On an unterminated section, or when s does not
// start any of these forms, records status_bad_doctype and returns null.
char_t* parse_doctype_primitive(char_t* s)
{
if (*s == '"' || *s == '\'')
{
// quoted string
// ch remembers which quote opened the string so the other quote kind may appear inside
char_t ch = *s++;
XSXML__SCANFOR(*s == ch);
if (!*s)
XSXML__THROW_ERROR(status_bad_doctype, s);
s++;
}
else if (s[0] == '<' && s[1] == '?')
{
// <? ... ?>
s += 2;
XSXML__SCANFOR(s[0] == '?' &&
s[1] == '>'); // no need for ENDSWITH because ?> can't terminate proper doctype
if (!*s)
XSXML__THROW_ERROR(status_bad_doctype, s);
s += 2;
}
else if (s[0] == '<' && s[1] == '!' && s[2] == '-' && s[3] == '-')
{
// <!-- ... -->
s += 4;
XSXML__SCANFOR(s[0] == '-' && s[1] == '-' &&
s[2] ==
'>'); // no need for ENDSWITH because --> can't terminate proper doctype
if (!*s)
XSXML__THROW_ERROR(status_bad_doctype, s);
s += 3;
}
else
XSXML__THROW_ERROR(status_bad_doctype, s);
return s;
}
// Skips a "<![ ... ]]>" section of a DOCTYPE, including any sections of the same kind nested
// inside it. s must point at the opening "<![".
// Returns the position just past the matching "]]>"; if the buffer ends first, records
// status_bad_doctype and returns null.
char_t* parse_doctype_ignore(char_t* s)
{
    assert(s[0] == '<' && s[1] == '!' && s[2] == '[');
    s += 3;
    size_t nesting = 0; // number of inner sections currently open
    for (; *s != 0;)
    {
        const bool opens = s[0] == '<' && s[1] == '!' && s[2] == '[';
        const bool closes = !opens && s[0] == ']' && s[1] == ']' && s[2] == '>';
        if (!opens && !closes)
        {
            ++s;
            continue;
        }
        s += 3; // both markers are three characters long
        if (opens)
        {
            // nested ignore section
            ++nesting;
            continue;
        }
        // ignore section end
        if (nesting == 0)
            return s;
        --nesting;
    }
    XSXML__THROW_ERROR(status_bad_doctype, s);
}
// Skips a "<! ... >" group of a DOCTYPE together with everything nested in it.
// s must point at the opening "<!" (the '<' may already have been overwritten by a null
// character, as the assertion allows); endch is the character that was replaced by the buffer's
// null terminator.
// Returns the position OF the closing '>' (not past it), or the position of the null terminator
// when the buffer ends at nesting depth 0 and endch is '>'. On any other premature end, or when
// a nested section is malformed, an error is recorded and null is returned.
char_t* parse_doctype_group(char_t* s, char_t endch)
{
// number of nested "<!...>" groups currently open
size_t depth = 0;
assert((s[0] == '<' || s[0] == 0) && s[1] == '!');
s += 2;
while (*s)
{
if (s[0] == '<' && s[1] == '!' && s[2] != '-')
{
if (s[2] == '[')
{
// ignore
s = parse_doctype_ignore(s);
if (!s)
return s;
}
else
{
// some control group
s += 2;
depth++;
}
}
else if (s[0] == '<' || s[0] == '"' || s[0] == '\'')
{
// unknown tag (forbidden), or some primitive group
s = parse_doctype_primitive(s);
if (!s)
return s;
}
else if (*s == '>')
{
// closes the outermost group when depth is 0, otherwise one nested group
if (depth == 0)
return s;
depth--;
s++;
}
else
s++;
}
// Reached the end of the buffer: acceptable only if nothing is left open and the final '>'
// was the character replaced by the terminator.
if (depth != 0 || endch != '>')
XSXML__THROW_ERROR(status_bad_doctype, s);
return s;
}
// Parses a construct that starts with "<!": a comment, a CDATA section or a DOCTYPE.
// s points at the '!'; optmsk is the parse option mask; endch is the character that was
// replaced by the buffer's null terminator.
// None of these constructs is reported to the SAX handler here; the text is only validated,
// optionally converted in place, and skipped.
// Returns the position just past the construct, or null after recording an error.
char_t* parse_exclamation(char_t* s, unsigned int optmsk, char_t endch)
{
// parse node contents, starting with exclamation mark
++s;
if (*s == '-') // '<!-...'
{
++s;
if (*s == '-') // '<!--...'
{
++s;
// start of the comment text; only set (and only used for error reporting) with parse_comments
char_t* value = nullptr;
if (XSXML__OPTSET(parse_comments))
{
// SAX3: Ignore comment.
// XSXML__PUSHNODE(node_comment); // Append a new node on the tree.
// cursor->value = s; // Save the offset.
value = s;
}
if (XSXML__OPTSET(parse_eol) && XSXML__OPTSET(parse_comments))
{
// convert line endings in place and find the end of the comment
s = strconv_comment(s, endch);
if (!s)
XSXML__THROW_ERROR(status_bad_comment, value);
}
else
{
// Scan for terminating '-->'.
XSXML__SCANFOR(s[0] == '-' && s[1] == '-' && XSXML__ENDSWITH(s[2], '>'));
XSXML__CHECK_ERROR(status_bad_comment, s);
if (XSXML__OPTSET(parse_comments))
*s = 0; // Zero-terminate this segment at the first terminating '-'.
s += (s[2] == '>' ? 3 : 2); // Step over the '\0->'.
}
}
else
XSXML__THROW_ERROR(status_bad_comment, s);
}
else if (*s == '[')
{
// '<![CDATA[...'
if (*++s == 'C' && *++s == 'D' && *++s == 'A' && *++s == 'T' && *++s == 'A' && *++s == '[')
{
++s;
if (XSXML__OPTSET(parse_cdata))
{
// SAX3: Ignore CDATA
// XSXML__PUSHNODE(node_cdata); // Append a new node on the tree.
auto value = s; // Save the offset.
if (XSXML__OPTSET(parse_eol))
{
s = strconv_cdata(s, endch);
if (!s)
XSXML__THROW_ERROR(status_bad_cdata, value);
}
else
{
// Scan for terminating ']]>'.
XSXML__SCANFOR(s[0] == ']' && s[1] == ']' && XSXML__ENDSWITH(s[2], '>'));
XSXML__CHECK_ERROR(status_bad_cdata, s);
*s++ = 0; // Zero-terminate this segment.
}
}
else // Flagged for discard, but we still have to scan for the terminator.
{
// Scan for terminating ']]>'.
XSXML__SCANFOR(s[0] == ']' && s[1] == ']' && XSXML__ENDSWITH(s[2], '>'));
XSXML__CHECK_ERROR(status_bad_cdata, s);
++s;
}
// In all three paths above s now points at the second ']' of the terminator.
s += (s[1] == '>' ? 2 : 1); // Step over the last ']>'.
}
else
XSXML__THROW_ERROR(status_bad_cdata, s);
}
else if (s[0] == 'D' && s[1] == 'O' && s[2] == 'C' && s[3] == 'T' && s[4] == 'Y' &&
s[5] == 'P' && XSXML__ENDSWITH(s[6], 'E'))
{
// step back to the '<' because parse_doctype_group expects to start at "<!"
s -= 2;
// TODO: check for doctype, parent must be nullptr
// if (cursor->parent) XSXML__THROW_ERROR(status_bad_doctype, s);
// first character after "<!DOCTYPE"
char_t* mark = s + 9;
s = parse_doctype_group(s, endch);
if (!s)
return s;
assert((*s == 0 && endch == '>') || *s == '>');
// overwrite the closing '>' with a terminator and step past it
if (*s)
*s++ = 0;
if (XSXML__OPTSET(parse_doctype))
{
// mark is advanced to the doctype text but not used any further in this SAX variant
while (XSXML__IS_CHARTYPE(*mark, ct_space))
++mark;
// SAX3: Ignore doctype
// XSXML__PUSHNODE(node_doctype);
// cursor->value = mark;
}
}
// The buffer ended right after "<!": choose the error from the character that was replaced.
else if (*s == 0 && endch == '-')
XSXML__THROW_ERROR(status_bad_comment, s);
else if (*s == 0 && endch == '[')
XSXML__THROW_ERROR(status_bad_cdata, s);
else
XSXML__THROW_ERROR(status_unrecognized_tag, s);
return s;
}
// Parses a construct that starts with "<?": the XML declaration or a processing instruction.
// s points at the '?'; optmsk is the parse option mask; endch is the character that was
// replaced by the buffer's null terminator.
// Nothing is reported to the SAX handler here. Returns null after recording status_bad_pi on
// malformed input. Otherwise returns the position just past the closing "?>", except for a
// declaration with content while parse_declaration is set: then the closing '?' is replaced by
// '/' and the position of the first character of the content is returned.
char_t* parse_question(char_t* s, unsigned int optmsk, char_t endch)
{
// load into registers
// xml_node_struct* cursor = ref_cursor;
// receives the character overwritten by XSXML__ENDSEG
char_t ch = 0;
// parse node contents, starting with question mark
++s;
// read PI target
char_t* target = s;
if (!XSXML__IS_CHARTYPE(*s, ct_start_symbol))
XSXML__THROW_ERROR(status_bad_pi, s);
XSXML__SCANWHILE(XSXML__IS_CHARTYPE(*s, ct_symbol));
XSXML__CHECK_ERROR(status_bad_pi, s);
// determine node type; stricmp / strcasecmp is not portable
// ((c | ' ') folds ASCII letters to lower case; the target must be exactly three characters)
bool declaration = (target[0] | ' ') == 'x' && (target[1] | ' ') == 'm' &&
(target[2] | ' ') == 'l' && target + 3 == s;
if (declaration ? XSXML__OPTSET(parse_declaration) : XSXML__OPTSET(parse_pi))
{
if (declaration)
{
// TODO: disallow non top-level declarations
// if (cursor->parent) XSXML__THROW_ERROR(status_bad_pi, s);
// SAX3: Ignore declaration.
// XSXML__PUSHNODE(node_declaration);
}
else
{
// SAX3: Ignore pi.
// XSXML__PUSHNODE(node_pi);
}
// terminate the target name; ch is the character that followed it
XSXML__ENDSEG();
// parse value/attributes
if (ch == '?')
{
// empty node
if (!XSXML__ENDSWITH(*s, '>'))
XSXML__THROW_ERROR(status_bad_pi, s);
s += (*s == '>');
// XSXML__POPNODE();
}
else if (XSXML__IS_CHARTYPE(ch, ct_space))
{
XSXML__SKIPWS();
// scan for tag end
char_t* value = s;
XSXML__SCANFOR(s[0] == '?' && XSXML__ENDSWITH(s[1], '>'));
XSXML__CHECK_ERROR(status_bad_pi, s);
if (declaration)
{
// replace ending ? with / so that 'element' terminates properly
*s = '/';
// we exit from this function with cursor at node_declaration, which is a signal to
// parse() to go to LOC_ATTRIBUTES
// NOTE(review): the comment above refers to the DOM parser's cursor; how the caller
// continues from the returned position is not visible here -- confirm in parse_tree.
s = value;
}
else
{
// store value and step over >
// cursor->value = value;
// XSXML__POPNODE();
XSXML__ENDSEG();
s += (*s == '>');
}
}
else
XSXML__THROW_ERROR(status_bad_pi, s);
}
else
{
// The construct is not wanted: just skip to its end.
// scan for tag end
XSXML__SCANFOR(s[0] == '?' && XSXML__ENDSWITH(s[1], '>'));
XSXML__CHECK_ERROR(status_bad_pi, s);
s += (s[1] == '>' ? 2 : 1);
}
// store from registers
// ref_cursor = cursor;
return s;
}
// Fixed-capacity LIFO container backed by an in-object array (no dynamic allocation).
// Overflow and underflow are tolerated silently: push() on a full stack discards the value and
// pop() on an empty stack returns a value-initialized element.
template <typename T, size_t Capacity> struct fixed_stack
{
public:
    fixed_stack() : size_(0) {}

    // Stores a copy of val on top of the stack; does nothing when the stack is full.
    void push(const T& val)
    {
        if (size_ >= Capacity)
            return;
        elements_[size_] = val;
        ++size_;
    }

    // Removes and returns the top element, or T{} when the stack is empty.
    T pop()
    {
        if (size_ == 0)
            return T{};
        --size_;
        return elements_[size_];
    }

private:
    T elements_[Capacity];
    size_t size_; // number of elements currently stored
};
// Non-recursive, SAX-style scan of a zero-terminated XML buffer.
//
// Walks `s` until the terminating 0 and reports what it finds through `handler`:
//   - xml_start_element_cb(name, len)        an element name was read after '<'
//   - xml_attr_cb(name, nlen, value, vlen)   one attribute of the current start tag
//   - xml_end_attr_cb()                      the start tag is finished
//   - xml_end_element_cb(name, len)          '</name>' or a self-closing tag
//   - xml_text_cb(text, len)                 character data (only when parse_fragment is NOT set)
//
// Parameters:
//   s      - start of the (mutable) buffer; XSXML__ENDSEG() writes terminators into it.
//   optmsk - parse option bitmask, tested through XSXML__OPTSET.
//   endch  - the character that originally occupied the buffer's last position before parse()
//            replaced it with 0; used to accept a tag whose closing '>' was that last character.
//
// Returns the position at which scanning stopped. A null result from parse_question() /
// parse_exclamation() is propagated to the caller. XSXML__THROW_ERROR is defined outside this
// block; presumably it records the status/offset and returns from this function - confirm.
//
// Names of currently open elements are kept on a fixed-capacity stack instead of recursing.
// End-tag names are not checked against the matching start tag.
char_t* parse_tree(char_t* s, unsigned int optmsk, char_t endch)
{
    strconv_attribute_t strconv_attribute = get_strconv_attribute(optmsk);
    strconv_pcdata_t strconv_pcdata       = get_strconv_pcdata(optmsk);
    char_t ch     = 0;        // character consumed by the last XSXML__ENDSEG()
    char_t* mark  = s;        // start of the token currently being scanned
    char_t* value = nullptr;  // start of the current attribute value
    size_t n      = 0;        // length of the current attribute name
    fixed_stack<string_view, parse_max_deep> stk; // 4K on 32bits, 6K on 64bits
    while (*s != 0)
    {
        if (*s == '<')
        {
            ++s;
        LOC_TAG:
            if (XSXML__IS_CHARTYPE(*s, ct_start_symbol)) // '<#...'
            {
                // SAX3: TODO: xmlStartElement.
                // XSXML__PUSHNODE(node_element); // Append a new node to the tree.
                mark = s;
                XSXML__SCANWHILE_UNROLL(XSXML__IS_CHARTYPE(ss, ct_symbol)); // Scan for a terminator.
                handler->xml_start_element_cb(mark, s - mark);
                // Remember the element name so self-closing tags can report it later.
                stk.push(::xsxml::string_view(mark, s - mark));
                XSXML__ENDSEG(); // Save char in 'ch', terminate & step over.
                if (ch == '>')
                {
                    handler->xml_end_attr_cb(); // end of tag
                }
                else if (XSXML__IS_CHARTYPE(ch, ct_space))
                {
                    while (true)
                    {                    // parse attributes
                        XSXML__SKIPWS(); // Eat any whitespace.
                        if (XSXML__IS_CHARTYPE(*s, ct_start_symbol)) // <... #...
                        {
                            // SAX3: TODO: implement attribute.
                            // xml_attribute_struct* a = append_new_attribute(cursor, alloc); // Make space for
                            // this attribute. if (!a) XSXML__THROW_ERROR(status_out_of_memory, s);
                            mark = s; // Save the offset.
                            XSXML__SCANWHILE_UNROLL(
                                XSXML__IS_CHARTYPE(ss, ct_symbol)); // Scan for a terminator.
                            n = s - mark; // Attribute name length, taken before the terminator is written.
                            XSXML__ENDSEG(); // Save char in 'ch', terminate & step over.
                            if (XSXML__IS_CHARTYPE(ch, ct_space))
                            {
                                XSXML__SKIPWS(); // Eat any whitespace.
                                ch = *s;
                                ++s;
                            }
                            if (ch == '=') // '<... #=...'
                            {
                                XSXML__SKIPWS(); // Eat any whitespace.
                                if (*s == '"' || *s == '\'') // '<... #="...'
                                {
                                    ch = *s;   // Save quote char to avoid breaking on "''" -or- '""'.
                                    ++s;       // Step over the quote.
                                    value = s; // a->value = s; // Save the offset.
                                    s = strconv_attribute(s, ch);
                                    if (!s)
                                        XSXML__THROW_ERROR(status_bad_attribute, value);
                                    // After this line the loop continues from the start;
                                    // Whitespaces, / and > are ok, symbols and EOF are wrong,
                                    // everything else will be detected
                                    if (XSXML__IS_CHARTYPE(*s, ct_start_symbol))
                                        XSXML__THROW_ERROR(status_bad_attribute, s);
                                    handler->xml_attr_cb(mark, n, value, s - value - 1);
                                }
                                else
                                    XSXML__THROW_ERROR(status_bad_attribute, s);
                            }
                            else
                                XSXML__THROW_ERROR(status_bad_attribute, s);
                        }
                        else if (*s == '/')
                        {
                            ++s;
                            if (*s == '>')
                            {
                                // '<name attr... />': close the element using the name saved on the stack.
                                auto ele_name = stk.pop();
                                handler->xml_end_attr_cb();
                                handler->xml_end_element_cb(ele_name.c_str(), ele_name.length());
                                ++s;
                                break;
                            }
                            else if (*s == 0 && endch == '>')
                            {
                                // Same as above, but the closing '>' was the buffer's original last character.
                                auto ele_name = stk.pop();
                                handler->xml_end_attr_cb();
                                handler->xml_end_element_cb(ele_name.c_str(), ele_name.length());
                                break;
                            }
                            else
                                XSXML__THROW_ERROR(status_bad_start_element, s);
                        }
                        else if (*s == '>')
                        {
                            ++s;
                            handler->xml_end_attr_cb();
                            break;
                        }
                        else if (*s == 0 && endch == '>')
                        {
                            break;
                        }
                        else
                            XSXML__THROW_ERROR(status_bad_start_element, s);
                    }
                    // !!!
                }
                else if (ch == '/') // '<#.../'
                {
                    if (!XSXML__ENDSWITH(*s, '>'))
                        XSXML__THROW_ERROR(status_bad_start_element, s);
                    stk.pop();
                    // XSXML__ENDSEG() already stepped past the '/', so the element name ends one
                    // character before 's'; exclude that character from the reported length.
                    handler->xml_end_element_cb(mark, s - mark - 1);
                    // NOTE(review): unlike the '/>' handling in the attribute loop above, no
                    // xml_end_attr_cb() is emitted for an attribute-less self-closing tag - confirm
                    // whether handlers expect it.
                    s += (*s == '>');
                }
                else if (ch == 0)
                {
                    // we stepped over null terminator, backtrack & handle closing tag
                    --s;
                    if (endch != '>')
                        XSXML__THROW_ERROR(status_bad_start_element, s);
                }
                else
                    XSXML__THROW_ERROR(status_bad_start_element, s);
            }
            else if (*s == '/')
            {
                ++s;
                mark = s;
                // SAX3, we don't check end element name
                while (XSXML__IS_CHARTYPE(*s, ct_symbol))
                    ++s;
                stk.pop();
                handler->xml_end_element_cb(mark, s - mark);
                XSXML__SKIPWS();
                if (*s == 0)
                {
                    // End of buffer: only valid if the original last character was the closing '>'.
                    if (endch != '>')
                        XSXML__THROW_ERROR(status_bad_end_element, s);
                }
                else
                {
                    if (*s != '>')
                        XSXML__THROW_ERROR(status_bad_end_element, s);
                    ++s;
                }
            }
            else if (*s == '?') // '<?...'
            {
                // SAX3: TODO: parse question.
                s = parse_question(s, optmsk, endch);
                if (!s)
                    return s;
                // assert(cursor);
                // if (XSXML__NODETYPE(cursor) == node_declaration) goto LOC_ATTRIBUTES;
                // goto LOC_ATTRIBUTES; // SAX3: always regard as a valid node_declaration
            }
            else if (*s == '!') // '<!...'
            {
                // SAX3: TODO: parse exclamation.
                s = parse_exclamation(s, optmsk, endch);
                if (!s)
                    return s;
            }
            else if (*s == 0 && endch == '?')
                XSXML__THROW_ERROR(status_bad_pi, s);
            else
                XSXML__THROW_ERROR(status_unrecognized_tag, s);
        }
        else
        {
            mark = s;        // Save this offset while searching for a terminator.
            XSXML__SKIPWS(); // Eat whitespace if no genuine PCDATA here.
            if (*s == '<' || !*s)
            {
                // We skipped some whitespace characters because otherwise we would take the tag branch
                // instead of PCDATA one
                assert(mark != s);
                if (!XSXML__OPTSET(parse_ws_pcdata | parse_ws_pcdata_single) ||
                    XSXML__OPTSET(parse_trim_pcdata))
                {
                    // Whitespace-only text is dropped; resume at the '<' or the terminator.
                    continue;
                }
                else if (XSXML__OPTSET(parse_ws_pcdata_single))
                {
                    // SAX3: TODO: parse_ws_pcdata_single
                    // if (s[0] != '<' || s[1] != '/' || cursor->first_child) continue;
                }
            }
            if (!XSXML__OPTSET(parse_trim_pcdata))
                s = mark; // Keep the leading whitespace as part of the text.
            // SAX3: Ignore node_pcdata.
            if (/*cursor->parent ||*/ XSXML__OPTSET(parse_fragment))
            { // Currently SAX3 simply skips the text here and does not report it as a node.
                if (XSXML__OPTSET(parse_embed_pcdata) /*&& cursor->parent && !cursor->first_child && !cursor->value*/)
                {
                    // cursor->value = s; // Save the offset.
                }
                else
                {
                    // XSXML__PUSHNODE(node_pcdata); // Append a new node on the tree.
                    // cursor->value = s; // Save the offset.
                    // XSXML__POPNODE(); // Pop since this is a standalone.
                }
                s = strconv_pcdata(s);
                if (!*s)
                    break;
            }
            else
            {
                XSXML__SCANFOR(*s == '<'); // '...<'
                if (!*s)
                    break; // Text that runs to the end of the buffer is not reported.
                handler->xml_text_cb(mark, s - mark);
                ++s;
            }
            // We're after '<'
            goto LOC_TAG;
        }
    }
    // SAX3: TODO: check that last tag is closed,
    // if (cursor != root) XSXML__THROW_ERROR(status_end_element_mismatch, s);
    return s;
}
// Returns `s` advanced past a leading UTF-8 byte order mark (EF BB BF),
// or `s` itself when the buffer does not begin with one.
static char_t* parse_skip_bom(char_t* s)
{
    const bool starts_with_bom = s[0] == '\xef' && s[1] == '\xbb' && s[2] == '\xbf';
    if (starts_with_bom)
        return s + 3;
    return s;
}
/*static bool has_element_node_siblings(xml_node_struct* node)
{
while (node)
{
if (XSXML__NODETYPE(node) == node_element) return true;
node = node->next_sibling;
}
return false;
}*/
// Entry point: runs the SAX parser over `buffer[0 .. length)` and reports events to `handler`.
//
// The buffer is modified in place: its last character is overwritten with a 0 terminator
// (the original character is handed to parse_tree as `endch`).
//
// Returns a parse result built from the parser's error status; the offset is relative to
// `buffer`, or 0 when the parser recorded no error offset.
static xml_parse_result parse(char_t* buffer, size_t length, xml_sax3_parse_cb* handler,
                              unsigned int optmsk = parse_default)
{
    // Nothing to parse: an empty input is only acceptable for fragments.
    if (length == 0)
    {
        const xml_parse_status empty_status =
            XSXML__OPTSET(parse_fragment) ? status_ok : status_no_document_element;
        return make_parse_result(empty_status);
    }

    // The parser object lives on the stack for the duration of this call.
    xml_sax3_parser parser(handler);

    // Remember the final character, then zero-terminate the buffer (speeds up parsing).
    const size_t last_index = length - 1;
    char_t endch            = buffer[last_index];
    buffer[last_index]      = 0;

    // Skip a leading BOM so it does not end up in the parse output, then parse.
    char_t* const document_start = parse_skip_bom(buffer);
    parser.parse_tree(document_start, optmsk, endch);

    xml_parse_result result = make_parse_result(
        parser.error_status, parser.error_offset ? parser.error_offset - buffer : 0);
    assert(result.offset >= 0 && static_cast<size_t>(result.offset) <= length);

    if (result)
    {
        // Because the last character was removed, a stray trailing '<' is the one
        // false positive that has to be rejected here.
        if (endch == '<')
            return make_parse_result(status_unrecognized_tag, length - 1);
        return result;
    }

    // Failure: if the error landed on a null terminator that was already part of the
    // source buffer, step the offset back by one.
    const bool on_source_terminator =
        result.offset > 0 && static_cast<size_t>(result.offset) == length - 1 && endch == 0;
    if (on_source_terminator)
        result.offset--;
    return result;
}
}; /* xml_sax3_parser */
}; // namespace xsxml
#endif