mirror of https://github.com/axmolengine/axmol.git
1678 lines
50 KiB
C++
1678 lines
50 KiB
C++
//////////////////////////////////////////////////////////////////////////////////////////
|
|
// The embedded xml SAX parser, extract from pugixml DOM parser
|
|
// please see: https://github.com/zeux/pugixml
|
|
//////////////////////////////////////////////////////////////////////////////////////////
|
|
/*
|
|
The MIT License (MIT)
|
|
Copyright (c) 2019 halx99
|
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
of this software and associated documentation files (the "Software"), to deal
|
|
in the Software without restriction, including without limitation the rights
|
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
copies of the Software, and to permit persons to whom the Software is
|
|
furnished to do so, subject to the following conditions:
|
|
The above copyright notice and this permission notice shall be included in all
|
|
copies or substantial portions of the Software.
|
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
SOFTWARE.
|
|
*/
|
|
#ifndef SIMDSOFT__XSXML_HPP
|
|
#define SIMDSOFT__XSXML_HPP
|
|
#pragma once
|
|
#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#include <functional>
#include <string>
|
|
|
|
#define XSXML__DECL inline
|
|
|
|
namespace xsxml
|
|
{
|
|
|
|
// Character unit used by every parsing routine in this header.
typedef char char_t;
|
|
|
|
// Status code describing how a parse run ended; stored in xml_parse_result::status.
// Values are consecutive, starting at 0 for success.
enum xml_parse_status
{
    status_ok = 0, // No error

    status_file_not_found = 1, // File was not found during load_file()
    status_io_error       = 2, // Error reading from file/stream
    status_out_of_memory  = 3, // Could not allocate memory
    status_internal_error = 4, // Internal error occurred

    status_unrecognized_tag = 5, // Parser could not determine tag type

    status_bad_pi            = 6,  // Error while parsing document declaration/processing instruction
    status_bad_comment       = 7,  // Error while parsing comment
    status_bad_cdata         = 8,  // Error while parsing CDATA section
    status_bad_doctype       = 9,  // Error while parsing document type declaration
    status_bad_pcdata        = 10, // Error while parsing PCDATA section
    status_bad_start_element = 11, // Error while parsing start element tag
    status_bad_attribute     = 12, // Error while parsing element attribute
    status_bad_end_element   = 13, // Error while parsing end element tag

    // Start/end tags do not match: the closing tag had an incorrect name, some tag was
    // not closed, or there was an excessive closing tag.
    status_end_element_mismatch = 14,

    // Unable to append nodes since root type is not node_element or node_document
    // (exclusive to xml_node::append_buffer).
    status_append_invalid_root = 15,

    status_no_document_element = 16 // Parsing resulted in a document without element nodes
};
|
|
|
|
// Parsing options.
// Each flag occupies one bit so that flags can be OR-ed into an option mask. The
// descriptions follow the pugixml DOM parser this header was extracted from.

// Minimal parsing mode (equivalent to turning all other flags off).
// Only elements and PCDATA sections are added to the DOM tree, no text conversions are performed.
const unsigned int parse_minimal = 0u;

// Determines if processing instructions (node_pi) are added to the DOM tree.
// Off by default.
const unsigned int parse_pi = 1u << 0;

// Determines if comments (node_comment) are added to the DOM tree. Off by default.
const unsigned int parse_comments = 1u << 1;

// Determines if CDATA sections (node_cdata) are added to the DOM tree. On by default.
const unsigned int parse_cdata = 1u << 2;

// Determines if plain character data (node_pcdata) that consists only of whitespace is
// added to the DOM tree. Off by default; turning it on usually results in slower parsing
// and more memory consumption.
const unsigned int parse_ws_pcdata = 1u << 3;

// Determines if character and entity references are expanded during parsing.
// On by default.
const unsigned int parse_escapes = 1u << 4;

// Determines if EOL characters are normalized (converted to #xA) during parsing.
// On by default.
const unsigned int parse_eol = 1u << 5;

// Determines if attribute values are normalized using CDATA normalization rules during
// parsing. On by default.
const unsigned int parse_wconv_attribute = 1u << 6;

// Determines if attribute values are normalized using NMTOKENS normalization rules during
// parsing. Off by default.
const unsigned int parse_wnorm_attribute = 1u << 7;

// Determines if the document declaration (node_declaration) is added to the DOM tree.
// Off by default.
const unsigned int parse_declaration = 1u << 8;

// Determines if the document type declaration (node_doctype) is added to the DOM tree.
// Off by default.
const unsigned int parse_doctype = 1u << 9;

// Determines if plain character data (node_pcdata) that is the only child of the parent
// node and that consists only of whitespace is added to the DOM tree. Off by default;
// turning it on may result in slower parsing and more memory consumption.
const unsigned int parse_ws_pcdata_single = 1u << 10;

// Determines if leading and trailing whitespace is to be removed from plain character
// data. Off by default.
const unsigned int parse_trim_pcdata = 1u << 11;

// Determines if plain character data that does not have a parent node is added to the DOM
// tree, and if an empty document is a valid document. Off by default.
const unsigned int parse_fragment = 1u << 12;

// Determines if plain character data is to be stored in the parent element's value. This
// significantly changes the structure of the document; it is only recommended for parsing
// documents with many PCDATA nodes in memory-constrained environments. Off by default.
const unsigned int parse_embed_pcdata = 1u << 13;

// The default parsing mode.
// Elements, PCDATA and CDATA sections are added to the DOM tree, character/reference
// entities are expanded, End-of-Line characters are normalized, attribute values are
// normalized using CDATA normalization rules.
const unsigned int parse_default = parse_cdata | parse_escapes | parse_eol | parse_wconv_attribute;

// The full parsing mode.
// Nodes of all types are added to the DOM tree, character/reference entities are expanded,
// End-of-Line characters are normalized, attribute values are normalized using CDATA
// normalization rules.
const unsigned int parse_full =
    parse_default | parse_pi | parse_comments | parse_declaration | parse_doctype;

// Maximum nesting depth accepted while parsing XML.
// Do not make it too large, otherwise parsing may overflow the stack.
const unsigned int parse_max_deep = 512;
|
|
|
|
// Signature of an attribute-value converter: takes the value start and the closing quote
// character, returns a char_t* (see strconv_attribute_impl for the concrete converters).
using strconv_attribute_t = char_t* (*)(char_t*, char_t);

// Signature of a PCDATA converter: takes the text start, returns a char_t*
// (see strconv_pcdata_impl for the concrete converters).
using strconv_pcdata_t = char_t* (*)(char_t*);
|
|
|
|
// Character classes stored as bit flags in chartype_table.
enum chartype_t
{
    ct_parse_pcdata  = 1 << 0, // \0, &, \r, <
    ct_parse_attr    = 1 << 1, // \0, &, \r, ', "
    ct_parse_attr_ws = 1 << 2, // \0, &, \r, ', ", \n, tab
    ct_space         = 1 << 3, // \r, \n, space, tab
    ct_parse_cdata   = 1 << 4, // \0, ], >, \r
    ct_parse_comment = 1 << 5, // \0, -, >, \r
    ct_symbol        = 1 << 6, // Any symbol > 127, a-z, A-Z, 0-9, _, :, -, .
    ct_start_symbol  = 1 << 7  // Any symbol > 127, a-z, A-Z, _, :
};
|
|
|
|
// Per-byte classification table: entry [c] is the OR of the chartype_t flags that apply
// to byte value c. Laid out 16 entries per row; the trailing comment gives the index range.
static const unsigned char chartype_table[256] = {
    55,  0,   0,   0,   0,   0,   0,   0,   0,   12,  12,  0,   0,   63,  0,   0,   // 0-15
    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   // 16-31
    8,   0,   6,   0,   0,   0,   7,   6,   0,   0,   0,   0,   0,   96,  64,  0,   // 32-47
    64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  192, 0,   1,   0,   48,  0,   // 48-63
    0,   192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, // 64-79
    192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 0,   0,   16,  0,   192, // 80-95
    0,   192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, // 96-111
    192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 0,   0,   0,   0,   0,   // 112-127

    192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, // 128-143
    192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, // 144-159
    192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, // 160-175
    192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, // 176-191
    192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, // 192-207
    192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, // 208-223
    192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, // 224-239
    192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192  // 240-255
};
|
|
|
|
// Character classes stored as bit flags in chartypex_table.
enum chartypex_t
{
    ctx_special_pcdata = 1 << 0, // Any symbol >= 0 and < 32 (except \t, \r, \n), &, <, >
    ctx_special_attr   = 1 << 1, // Any symbol >= 0 and < 32 (except \t), &, <, >, "
    ctx_start_symbol   = 1 << 2, // Any symbol > 127, a-z, A-Z, _
    ctx_digit          = 1 << 3, // 0-9
    ctx_symbol         = 1 << 4  // Any symbol > 127, a-z, A-Z, 0-9, _, -, .
};
|
|
|
|
// Per-byte classification table: entry [c] is the OR of the chartypex_t flags that apply
// to byte value c. Laid out 16 entries per row; the trailing comment gives the index range.
static const unsigned char chartypex_table[256] = {
    3,  3,  3,  3,  3,  3,  3,  3,  3,  0,  2,  3,  3,  2,  3,  3,  // 0-15
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  // 16-31
    0,  0,  2,  0,  0,  0,  3,  0,  0,  0,  0,  0,  0,  16, 16, 0,  // 32-47
    24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0,  0,  3,  0,  3,  0,  // 48-63

    0,  20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, // 64-79
    20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 0,  0,  0,  0,  20, // 80-95
    0,  20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, // 96-111
    20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 0,  0,  0,  0,  0,  // 112-127

    20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, // 128-143
    20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, // 144-159
    20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, // 160-175
    20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, // 176-191
    20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, // 192-207
    20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, // 208-223
    20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, // 224-239
    20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20  // 240-255
};
|
|
|
|
// Branch weight control: marks `cond` as expected to be false on compilers that define
// __GNUC__; elsewhere it is just the condition itself.
#if defined(__GNUC__)
#    define XSXML__UNLIKELY(cond) __builtin_expect(cond, 0)
#else
#    define XSXML__UNLIKELY(cond) (cond)
#endif

// Non-zero when byte `c` carries any of the flags `ct` in `table`. The cast to
// unsigned char keeps the index in range even when `c` is a negative char.
#define XSXML__IS_CHARTYPE_IMPL(c, ct, table) (table[static_cast<unsigned char>(c)] & (ct))

// Same test against the two classification tables of this header.
#define XSXML__IS_CHARTYPE(c, ct) XSXML__IS_CHARTYPE_IMPL(c, ct, chartype_table)
#define XSXML__IS_CHARTYPEX(c, ct) XSXML__IS_CHARTYPE_IMPL(c, ct, chartypex_table)
|
|
|
|
// Parser utilities.
// These macros operate on names that must exist at the point of use: the scan pointer
// `s`, the option mask `optmsk`, the end character `endch`, and (for some) `ch`,
// `cursor`, `alloc`, `error_offset` and `error_status`.

// True when `c` equals `e`, or when `c` is the terminating zero and `endch` equals `e`.
#define XSXML__ENDSWITH(c, e) ((c) == (e) || ((c) == 0 && endch == (e)))

// Advance `s` past every ct_space character.
#define XSXML__SKIPWS()                              \
    {                                                \
        while (XSXML__IS_CHARTYPE(*s, ct_space))     \
            ++s;                                     \
    }

// Non-zero when any of the option bits `OPT` is set in `optmsk`.
#define XSXML__OPTSET(OPT) (optmsk & (OPT))

// Append a node of `TYPE` under `cursor`; fails with status_out_of_memory when
// append_new_node returns null.
#define XSXML__PUSHNODE(TYPE)                              \
    {                                                      \
        cursor = append_new_node(cursor, alloc, TYPE);     \
        if (!cursor)                                       \
            XSXML__THROW_ERROR(status_out_of_memory, s);   \
    }

// Move `cursor` to its parent.
#define XSXML__POPNODE()             \
    {                                \
        cursor = cursor->parent;     \
    }

// Advance `s` until condition `X` holds or the terminating zero is reached.
#define XSXML__SCANFOR(X)               \
    {                                   \
        while (*s != 0 && !(X))         \
            ++s;                        \
    }

// Advance `s` for as long as condition `X` holds.
#define XSXML__SCANWHILE(X)     \
    {                           \
        while (X)               \
            ++s;                \
    }

// Same as XSXML__SCANWHILE, but tests four characters per loop iteration. `X` must be
// written in terms of the local `ss`, which holds the character under test.
#define XSXML__SCANWHILE_UNROLL(X)             \
    {                                          \
        for (;;)                               \
        {                                      \
            char_t ss = s[0];                  \
            if (XSXML__UNLIKELY(!(X)))         \
            {                                  \
                break;                         \
            }                                  \
            ss = s[1];                         \
            if (XSXML__UNLIKELY(!(X)))         \
            {                                  \
                s += 1;                        \
                break;                         \
            }                                  \
            ss = s[2];                         \
            if (XSXML__UNLIKELY(!(X)))         \
            {                                  \
                s += 2;                        \
                break;                         \
            }                                  \
            ss = s[3];                         \
            if (XSXML__UNLIKELY(!(X)))         \
            {                                  \
                s += 3;                        \
                break;                         \
            }                                  \
            s += 4;                            \
        }                                      \
    }

// Save the current character in `ch`, overwrite it with a terminating zero and step past it.
#define XSXML__ENDSEG()     \
    {                       \
        ch = *s;            \
        *s = 0;             \
        ++s;                \
    }

// Record the error position `m` and status `err`, then return a null char_t* from the
// enclosing function.
#define XSXML__THROW_ERROR(err, m) \
    return error_offset = m, error_status = err, static_cast<char_t*>(0)

// Fail with `err` at `m` when `s` has reached the terminating zero.
#define XSXML__CHECK_ERROR(err, m)          \
    {                                       \
        if (*s == 0)                        \
            XSXML__THROW_ERROR(err, m);     \
    }
|
|
|
|
// Simple static assertion: when `cond` is false the array below gets size -1, which
// fails to compile. Usable wherever a block statement is allowed.
#define XSXML__STATIC_ASSERT(cond)                                     \
    {                                                                  \
        static const char condition_failed[(cond) ? 1 : -1] = {0};     \
        (void)condition_failed[0];                                     \
    }
|
|
|
|
// Parsing result
|
|
struct xml_parse_result
|
|
{
|
|
// Parsing status (see xml_parse_status)
|
|
xml_parse_status status;
|
|
|
|
// Last parsed offset (in char_t units from start of input data)
|
|
ptrdiff_t offset;
|
|
|
|
// Source document encoding
|
|
// xml_encoding encoding;
|
|
|
|
// Default constructor, initializes object to failed state
|
|
xml_parse_result() : status(status_internal_error), offset(0) {}
|
|
|
|
// Cast to bool operator
|
|
operator bool() const { return status == status_ok; }
|
|
|
|
// Get error description
|
|
const char* description() const;
|
|
};
|
|
|
|
// Compile-time "off" switch used as a template argument by the strconv_* templates.
struct opt_false
{
    enum { value = 0 };
};
|
|
|
|
// Compile-time "on" switch used as a template argument by the strconv_* templates.
struct opt_true
{
    enum { value = 1 };
};
|
|
|
|
struct gap
|
|
{
|
|
char_t* end;
|
|
size_t size;
|
|
|
|
gap() : end(0), size(0) {}
|
|
|
|
// Push new gap, move s count bytes further (skipping the gap).
|
|
// Collapse previous gap.
|
|
void push(char_t*& s, size_t count)
|
|
{
|
|
if (end) // there was a gap already; collapse it
|
|
{
|
|
// Move [old_gap_end, new_gap_start) to [old_gap_start, ...)
|
|
assert(s >= end);
|
|
memmove(end - size, end, reinterpret_cast<char*>(s) - reinterpret_cast<char*>(end));
|
|
}
|
|
|
|
s += count; // end of current gap
|
|
|
|
// "merge" two gaps
|
|
end = s;
|
|
size += count;
|
|
}
|
|
|
|
// Collapse all gaps, return past-the-end pointer
|
|
char_t* flush(char_t* s)
|
|
{
|
|
if (end)
|
|
{
|
|
// Move [old_gap_end, current_pos) to [old_gap_start, ...)
|
|
assert(s >= end);
|
|
memmove(end - size, end, reinterpret_cast<char*>(s) - reinterpret_cast<char*>(end));
|
|
|
|
return s - size;
|
|
}
|
|
else
|
|
return s;
|
|
}
|
|
};
|
|
|
|
template <typename opt_trim, typename opt_eol, typename opt_escape> struct strconv_pcdata_impl
|
|
{
|
|
static char_t* parse(char_t* s)
|
|
{
|
|
gap g;
|
|
|
|
char_t* begin = s;
|
|
|
|
while (true)
|
|
{
|
|
XSXML__SCANWHILE_UNROLL(!XSXML__IS_CHARTYPE(ss, ct_parse_pcdata));
|
|
|
|
if (*s == '<') // PCDATA ends here
|
|
{
|
|
char_t* end = g.flush(s);
|
|
|
|
if (opt_trim::value)
|
|
while (end > begin && XSXML__IS_CHARTYPE(end[-1], ct_space))
|
|
--end;
|
|
|
|
*end = 0;
|
|
|
|
return s + 1;
|
|
}
|
|
else if (opt_eol::value && *s == '\r') // Either a single 0x0d or 0x0d 0x0a pair
|
|
{
|
|
*s++ = '\n'; // replace first one with 0x0a
|
|
|
|
if (*s == '\n')
|
|
g.push(s, 1);
|
|
}
|
|
else if (opt_escape::value && *s == '&')
|
|
{
|
|
s = strconv_escape(s, g);
|
|
}
|
|
else if (*s == 0)
|
|
{
|
|
char_t* end = g.flush(s);
|
|
|
|
if (opt_trim::value)
|
|
while (end > begin && XSXML__IS_CHARTYPE(end[-1], ct_space))
|
|
--end;
|
|
|
|
*end = 0;
|
|
|
|
return s;
|
|
}
|
|
else
|
|
++s;
|
|
}
|
|
}
|
|
};
|
|
|
|
// Selects the strconv_pcdata_impl instantiation matching the parse_trim_pcdata, parse_eol
// and parse_escapes bits of `optmask`.
XSXML__DECL strconv_pcdata_t get_strconv_pcdata(unsigned int optmask)
{
    XSXML__STATIC_ASSERT(parse_escapes == 0x10 && parse_eol == 0x20 && parse_trim_pcdata == 0x0800);

    // Table index bits: 4 = trim, 2 = eol, 1 = escapes. The template arguments are in
    // the order <trim, eol, escape>.
    static const strconv_pcdata_t parsers[8] = {
        strconv_pcdata_impl<opt_false, opt_false, opt_false>::parse, // 0
        strconv_pcdata_impl<opt_false, opt_false, opt_true>::parse,  // 1
        strconv_pcdata_impl<opt_false, opt_true, opt_false>::parse,  // 2
        strconv_pcdata_impl<opt_false, opt_true, opt_true>::parse,   // 3
        strconv_pcdata_impl<opt_true, opt_false, opt_false>::parse,  // 4
        strconv_pcdata_impl<opt_true, opt_false, opt_true>::parse,   // 5
        strconv_pcdata_impl<opt_true, opt_true, opt_false>::parse,   // 6
        strconv_pcdata_impl<opt_true, opt_true, opt_true>::parse     // 7
    };

    // Bits 4-5 of the mask (escapes, eol) become bits 0-1; bit 11 (trim) becomes bit 2.
    // The masking keeps the index within 0..7.
    const unsigned int index = ((optmask >> 4) & 3) | ((optmask >> 9) & 4);

    return parsers[index];
}
|
|
|
|
// Writes code points as UTF-8 byte sequences. Every function stores the bytes at
// `result` and returns the position just past the last byte written. No range
// validation is performed on `ch`.
struct utf8_writer
{
    typedef uint8_t* value_type;

    // Encodes a code point below 0x10000 using one, two or three bytes.
    static value_type low(value_type result, uint32_t ch)
    {
        if (ch < 0x80) // U+0000..U+007F
        {
            *result++ = static_cast<uint8_t>(ch);
            return result;
        }

        if (ch < 0x800) // U+0080..U+07FF
        {
            *result++ = static_cast<uint8_t>(0xC0 | (ch >> 6));
            *result++ = static_cast<uint8_t>(0x80 | (ch & 0x3F));
            return result;
        }

        // U+0800..U+FFFF
        *result++ = static_cast<uint8_t>(0xE0 | (ch >> 12));
        *result++ = static_cast<uint8_t>(0x80 | ((ch >> 6) & 0x3F));
        *result++ = static_cast<uint8_t>(0x80 | (ch & 0x3F));
        return result;
    }

    // Encodes a code point using the four-byte form (U+10000..U+10FFFF).
    static value_type high(value_type result, uint32_t ch)
    {
        *result++ = static_cast<uint8_t>(0xF0 | (ch >> 18));
        *result++ = static_cast<uint8_t>(0x80 | ((ch >> 12) & 0x3F));
        *result++ = static_cast<uint8_t>(0x80 | ((ch >> 6) & 0x3F));
        *result++ = static_cast<uint8_t>(0x80 | (ch & 0x3F));
        return result;
    }

    // Dispatches to low() below 0x10000 and to high() otherwise.
    static value_type any(value_type result, uint32_t ch)
    {
        if (ch < 0x10000)
            return low(result, ch);

        return high(result, ch);
    }
};
|
|
|
|
// Expands the reference that starts at `s` (`s` points at the '&').
//
// Recognised forms: decimal "&#N;", hexadecimal "&#xN;" (lowercase 'x' only), and the
// named entities "&amp;", "&apos;", "&gt;", "&lt;", "&quot;".
//
// On success the replacement is written at `s`, the unused remainder of the reference is
// registered with `g` as a gap, and the position just after the ';' is returned.
// On failure nothing is written and the position of the first character that did not fit
// is returned, so the caller keeps scanning from there.
XSXML__DECL char_t* strconv_escape(char_t* s, gap& g)
{
    char_t* cursor = s + 1; // first character after '&'

    if (*cursor == '#') // numeric reference
    {
        const bool is_hex        = (cursor[1] == 'x');
        const unsigned int radix = is_hex ? 16 : 10;
        unsigned int code_point  = 0;

        cursor += is_hex ? 2 : 1; // step over "#x" or "#"

        char_t ch = *cursor;

        if (ch == ';') // reference without any digit
            return cursor;

        for (;; ch = *++cursor)
        {
            // Both values wrap to something large when `ch` is outside the tested range.
            const unsigned int digit  = static_cast<unsigned int>(ch - '0');
            const unsigned int letter = static_cast<unsigned int>((ch | ' ') - 'a');

            if (digit <= 9)
                code_point = radix * code_point + digit;
            else if (is_hex && letter <= 5)
                code_point = 16 * code_point + (letter + 10);
            else if (ch == ';')
                break;
            else // cancel
                return cursor;
        }

        ++cursor; // step over ';'

        s = reinterpret_cast<char_t*>(
            utf8_writer::any(reinterpret_cast<uint8_t*>(s), code_point));

        g.push(s, cursor - s);
        return cursor;
    }

    // Named entity: `replacement` stays 0 unless the whole name, including ';', matched.
    char_t replacement = 0;

    switch (*cursor)
    {
    case 'a': // &a
        ++cursor;

        if (*cursor == 'm') // &am
        {
            if (*++cursor == 'p' && *++cursor == ';') // "&amp;"
                replacement = '&';
        }
        else if (*cursor == 'p') // &ap
        {
            if (*++cursor == 'o' && *++cursor == 's' && *++cursor == ';') // "&apos;"
                replacement = '\'';
        }
        break;

    case 'g': // &g
        if (*++cursor == 't' && *++cursor == ';') // "&gt;"
            replacement = '>';
        break;

    case 'l': // &l
        if (*++cursor == 't' && *++cursor == ';') // "&lt;"
            replacement = '<';
        break;

    case 'q': // &q
        if (*++cursor == 'u' && *++cursor == 'o' && *++cursor == 't' && *++cursor == ';') // "&quot;"
            replacement = '"';
        break;

    default:
        break;
    }

    if (replacement == 0)
        return cursor;

    *s++ = replacement;
    ++cursor; // step over ';'

    g.push(s, cursor - s);
    return cursor;
}
|
|
|
|
template <typename opt_escape> struct strconv_attribute_impl
|
|
{
|
|
static char_t* parse_wnorm(char_t* s, char_t end_quote)
|
|
{
|
|
gap g;
|
|
|
|
// trim leading whitespaces
|
|
if (XSXML__IS_CHARTYPE(*s, ct_space))
|
|
{
|
|
char_t* str = s;
|
|
|
|
do
|
|
++str;
|
|
while (XSXML__IS_CHARTYPE(*str, ct_space));
|
|
|
|
g.push(s, str - s);
|
|
}
|
|
|
|
while (true)
|
|
{
|
|
XSXML__SCANWHILE_UNROLL(!XSXML__IS_CHARTYPE(ss, ct_parse_attr_ws | ct_space));
|
|
|
|
if (*s == end_quote)
|
|
{
|
|
char_t* str = g.flush(s);
|
|
|
|
do
|
|
*str-- = 0;
|
|
while (XSXML__IS_CHARTYPE(*str, ct_space));
|
|
|
|
return s + 1;
|
|
}
|
|
else if (XSXML__IS_CHARTYPE(*s, ct_space))
|
|
{
|
|
*s++ = ' ';
|
|
|
|
if (XSXML__IS_CHARTYPE(*s, ct_space))
|
|
{
|
|
char_t* str = s + 1;
|
|
while (XSXML__IS_CHARTYPE(*str, ct_space))
|
|
++str;
|
|
|
|
g.push(s, str - s);
|
|
}
|
|
}
|
|
else if (opt_escape::value && *s == '&')
|
|
{
|
|
s = strconv_escape(s, g);
|
|
}
|
|
else if (!*s)
|
|
{
|
|
return 0;
|
|
}
|
|
else
|
|
++s;
|
|
}
|
|
}
|
|
|
|
static char_t* parse_wconv(char_t* s, char_t end_quote)
|
|
{
|
|
gap g;
|
|
|
|
while (true)
|
|
{
|
|
XSXML__SCANWHILE_UNROLL(!XSXML__IS_CHARTYPE(ss, ct_parse_attr_ws));
|
|
|
|
if (*s == end_quote)
|
|
{
|
|
*g.flush(s) = 0;
|
|
|
|
return s + 1;
|
|
}
|
|
else if (XSXML__IS_CHARTYPE(*s, ct_space))
|
|
{
|
|
if (*s == '\r')
|
|
{
|
|
*s++ = ' ';
|
|
|
|
if (*s == '\n')
|
|
g.push(s, 1);
|
|
}
|
|
else
|
|
*s++ = ' ';
|
|
}
|
|
else if (opt_escape::value && *s == '&')
|
|
{
|
|
s = strconv_escape(s, g);
|
|
}
|
|
else if (!*s)
|
|
{
|
|
return 0;
|
|
}
|
|
else
|
|
++s;
|
|
}
|
|
}
|
|
|
|
static char_t* parse_eol(char_t* s, char_t end_quote)
|
|
{
|
|
gap g;
|
|
|
|
while (true)
|
|
{
|
|
XSXML__SCANWHILE_UNROLL(!XSXML__IS_CHARTYPE(ss, ct_parse_attr));
|
|
|
|
if (*s == end_quote)
|
|
{
|
|
*g.flush(s) = 0;
|
|
|
|
return s + 1;
|
|
}
|
|
else if (*s == '\r')
|
|
{
|
|
*s++ = '\n';
|
|
|
|
if (*s == '\n')
|
|
g.push(s, 1);
|
|
}
|
|
else if (opt_escape::value && *s == '&')
|
|
{
|
|
s = strconv_escape(s, g);
|
|
}
|
|
else if (!*s)
|
|
{
|
|
return 0;
|
|
}
|
|
else
|
|
++s;
|
|
}
|
|
}
|
|
|
|
static char_t* parse_simple(char_t* s, char_t end_quote)
|
|
{
|
|
gap g;
|
|
|
|
while (true)
|
|
{
|
|
XSXML__SCANWHILE_UNROLL(!XSXML__IS_CHARTYPE(ss, ct_parse_attr));
|
|
|
|
if (*s == end_quote)
|
|
{
|
|
*g.flush(s) = 0;
|
|
|
|
return s + 1;
|
|
}
|
|
else if (opt_escape::value && *s == '&')
|
|
{
|
|
s = strconv_escape(s, g);
|
|
}
|
|
else if (!*s)
|
|
{
|
|
return 0;
|
|
}
|
|
else
|
|
++s;
|
|
}
|
|
}
|
|
};
|
|
|
|
// Builds an xml_parse_result carrying the given status and offset (offset defaults to 0).
XSXML__DECL xml_parse_result make_parse_result(xml_parse_status status, ptrdiff_t offset = 0)
{
    xml_parse_result outcome;

    outcome.status = status;
    outcome.offset = offset;

    return outcome;
}
|
|
|
|
// Normalises line endings inside a comment body in place and zero-terminates it at the
// closing "-->". `endch` stands in for the final character when the buffer's last
// character was replaced by the terminating zero (see XSXML__ENDSWITH).
// Returns the position after the terminator, or null when the comment is never closed.
XSXML__DECL char_t* strconv_comment(char_t* s, char_t endch)
{
    gap g;

    for (;;)
    {
        XSXML__SCANWHILE_UNROLL(!XSXML__IS_CHARTYPE(ss, ct_parse_comment));

        if (*s == '\r') // Either a single 0x0d or 0x0d 0x0a pair
        {
            *s++ = '\n'; // replace first one with 0x0a

            if (*s == '\n')
                g.push(s, 1);

            continue;
        }

        if (s[0] == '-' && s[1] == '-' && XSXML__ENDSWITH(s[2], '>')) // comment ends here
        {
            *g.flush(s) = 0;

            // Three characters to skip for "-->", two when the '>' is the implied endch.
            return s + (s[2] == '>' ? 3 : 2);
        }

        if (*s == 0)
            return 0;

        ++s;
    }
}
|
|
|
|
// Normalises line endings inside a CDATA body in place and zero-terminates it at the
// closing "]]>". `endch` stands in for the final character when the buffer's last
// character was replaced by the terminating zero (see XSXML__ENDSWITH).
// Returns the position of the second ']' of the terminator (the caller steps over the
// rest), or null when the section is never closed.
XSXML__DECL char_t* strconv_cdata(char_t* s, char_t endch)
{
    gap g;

    for (;;)
    {
        XSXML__SCANWHILE_UNROLL(!XSXML__IS_CHARTYPE(ss, ct_parse_cdata));

        if (*s == '\r') // Either a single 0x0d or 0x0d 0x0a pair
        {
            *s++ = '\n'; // replace first one with 0x0a

            if (*s == '\n')
                g.push(s, 1);

            continue;
        }

        if (s[0] == ']' && s[1] == ']' && XSXML__ENDSWITH(s[2], '>')) // CDATA ends here
        {
            *g.flush(s) = 0;

            return s + 1;
        }

        if (*s == 0)
            return 0;

        ++s;
    }
}
|
|
|
|
// Selects the attribute converter matching the parse_wnorm_attribute,
// parse_wconv_attribute, parse_eol and parse_escapes bits of `optmask`.
// Precedence, highest first: wnorm, wconv, eol, none. parse_escapes independently
// chooses the opt_true / opt_false instantiation.
XSXML__DECL strconv_attribute_t get_strconv_attribute(unsigned int optmask)
{
    XSXML__STATIC_ASSERT(parse_escapes == 0x10 && parse_eol == 0x20 &&
                         parse_wconv_attribute == 0x40 && parse_wnorm_attribute == 0x80);

    const bool escapes = (optmask & parse_escapes) != 0;

    if (optmask & parse_wnorm_attribute)
        return escapes ? strconv_attribute_impl<opt_true>::parse_wnorm
                       : strconv_attribute_impl<opt_false>::parse_wnorm;

    if (optmask & parse_wconv_attribute)
        return escapes ? strconv_attribute_impl<opt_true>::parse_wconv
                       : strconv_attribute_impl<opt_false>::parse_wconv;

    if (optmask & parse_eol)
        return escapes ? strconv_attribute_impl<opt_true>::parse_eol
                       : strconv_attribute_impl<opt_false>::parse_eol;

    return escapes ? strconv_attribute_impl<opt_true>::parse_simple
                   : strconv_attribute_impl<opt_false>::parse_simple;
}
|
|
|
|
// Skip utf-8 bom
|
|
static char_t* parse_skip_bom(char_t* s)
|
|
{
|
|
return (s[0] == '\xef' && s[1] == '\xbb' && s[2] == '\xbf') ? s + 3 : s;
|
|
}
|
|
|
|
// Simple string view
|
|
class string_view
|
|
{
|
|
public:
|
|
string_view() : _Mystr(nullptr), _Mysize(0) {}
|
|
string_view(char_t* str, size_t size) : _Mystr(str), _Mysize(size) {}
|
|
const char* c_str() const { return _Mystr != nullptr ? _Mystr : ""; }
|
|
size_t length() const { return _Mysize; }
|
|
bool empty() const { return _Mysize == 0; }
|
|
|
|
private:
|
|
char_t* _Mystr;
|
|
size_t _Mysize;
|
|
};
|
|
|
|
// The sax3 parse callbacks.
// A bundle of std::function hooks handed to xml_sax3_parser. Each size_t argument
// follows the pointer it belongs to.
struct xml_sax3_parse_cb
{
    // Element start: element name and its length.
    std::function<void(char* name, size_t)> xml_start_element_cb;

    // One attribute: name with length, then value with length.
    std::function<void(const char* name, size_t, const char* value, size_t)> xml_attr_cb;

    // End of an element's attribute list.
    std::function<void()> xml_end_attr_cb;

    // Element end: element name and its length.
    std::function<void(const char* name, size_t)> xml_end_element_cb;

    // Text content: text and its length.
    std::function<void(const char* text, size_t len)> xml_text_cb;
};
|
|
|
|
/////////////// xml_sax3_parser ///////////
|
|
struct xml_sax3_parser
|
|
{
|
|
// xml_allocator alloc;
|
|
char_t* error_offset;
|
|
xml_parse_status error_status;
|
|
|
|
xml_sax3_parse_cb* handler;
|
|
|
|
// Creates a parser bound to the callback bundle `handler_`, with no error recorded.
// The initializers are listed in member declaration order (error_offset, error_status,
// handler), which is the order in which they actually run; this avoids -Wreorder warnings.
xml_sax3_parser(xml_sax3_parse_cb* handler_)
    : error_offset(0), error_status(status_ok), handler(handler_)
{}
|
|
|
|
// Nothing to release. The body stays user-provided and empty, as before.
~xml_sax3_parser() {}
|
|
|
|
// DOCTYPE consists of nested sections of the following possible types:
|
|
// <!-- ... -->, <? ... ?>, "...", '...'
|
|
// <![...]]>
|
|
// <!...>
|
|
// First group can not contain nested groups
|
|
// Second group can contain nested groups of the same type
|
|
// Third group can contain all other groups
|
|
char_t* parse_doctype_primitive(char_t* s)
|
|
{
|
|
if (*s == '"' || *s == '\'')
|
|
{
|
|
// quoted string
|
|
char_t ch = *s++;
|
|
XSXML__SCANFOR(*s == ch);
|
|
if (!*s)
|
|
XSXML__THROW_ERROR(status_bad_doctype, s);
|
|
|
|
s++;
|
|
}
|
|
else if (s[0] == '<' && s[1] == '?')
|
|
{
|
|
// <? ... ?>
|
|
s += 2;
|
|
XSXML__SCANFOR(s[0] == '?' &&
|
|
s[1] == '>'); // no need for ENDSWITH because ?> can't terminate proper doctype
|
|
if (!*s)
|
|
XSXML__THROW_ERROR(status_bad_doctype, s);
|
|
|
|
s += 2;
|
|
}
|
|
else if (s[0] == '<' && s[1] == '!' && s[2] == '-' && s[3] == '-')
|
|
{
|
|
s += 4;
|
|
XSXML__SCANFOR(s[0] == '-' && s[1] == '-' &&
|
|
s[2] ==
|
|
'>'); // no need for ENDSWITH because --> can't terminate proper doctype
|
|
if (!*s)
|
|
XSXML__THROW_ERROR(status_bad_doctype, s);
|
|
|
|
s += 3;
|
|
}
|
|
else
|
|
XSXML__THROW_ERROR(status_bad_doctype, s);
|
|
|
|
return s;
|
|
}
|
|
|
|
// Skips a "<![ ... ]]>" section, including nested sections of the same kind.
// `s` must point at the opening "<![". Returns the position just after the matching
// "]]>"; records status_bad_doctype and returns null when the buffer ends first.
char_t* parse_doctype_ignore(char_t* s)
{
    assert(s[0] == '<' && s[1] == '!' && s[2] == '[');

    size_t nesting = 0; // number of inner sections currently open

    for (s += 3; *s;)
    {
        if (s[0] == '<' && s[1] == '!' && s[2] == '[')
        {
            // nested ignore section
            s += 3;
            ++nesting;
            continue;
        }

        if (s[0] == ']' && s[1] == ']' && s[2] == '>')
        {
            // ignore section end
            s += 3;

            if (nesting == 0)
                return s;

            --nesting;
            continue;
        }

        ++s;
    }

    XSXML__THROW_ERROR(status_bad_doctype, s);
}
|
|
|
|
// Skips a "<! ... >" group, descending into nested groups, "<![ ... ]]>" sections and
// primitive sections. `s` must point at the opening "<!".
// Returns the position of the '>' that closes the outermost group. When the buffer ends
// first, the end position is returned only if no group is still open and `endch` is '>';
// otherwise status_bad_doctype is recorded and null is returned. A null result from a
// nested parser is passed through unchanged.
char_t* parse_doctype_group(char_t* s, char_t endch)
{
    assert((s[0] == '<' || s[0] == 0) && s[1] == '!');

    size_t nesting = 0; // number of inner "<!" groups currently open

    for (s += 2; *s;)
    {
        if (s[0] == '<' && s[1] == '!' && s[2] != '-')
        {
            if (s[2] == '[')
            {
                // ignore
                s = parse_doctype_ignore(s);
                if (!s)
                    return s;
            }
            else
            {
                // some control group
                s += 2;
                ++nesting;
            }
            continue;
        }

        if (s[0] == '<' || s[0] == '"' || s[0] == '\'')
        {
            // unknown tag (forbidden), or some primitive group
            s = parse_doctype_primitive(s);
            if (!s)
                return s;
            continue;
        }

        if (*s == '>')
        {
            if (nesting == 0)
                return s;

            --nesting;
        }

        ++s;
    }

    if (nesting != 0 || endch != '>')
        XSXML__THROW_ERROR(status_bad_doctype, s);

    return s;
}
|
|
|
|
// Parses node contents that start with an exclamation mark: a comment, a CDATA section
// or a DOCTYPE. `s` points at the '!'. No callback is invoked for any of them; the
// content is only scanned, and zero-terminated where the corresponding option asks
// for it. Returns the position after the construct, or records an error and returns null.
char_t* parse_exclamation(char_t* s, unsigned int optmsk, char_t endch)
{
    ++s; // step over '!'

    if (*s == '-') // '<!-...'
    {
        ++s;

        if (*s != '-')
            XSXML__THROW_ERROR(status_bad_comment, s);

        ++s; // '<!--...'

        // SAX3: Ignore comment. The start is only remembered for error reporting.
        const bool keep_comment = XSXML__OPTSET(parse_comments) != 0;
        char_t* const value     = keep_comment ? s : nullptr;

        if (keep_comment && XSXML__OPTSET(parse_eol))
        {
            s = strconv_comment(s, endch);

            if (!s)
                XSXML__THROW_ERROR(status_bad_comment, value);

            return s;
        }

        // Scan for terminating '-->'.
        XSXML__SCANFOR(s[0] == '-' && s[1] == '-' && XSXML__ENDSWITH(s[2], '>'));
        XSXML__CHECK_ERROR(status_bad_comment, s);

        if (keep_comment)
            *s = 0; // Zero-terminate this segment at the first terminating '-'.

        s += (s[2] == '>' ? 3 : 2); // Step over the '\0->'.
        return s;
    }

    if (*s == '[')
    {
        // '<![CDATA[...' : on a mismatch `s` is left at the first wrong character.
        if (!(*++s == 'C' && *++s == 'D' && *++s == 'A' && *++s == 'T' && *++s == 'A' &&
              *++s == '['))
            XSXML__THROW_ERROR(status_bad_cdata, s);

        ++s;

        if (XSXML__OPTSET(parse_cdata))
        {
            // SAX3: Ignore CDATA. The start is only remembered for error reporting.
            char_t* const value = s;

            if (XSXML__OPTSET(parse_eol))
            {
                s = strconv_cdata(s, endch);

                if (!s)
                    XSXML__THROW_ERROR(status_bad_cdata, value);
            }
            else
            {
                // Scan for terminating ']]>'.
                XSXML__SCANFOR(s[0] == ']' && s[1] == ']' && XSXML__ENDSWITH(s[2], '>'));
                XSXML__CHECK_ERROR(status_bad_cdata, s);

                *s++ = 0; // Zero-terminate this segment.
            }
        }
        else // Flagged for discard, but we still have to scan for the terminator.
        {
            // Scan for terminating ']]>'.
            XSXML__SCANFOR(s[0] == ']' && s[1] == ']' && XSXML__ENDSWITH(s[2], '>'));
            XSXML__CHECK_ERROR(status_bad_cdata, s);

            ++s;
        }

        s += (s[1] == '>' ? 2 : 1); // Step over the last ']>'.
        return s;
    }

    if (s[0] == 'D' && s[1] == 'O' && s[2] == 'C' && s[3] == 'T' && s[4] == 'Y' &&
        s[5] == 'P' && XSXML__ENDSWITH(s[6], 'E'))
    {
        s -= 2; // back to the '<' of "<!DOCTYPE"

        // TODO: check for doctype, parent must be nullptr
        // if (cursor->parent) XSXML__THROW_ERROR(status_bad_doctype, s);

        char_t* mark = s + 9; // just past "<!DOCTYPE"

        s = parse_doctype_group(s, endch);
        if (!s)
            return s;

        assert((*s == 0 && endch == '>') || *s == '>');
        if (*s)
            *s++ = 0;

        if (XSXML__OPTSET(parse_doctype))
        {
            // SAX3: Ignore doctype. The scan result is not used anywhere in this function.
            while (XSXML__IS_CHARTYPE(*mark, ct_space))
                ++mark;
        }

        return s;
    }

    if (*s == 0 && endch == '-')
        XSXML__THROW_ERROR(status_bad_comment, s);

    if (*s == 0 && endch == '[')
        XSXML__THROW_ERROR(status_bad_cdata, s);

    XSXML__THROW_ERROR(status_unrecognized_tag, s);
}
|
|
|
|
/**
 * Parses a construct that starts with "<?": the XML declaration or a processing instruction.
 *
 * SAX3 reports neither to the handler, so the construct is validated and skipped.
 *
 * @param s      Points at the '?' that follows '<'.
 * @param optmsk Parse option mask, read through XSXML__OPTSET.
 * @param endch  The original last character of the buffer (the buffer itself ends with 0).
 * @return Position right after the closing "?>". Malformed input is reported through
 *         XSXML__THROW_ERROR.
 */
char_t* parse_question(char_t* s, unsigned int optmsk, char_t endch)
{
    // receives the character overwritten by XSXML__ENDSEG()
    char_t ch = 0;

    // step over the question mark
    ++s;

    // read PI target
    char_t* target = s;

    if (!XSXML__IS_CHARTYPE(*s, ct_start_symbol))
        XSXML__THROW_ERROR(status_bad_pi, s);

    XSXML__SCANWHILE(XSXML__IS_CHARTYPE(*s, ct_symbol));
    XSXML__CHECK_ERROR(status_bad_pi, s);

    // determine node type; stricmp / strcasecmp is not portable.
    // OR-ing with ' ' folds ASCII upper case to lower case, so any casing of "xml" matches.
    bool declaration = (target[0] | ' ') == 'x' && (target[1] | ' ') == 'm' &&
                       (target[2] | ' ') == 'l' && target + 3 == s;

    if (declaration ? XSXML__OPTSET(parse_declaration) : XSXML__OPTSET(parse_pi))
    {
        // TODO: disallow non top-level declarations

        XSXML__ENDSEG();  // Save char in 'ch', terminate the target & step over.

        if (ch == '?')
        {
            // empty node: "<?target?>"
            if (!XSXML__ENDSWITH(*s, '>'))
                XSXML__THROW_ERROR(status_bad_pi, s);
            s += (*s == '>');
        }
        else if (XSXML__IS_CHARTYPE(ch, ct_space))
        {
            XSXML__SKIPWS();

            // scan for tag end
            XSXML__SCANFOR(s[0] == '?' && XSXML__ENDSWITH(s[1], '>'));
            XSXML__CHECK_ERROR(status_bad_pi, s);

            // The DOM parser rewound to the declaration's attributes here so that its caller
            // could parse them as element attributes. parse_tree() has no such entry point, so
            // rewinding made it report the attribute text through xml_text_cb. Declarations are
            // therefore terminated and skipped exactly like processing instructions.
            XSXML__ENDSEG();

            s += (*s == '>');
        }
        else
            XSXML__THROW_ERROR(status_bad_pi, s);
    }
    else
    {
        // not requested: just scan for tag end and skip it
        XSXML__SCANFOR(s[0] == '?' && XSXML__ENDSWITH(s[1], '>'));
        XSXML__CHECK_ERROR(status_bad_pi, s);

        s += (s[1] == '>' ? 2 : 1);
    }

    return s;
}
|
|
|
|
// A bounded LIFO container with inline storage for at most MaxDepth elements.
// It never fails loudly: pushing onto a full stack is silently ignored and popping an
// empty stack yields a value-initialized element.
template <typename Elem, size_t MaxDepth> struct fixed_stack
{
public:
    fixed_stack() : size_(0) {}

    // Stores a copy of 'val' on top of the stack; does nothing when the stack is full.
    void push(const Elem& val)
    {
        if (size_ >= MaxDepth)
            return;

        elements_[size_] = val;
        ++size_;
    }

    // Removes and returns the top element, or Elem{} when the stack is empty.
    Elem pop()
    {
        if (size_ == 0)
            return Elem{};

        --size_;
        return elements_[size_];
    }

private:
    Elem elements_[MaxDepth];
    size_t size_;
};
|
|
|
|
/**
 * Main SAX loop: walks the zero-terminated buffer and reports elements, attributes and
 * text to 'handler'.
 *
 * Callback order for an element is xml_start_element_cb, xml_attr_cb for each attribute,
 * xml_end_attr_cb once the start tag is complete, then xml_end_element_cb when it closes.
 * Names and attribute values are passed as (pointer, length) pairs into the buffer, which
 * is modified in place (terminators are overwritten with 0).
 *
 * Open element names are tracked on a fixed_stack of parse_max_deep entries; pushes beyond
 * that depth are ignored by the stack. End tag names are not checked against start tags.
 *
 * @param s      Start of the document text (zero-terminated).
 * @param optmsk Parse option mask, read through XSXML__OPTSET.
 * @param endch  The original last character of the buffer, which was replaced by 0.
 * @return Position where parsing stopped, or a null pointer propagated from
 *         parse_question / parse_exclamation. Malformed input is reported through
 *         XSXML__THROW_ERROR.
 */
char_t* parse_tree(char_t* s, unsigned int optmsk, char_t endch)
{
    strconv_attribute_t strconv_attribute = get_strconv_attribute(optmsk);
    strconv_pcdata_t strconv_pcdata       = get_strconv_pcdata(optmsk);

    char_t ch     = 0;        // character overwritten by the last XSXML__ENDSEG()
    char_t* mark  = s;        // start of the current name / text run
    char_t* value = nullptr;  // start of the current attribute value
    size_t n      = 0;        // length of the current attribute name

    fixed_stack<string_view, parse_max_deep> stk;  // 4K on 32bits, 6K on 64bits

    while (*s != 0)
    {
        if (*s == '<')
        {
            ++s;

        LOC_TAG:
            if (XSXML__IS_CHARTYPE(*s, ct_start_symbol))  // '<#...'
            {
                mark = s;

                XSXML__SCANWHILE_UNROLL(XSXML__IS_CHARTYPE(ss, ct_symbol));  // Scan for a terminator.

                handler->xml_start_element_cb(mark, s - mark);
                stk.push(::xsxml::string_view(mark, s - mark));

                XSXML__ENDSEG();  // Save char in 'ch', terminate & step over.

                if (ch == '>')
                {
                    handler->xml_end_attr_cb();  // end of tag
                }
                else if (XSXML__IS_CHARTYPE(ch, ct_space))
                {
                    while (true)
                    {                     // parse attributes
                        XSXML__SKIPWS();  // Eat any whitespace.

                        if (XSXML__IS_CHARTYPE(*s, ct_start_symbol))  // <... #...
                        {
                            mark = s;  // Save the offset.

                            XSXML__SCANWHILE_UNROLL(
                                XSXML__IS_CHARTYPE(ss, ct_symbol));  // Scan for a terminator.
                            n = s - mark;
                            XSXML__ENDSEG();  // Save char in 'ch', terminate & step over.

                            if (XSXML__IS_CHARTYPE(ch, ct_space))
                            {
                                XSXML__SKIPWS();  // Eat any whitespace.

                                ch = *s;
                                ++s;
                            }

                            if (ch == '=')  // '<... #=...'
                            {
                                XSXML__SKIPWS();  // Eat any whitespace.

                                if (*s == '"' || *s == '\'')  // '<... #="...'
                                {
                                    ch = *s;    // Save quote char to avoid breaking on "''" -or- '""'.
                                    ++s;        // Step over the quote.
                                    value = s;  // Save the offset.

                                    s = strconv_attribute(s, ch);

                                    if (!s)
                                        XSXML__THROW_ERROR(status_bad_attribute, value);

                                    // After this line the loop continues from the start;
                                    // Whitespaces, / and > are ok, symbols and EOF are wrong,
                                    // everything else will be detected
                                    if (XSXML__IS_CHARTYPE(*s, ct_start_symbol))
                                        XSXML__THROW_ERROR(status_bad_attribute, s);

                                    // 's' is one past the closing quote, hence the -1.
                                    handler->xml_attr_cb(mark, n, value, s - value - 1);
                                }
                                else
                                    XSXML__THROW_ERROR(status_bad_attribute, s);
                            }
                            else
                                XSXML__THROW_ERROR(status_bad_attribute, s);
                        }
                        else if (*s == '/')
                        {
                            ++s;
                            if (*s == '>')
                            {
                                auto ele_name = stk.pop();
                                handler->xml_end_attr_cb();
                                handler->xml_end_element_cb(ele_name.c_str(), ele_name.length());
                                ++s;
                                break;
                            }
                            else if (*s == 0 && endch == '>')
                            {
                                // '>' was the stripped last character of the buffer
                                auto ele_name = stk.pop();
                                handler->xml_end_attr_cb();
                                handler->xml_end_element_cb(ele_name.c_str(), ele_name.length());
                                break;
                            }
                            else
                                XSXML__THROW_ERROR(status_bad_start_element, s);
                        }
                        else if (*s == '>')
                        {
                            ++s;
                            handler->xml_end_attr_cb();
                            break;
                        }
                        else if (*s == 0 && endch == '>')
                        {
                            break;
                        }
                        else
                            XSXML__THROW_ERROR(status_bad_start_element, s);
                    }
                }
                else if (ch == '/')  // '<#.../'
                {
                    if (!XSXML__ENDSWITH(*s, '>'))
                        XSXML__THROW_ERROR(status_bad_start_element, s);

                    // The start tag is complete, so signal the end of its (empty) attribute
                    // list just like the '<name>' and '<name attr=".."/>' paths do.
                    handler->xml_end_attr_cb();

                    stk.pop();

                    // XSXML__ENDSEG() already stepped past the '/', so the name ends one
                    // character before 's'; without the -1 the terminator would be counted.
                    handler->xml_end_element_cb(mark, s - mark - 1);
                    s += (*s == '>');
                }
                else if (ch == 0)
                {
                    // we stepped over null terminator, backtrack & handle closing tag
                    --s;

                    if (endch != '>')
                        XSXML__THROW_ERROR(status_bad_start_element, s);
                }
                else
                    XSXML__THROW_ERROR(status_bad_start_element, s);
            }
            else if (*s == '/')  // '</...'
            {
                ++s;

                mark = s;

                // SAX3, we don't check end element name
                while (XSXML__IS_CHARTYPE(*s, ct_symbol))
                    ++s;

                stk.pop();
                handler->xml_end_element_cb(mark, s - mark);

                XSXML__SKIPWS();

                if (*s == 0)
                {
                    if (endch != '>')
                        XSXML__THROW_ERROR(status_bad_end_element, s);
                }
                else
                {
                    if (*s != '>')
                        XSXML__THROW_ERROR(status_bad_end_element, s);
                    ++s;
                }
            }
            else if (*s == '?')  // '<?...'
            {
                s = parse_question(s, optmsk, endch);
                if (!s)
                    return s;
            }
            else if (*s == '!')  // '<!...'
            {
                s = parse_exclamation(s, optmsk, endch);
                if (!s)
                    return s;
            }
            else if (*s == 0 && endch == '?')
                XSXML__THROW_ERROR(status_bad_pi, s);
            else
                XSXML__THROW_ERROR(status_unrecognized_tag, s);
        }
        else
        {
            mark = s;  // Save this offset while searching for a terminator.

            XSXML__SKIPWS();  // Eat whitespace if no genuine PCDATA here.

            if (*s == '<' || !*s)
            {
                // We skipped some whitespace characters because otherwise we would take the tag branch
                // instead of PCDATA one
                assert(mark != s);

                // Whitespace-only text is dropped unless a whitespace-preserving option is set.
                if (!XSXML__OPTSET(parse_ws_pcdata | parse_ws_pcdata_single) ||
                    XSXML__OPTSET(parse_trim_pcdata))
                {
                    continue;
                }

                // SAX3: TODO: parse_ws_pcdata_single is not distinguished from parse_ws_pcdata.
            }

            if (!XSXML__OPTSET(parse_trim_pcdata))
                s = mark;

            if (XSXML__OPTSET(parse_fragment))
            {
                // Currently SAX3 simply skips the text here and does not report it as a node.
                s = strconv_pcdata(s);

                if (!*s)
                    break;
            }
            else
            {
                XSXML__SCANFOR(*s == '<');  // '...<'
                if (!*s)
                    break;

                handler->xml_text_cb(mark, s - mark);

                ++s;
            }

            // We're after '<'
            goto LOC_TAG;
        }
    }

    // SAX3: TODO: check that last tag is closed

    return s;
}
|
|
|
|
// Returns 's' advanced past a leading UTF-8 byte order mark (EF BB BF), or 's' itself when
// the text does not start with one. The comparison short-circuits, so a zero-terminated
// buffer shorter than three characters is never read past its terminator.
static char_t* parse_skip_bom(char_t* s)
{
    const bool has_bom = s[0] == '\xef' && s[1] == '\xbb' && s[2] == '\xbf';

    if (has_bom)
        return s + 3;

    return s;
}
|
|
|
|
/*static bool has_element_node_siblings(xml_node_struct* node)
|
|
{
|
|
while (node)
|
|
{
|
|
if (XSXML__NODETYPE(node) == node_element) return true;
|
|
|
|
node = node->next_sibling;
|
|
}
|
|
|
|
return false;
|
|
}*/
|
|
|
|
/**
 * Parses 'length' characters of 'buffer' in place and reports the content to 'handler'.
 *
 * The last character of the buffer is overwritten with 0 so the scanner can rely on a
 * terminator; its original value is passed to the parser separately. The buffer is
 * modified and not restored.
 *
 * @param buffer  Writable document text.
 * @param length  Number of characters in 'buffer'.
 * @param handler Receiver of the SAX callbacks.
 * @param optmsk  Parse option mask.
 * @return Parse status plus the offset of the error, if any, relative to 'buffer'.
 */
static xml_parse_result parse(char_t* buffer, size_t length, xml_sax3_parse_cb* handler,
                              unsigned int optmsk = parse_default)
{
    // early-out for empty documents: only acceptable when parsing a fragment
    if (length == 0)
    {
        if (XSXML__OPTSET(parse_fragment))
            return make_parse_result(status_ok);

        return make_parse_result(status_no_document_element);
    }

    // create parser on stack
    xml_sax3_parser parser(handler);

    // remember the last character and make the buffer zero-terminated (speeds up parsing)
    const size_t last_index = length - 1;
    const char_t last_char  = buffer[last_index];
    buffer[last_index]      = 0;

    // skip BOM so it does not end up as part of the parse output, then run the parser
    parser.parse_tree(parse_skip_bom(buffer), optmsk, last_char);

    xml_parse_result result = make_parse_result(
        parser.error_status, parser.error_offset ? parser.error_offset - buffer : 0);
    assert(result.offset >= 0 && static_cast<size_t>(result.offset) <= length);

    if (!result)
    {
        // roll back offset if it occurs on a null terminator in the source buffer
        const bool on_last_char =
            result.offset > 0 && static_cast<size_t>(result.offset) == last_index;

        if (on_last_char && last_char == 0)
            result.offset--;

        return result;
    }

    // since the last character was removed, the only possible false positive is a stray '<'
    if (last_char == '<')
        return make_parse_result(status_unrecognized_tag, last_index);

    return result;
}
|
|
}; /* xml_sax3_parser */
|
|
}; // namespace xsxml
|
|
|
|
#endif
|