diff options
author | Alexey Yerin <yyp@disroot.org> | 2023-10-04 21:02:27 +0300 |
---|---|---|
committer | Alexey Yerin <yyp@disroot.org> | 2023-10-04 21:02:27 +0300 |
commit | c98da05caa1c30f42bd5a3615d9cb63b4d590e9f (patch) | |
tree | d019e8c1efd4c63ca38700cf16f59f1f8db05407 | |
parent | c137ce659bd2449c430c0a5ac7f932f2286c250c (diff) |
Vendor hare-fastxml
-rw-r--r-- | README.md | 2 | ||||
-rw-r--r-- | format/fastxml/README | 16 | ||||
-rw-r--r-- | format/fastxml/chars.ha | 32 | ||||
-rw-r--r-- | format/fastxml/parser.ha | 616 | ||||
-rw-r--r-- | format/fastxml/types.ha | 67 |
5 files changed, 731 insertions, 2 deletions
diff --git a/README.md b/README.md index e59fc47..757bd42 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,6 @@ GTK family of libraries (GTK+, GDK, Pango, etc). ## Generating and installing bindings Dependencies: * An up-to-date [Hare] toolchain -* [hare-fastxml] * Development files for GObject-Introspection, the desired GTK version and its dependencies. Make sure `/usr/share/gir-1.0` is included. @@ -43,6 +42,5 @@ $ git config format.subjectPrefix "PATCH hare-gi" ``` [Hare]: https://harelang.org/installation -[hare-fastxml]: https://git.sr.ht/~yerinalexey/hare-fastxml [archives]: https://lists.sr.ht/~yerinalexey/public-inbox [git send-email]: https://git-send-email.io diff --git a/format/fastxml/README b/format/fastxml/README new file mode 100644 index 0000000..4913883 --- /dev/null +++ b/format/fastxml/README @@ -0,0 +1,16 @@ +format::fastxml provides a simple parser of the useful subset of the XML 1.0 +(Fifth Edition) specification as defined by the W3C. Features omitted are: + +- Support for user-defined entities +- Support for UTF-16 inputs or a UTF-8 BOM +- Any considerations for the "Namespaces in XML 1.0" specification + +Attempting to parse an input file which does not conform to the supported subset +of XML will return a syntax error. The purpose of this module is to support most +XML files found in the wild, without supporting the lesser-used features that +lead to problems like "billion laughs" vulnerabilities. If a fully conformant +XML parser is required for your application, you will need to use a third-party +XML implementation. Such an implementation should be able to shadow the standard +library version and present a compatible API. + +The API of this module is compatible with format::xml. 
diff --git a/format/fastxml/chars.ha b/format/fastxml/chars.ha
new file mode 100644
index 0000000..2a2cc90
--- /dev/null
+++ b/format/fastxml/chars.ha
@@ -0,0 +1,32 @@
+// License: MPL-2.0
+// (c) 2021 Drew DeVault <sir@cmpwn.com>
+use ascii;
+
+fn isnamestart(rn: rune) bool = {
+	if (rn == ':' || rn == '_' || ascii::isalpha(rn)) return true;
+	let rn = rn: u32;
+	return
+		(rn >= 0xC0 && rn <= 0xD6) ||
+		(rn >= 0xD8 && rn <= 0xF6) ||
+		(rn >= 0xF8 && rn <= 0x2FF) ||
+		(rn >= 0x370 && rn <= 0x37D) ||
+		(rn >= 0x37F && rn <= 0x1FFF) ||
+		(rn >= 0x200C && rn <= 0x200D) ||
+		(rn >= 0x2070 && rn <= 0x218F) ||
+		(rn >= 0x2C00 && rn <= 0x2FEF) ||
+		(rn >= 0x3001 && rn <= 0xD7FF) ||
+		(rn >= 0xF900 && rn <= 0xFDCF) ||
+		(rn >= 0xFDF0 && rn <= 0xFFFD) ||
+		(rn >= 0x10000 && rn <= 0xEFFFF);
+};
+
+fn isname(rn: rune) bool = {
+	if (isnamestart(rn) || rn == '-' || rn == '.' || ascii::isdigit(rn)) {
+		return true;
+	};
+	let rn = rn: u32;
+	return
+		(rn == 0xB7) ||
+		(rn >= 0x300 && rn <= 0x36F) ||
+		(rn >= 0x203F && rn <= 0x2040);
+};
diff --git a/format/fastxml/parser.ha b/format/fastxml/parser.ha
new file mode 100644
index 0000000..8f7705a
--- /dev/null
+++ b/format/fastxml/parser.ha
@@ -0,0 +1,616 @@
+// License: MPL-2.0
+// (c) 2022 Alexey Yerin <yyp@disroot.org>
+// (c) 2022 Chris Palmer <chris@red-oxide.org>
+// (c) 2021 Drew DeVault <sir@cmpwn.com>
+// (c) 2021 Eyal Sawady <ecs@d2evs.net>
+// (c) 2022 Julian Hurst <ark@mansus.space>
+// (c) 2022 Sebastian <sebastian@sebsite.pw>
+
+// Are you an intrepid programmer seeking to fork this module to create a more
+// sophisticated XML parser supporting a broader set of features? Good news: all
+// of the features you need to implement are annotated throughout with
+// "XXX: Deliberate omission" comments.
+use ascii;
+use encoding::utf8;
+use io;
+use memio;
+use strconv;
+use strings;
+use types;
+
+// Creates an XML parser. The caller must call [[parser_free]] when they are
+// finished with it.
+//
+// Hare's XML parser only supports UTF-8 encoded input files.
+//
+// This function will attempt to read the XML prologue before returning, and
+// will return an error if it is not valid.
+export fn parse(in: io::file) (*parser | error) = {
+	const length = io::seek(in, 0, io::whence::END)?: size;
+	const mapped = io::mmap(null, length, io::prot::READ,
+		io::mflag::PRIVATE, in, 0)?: *[*]u8;
+
+	// XXX: alloc for API compatibility
+	let par = alloc(parser {
+		buf = mapped[..length],
+		unread = types::RUNE_MAX,
+		namebuf = memio::dynamic(),
+		entbuf = memio::dynamic(),
+		textbuf = memio::dynamic(),
+		line = 1,
+		...
+	});
+	match (prolog(par)) {
+	case void => void;
+	case let err: error =>
+		parser_free(par);
+		return err;
+	};
+	return par;
+};
+
+// Frees the resources associated with this parser. Does not close the
+// underlying I/O handle.
+export fn parser_free(par: *parser) void = {
+	io::munmap(par.buf: *[*]u8, len(par.buf))!;
+	io::close(&par.namebuf)!;
+	io::close(&par.entbuf)!;
+	io::close(&par.textbuf)!;
+	for (let i = 0z; i < len(par.tags); i += 1) {
+		free(par.tags[i]);
+	};
+	free(par.tags);
+	free(par);
+};
+
+// Scans for and returns the next [[token]]. Tokens are borrowed from the parser
+// and are not valid on subsequent calls to [[scan]]; use [[strings::dup]] on
+// data you wish to use later.
+export fn scan(par: *parser) (token | void | error) = {
+	switch (par.state) {
+	case state::ROOT, state::ATTRS => want(par, OPTWS)?;
+	case => void;
+	};
+	let rn: rune = match (scanrune(par)?) {
+	case io::EOF =>
+		if (par.state == state::ROOT) {
+			return par.line: syntaxerr;
+		} else {
+			return;
+		};
+	case let rn: rune =>
+		yield rn;
+	};
+	switch (par.state) {
+	case state::ROOT, state::ELEMENT =>
+		switch (rn) {
+		case '<' =>
+			const next = match (scanrune(par)?) {
+			case io::EOF =>
+				return par.line: syntaxerr;
+			case let rn: rune =>
+				unreadrune(par, rn);
+				yield rn;
+			};
+			switch (next) {
+			case '!' =>
+				return scan_comment(par);
+			case '?' =>
+				return scan_pi(par);
+			case => void;
+			};
+			let el = scan_element(par)?;
+			par.state = state::ATTRS;
+			return el;
+		case =>
+			if (par.state == state::ROOT) {
+				return par.line: syntaxerr;
+			};
+			unreadrune(par, rn);
+			return scan_content(par)?;
+		};
+	case state::ATTRS =>
+		if (rn == '/') {
+			want(par, '>')?;
+			par.state = state::ELEMENT;
+			return poptag(par, "")?: elementend;
+		} else if (rn == '>') {
+			par.state = state::ELEMENT;
+			return scan(par)?;
+		} else if (!isnamestart(rn)) {
+			return par.line: syntaxerr;
+		};
+		unreadrune(par, rn);
+		return scan_attr(par)?;
+	};
+};
+
+fn poptag(par: *parser, expect: str) (str | error) = {
+	if (len(par.tags) == 0) {
+		return par.line: syntaxerr;
+	};
+	let pop = par.tags[len(par.tags) - 1];
+	delete(par.tags[len(par.tags) - 1]);
+	defer free(pop);
+	if (expect != "" && expect != pop) {
+		return par.line: syntaxerr;
+	};
+	memio::reset(&par.namebuf);
+	memio::concat(&par.namebuf, pop)!;
+	return memio::string(&par.namebuf)!;
+};
+
+fn scan_attr(par: *parser) (token | error) = {
+	let name = scan_name(par, &par.namebuf)?;
+	want(par, OPTWS, '=', OPTWS)?;
+	let quot = quote(par)?;
+	memio::reset(&par.textbuf);
+	for (true) match (scanrune(par)?) {
+	case io::EOF =>
+		return par.line: syntaxerr;
+	case let rn: rune =>
+		if (rn == quot) break; // only a literal quote ends the value
+		rn = switch (rn) {
+		case '<' =>
+			return par.line: syntaxerr;
+		case '&' =>
+			unreadrune(par, rn);
+			yield scan_entity(par)?;
+		case '\n' =>
+			par.line += 1;
+			yield rn;
+		case =>
+			yield rn;
+		};
+		memio::appendrune(&par.textbuf, rn)?;
+	};
+	return (name, memio::string(&par.textbuf)!): attribute;
+};
+
+fn scan_comment(par: *parser) (token | void | error) = {
+	want(par, "!")?;
+	match (scanrune(par)?) {
+	case io::EOF =>
+		return par.line: syntaxerr;
+	case let rn: rune =>
+		switch (rn) {
+		case '-' => // Comments
+			want(par, '-')?;
+		case '[' =>
+			want(par, "CDATA[")?;
+			if (par.state != state::ELEMENT) {
+				return par.line: syntaxerr;
+			};
+			return scan_cdata(par)?;
+		case =>
+			return par.line: syntaxerr;
+		};
+	};
+	for (true) {
+		const rn = match (scanrune(par)?) {
+		case io::EOF =>
+			return par.line: syntaxerr;
+		case let rn: rune =>
+			if (rn == '\n') par.line += 1;
+			yield rn;
+		};
+		if (rn != '-') continue;
+		const rn = match (scanrune(par)?) {
+		case io::EOF =>
+			return par.line: syntaxerr;
+		case let rn: rune =>
+			if (rn == '\n') par.line += 1;
+			yield rn;
+		};
+		if (rn != '-') continue;
+		const rn = match (scanrune(par)?) {
+		case io::EOF =>
+			return par.line: syntaxerr;
+		case let rn: rune =>
+			yield rn;
+		};
+		switch (rn) {
+		case '>' =>
+			break;
+		case '\n' =>
+			par.line += 1;
+		case => void;
+		};
+	};
+	return scan(par);
+};
+
+// Scans the body of a CDATA section. The opening "<![CDATA[" has already been
+// consumed by [[scan_comment]]; this reads up to and including the closing
+// "]]>" and returns everything in between verbatim (no entity expansion).
+// Reaching the end of input before "]]>" is a syntax error. The returned text
+// is borrowed from the parser's text buffer.
+fn scan_cdata(par: *parser) (text | error) = {
+	memio::reset(&par.textbuf);
+	// Count of consecutive ']' runes read but not yet written out. They
+	// are withheld because the last two may belong to the "]]>" terminator
+	// rather than to the content.
+	let pending = 0z;
+	for (true) {
+		const rn = match (scanrune(par)?) {
+		case io::EOF =>
+			// Unterminated CDATA section
+			return par.line: syntaxerr;
+		case let rn: rune =>
+			yield rn;
+		};
+		if (rn == ']') {
+			pending += 1;
+			continue;
+		};
+		const done = rn == '>' && pending >= 2;
+		if (done) {
+			// Only the final "]]" is part of the terminator
+			pending -= 2;
+		};
+		// Flush the withheld brackets: they are literal content
+		for (pending > 0; pending -= 1) {
+			memio::appendrune(&par.textbuf, ']')!;
+		};
+		if (done) {
+			break;
+		};
+		if (rn == '\n') {
+			par.line += 1;
+		};
+		memio::appendrune(&par.textbuf, rn)!;
+	};
+	return memio::string(&par.textbuf)!: text;
+};
+
+fn scan_content(par: *parser) (text | error) = {
+	memio::reset(&par.textbuf);
+	for (true) match (scanrune(par)?) {
+	case io::EOF =>
+		break;
+	case let rn: rune =>
+		rn = switch (rn) {
+		case '<' =>
+			unreadrune(par, rn);
+			break;
+		case '&' =>
+			unreadrune(par, rn);
+			yield scan_entity(par)?;
+		case '\n' =>
+			par.line += 1;
+			yield rn;
+		case =>
+			yield rn;
+		};
+		memio::appendrune(&par.textbuf, rn)?;
+	};
+	return memio::string(&par.textbuf)!;
+};
+
+fn scan_element(par: *parser) (token | error) = {
+	let close = false;
+	match (scanrune(par)?) {
+	case io::EOF =>
+		return par.line: syntaxerr;
+	case let rn: rune =>
+		switch (rn) {
+		case '/' =>
+			close = true;
+		case '\n' =>
+			par.line += 1;
+			unreadrune(par, rn);
+		case =>
+			unreadrune(par, rn);
+		};
+	};
+	let name = scan_name(par, &par.namebuf)?;
+	if (close) {
+		poptag(par, name)?;
+		return name: elementend;
+	} else {
+		append(par.tags, strings::dup(name));
+		return name: elementstart;
+	};
+};
+
+fn scan_entity(par: *parser) (rune | error) = {
+	want(par, '&')?;
+	let rn = match (scanrune(par)?) {
+	case io::EOF =>
+		return par.line: syntaxerr;
+	case let rn: rune =>
+		yield rn;
+	};
+	switch (rn) {
+	case '#' =>
+		return scan_charref(par);
+	case '\n' =>
+		return par.line: syntaxerr;
+	case =>
+		unreadrune(par, rn);
+		return scan_namedent(par);
+	};
+};
+
+fn scan_paramentity(par: *parser) (rune | error) = {
+	want(par, '%')?;
+	return par.line: syntaxerr; // XXX: Deliberate omission: PEReference
+};
+
+fn scan_charref(par: *parser) (rune | error) = {
+	let base = strconv::base::DEC;
+	match (scanrune(par)?) {
+	case io::EOF =>
+		return par.line: syntaxerr;
+	case let rn: rune =>
+		if (rn == 'x') {
+			base = strconv::base::HEX; // "&#x...;" form
+		} else {
+			unreadrune(par, rn);
+		};
+	};
+	memio::reset(&par.entbuf);
+	for (true) {
+		let rn = match (scanrune(par)?) {
+		case io::EOF =>
+			return par.line: syntaxerr;
+		case let rn: rune =>
+			yield rn;
+		};
+		const hex = base == strconv::base::HEX && ascii::isxdigit(rn);
+		if (hex || ascii::isdigit(rn)) {
+			memio::appendrune(&par.entbuf, rn)?;
+		} else if (rn == ';') {
+			break;
+		} else {
+			return par.line: syntaxerr;
+		};
+	};
+	if (len(memio::string(&par.entbuf)!) == 0) {
+		return par.line: syntaxerr;
+	};
+	match (strconv::stou32b(memio::string(&par.entbuf)!, base)) {
+	case let u: u32 =>
+		return u: rune;
+	case (strconv::invalid | strconv::overflow) =>
+		return par.line: syntaxerr;
+	};
+};
+
+fn scan_namedent(par: *parser) (rune | error) = {
+	const name = scan_name(par, &par.entbuf)?;
+	want(par, ';')?;
+	const map = [
+		("lt", '<'),
+		("gt", '>'),
+		("amp", '&'),
+		("apos", '\''),
+		("quot", '"'),
+	];
+	for (let i = 0z; i < len(map); i += 1) {
+		if (map[i].0 == name) {
+			return map[i].1;
+		};
+	};
+	// XXX: Deliberate omission: this only supports the pre-defined
+	// entities as defined by XML 1.0 (Fifth Edition) section 4.6.
+	return par.line: syntaxerr;
+};
+
+fn scan_name(par: *parser, buf: *memio::stream) (str | error) = {
+	memio::reset(buf);
+
+	const rn = match (scanrune(par)?) {
+	case io::EOF =>
+		return par.line: syntaxerr;
+	case let rn: rune =>
+		yield rn;
+	};
+	if (!isnamestart(rn)) {
+		return par.line: syntaxerr;
+	};
+	memio::appendrune(buf, rn)!;
+
+	for (true) match (scanrune(par)?) {
+	case io::EOF =>
+		return par.line: syntaxerr;
+	case let rn: rune =>
+		if (isname(rn)) {
+			memio::appendrune(buf, rn)!;
+		} else {
+			unreadrune(par, rn);
+			break;
+		};
+	};
+
+	return memio::string(buf)!;
+};
+
+fn scan_pi(par: *parser) (void | error) = {
+	return par.line: syntaxerr; // TODO: Processing instructions
+};
+
+fn prolog(par: *parser) (void | error) = {
+	// XXX: Deliberate omission(s):
+	// - UTF-8 BOM detection
+	// - UTF-16 support
+	want(par, "<?xml", WS)?;
+
+	want(par, "version", OPTWS, '=', OPTWS)?;
+	let quot = quote(par)?;
+	want(par, OPTWS, "1.")?;
+	for (true) match (scanrune(par)?) {
+	case io::EOF =>
+		break;
+	case let rn: rune =>
+		if (!ascii::isdigit(rn)) {
+			unreadrune(par, rn);
+			break;
+		};
+	};
+	want(par, quot)?;
+
+	let hadws = want(par, OPTWS)?;
+	let encoding = match (scanrune(par)?) {
+	case io::EOF =>
+		yield false;
+	case let rn: rune =>
+		unreadrune(par, rn);
+		yield hadws && rn == 'e';
+	};
+	if (encoding) {
+		let attr = scan_attr(par)? as attribute;
+		if (attr.0 != "encoding") {
+			return par.line: syntaxerr;
+		};
+		// XXX: Deliberate omission: all values other than utf-8
+		if (!ascii::validstr(attr.1)) {
+			return utf8::invalid;
+		};
+		if (ascii::strcasecmp(attr.1, "utf-8") != 0) {
+			return utf8::invalid;
+		};
+	};
+
+	let hadws = want(par, OPTWS)?;
+	let standalone = match (scanrune(par)?) {
+	case io::EOF =>
+		yield false;
+	case let rn: rune =>
+		unreadrune(par, rn);
+		yield hadws && rn == 's';
+	};
+	if (standalone) {
+		let attr = scan_attr(par)? as attribute;
+		if (attr.0 != "standalone") {
+			return par.line: syntaxerr;
+		};
+		// XXX: Deliberate omission: non-standalone documents
+		if (!ascii::validstr(attr.1)) {
+			return par.line: syntaxerr;
+		};
+		if (ascii::strcasecmp(attr.1, "yes") != 0) {
+			return par.line: syntaxerr;
+		};
+	};
+
+	want(par, OPTWS, "?>", OPTWS)?;
+	// TODO: Parse doctypedecl & misc
+	return;
+};
+
+// Mandatory if true
+type whitespace = bool;
+def WS: whitespace = true;
+def OPTWS: whitespace = false;
+
+fn quote(par: *parser) (rune | error) = {
+	match (scanrune(par)?) {
+	case let rn: rune =>
+		switch (rn) {
+		case '"', '\'' =>
+			return rn;
+		case =>
+			return par.line: syntaxerr;
+		};
+	case =>
+		return par.line: syntaxerr;
+	};
+};
+
+fn want(par: *parser, tok: (rune | str | whitespace)...) (bool | error) = {
+	let hadws = false;
+	for (let i = 0z; i < len(tok); i += 1) match (tok[i]) {
+	case let x: rune =>
+		let have = match (scanrune(par)?) {
+		case io::EOF =>
+			return par.line: syntaxerr;
+		case let rn: rune =>
+			yield rn;
+		};
+		if (have != x) {
+			return par.line: syntaxerr;
+		};
+		if (x == '\n') {
+			par.line += 1;
+		};
+	case let x: str =>
+		let iter = strings::iter(x);
+		for (true) match (strings::next(&iter)) {
+		case let rn: rune =>
+			want(par, rn)?;
+		case void =>
+			break;
+		};
+	case let ws: whitespace =>
+		let n = 0;
+		for (true; n += 1) match (scanrune(par)?) {
+		case io::EOF =>
+			break;
+		case let rn: rune =>
+			if (!ascii::isspace(rn)) {
+				unreadrune(par, rn);
+				break;
+			};
+			if (rn == '\n') {
+				par.line += 1;
+			};
+		};
+		if (ws && n < 1) {
+			return par.line: syntaxerr;
+		};
+		hadws = n >= 1;
+	};
+	return hadws;
+};
+
+fn scanrune(par: *parser) (rune | io::EOF | error) = {
+	if (par.unread != types::RUNE_MAX) {
+		const rn = par.unread;
+		par.unread = types::RUNE_MAX;
+		return rn;
+	};
+
+	let b: [4]u8 = [0...];
+
+	if (par.cursor >= len(par.buf)) {
+		return io::EOF;
+	};
+	b[0] = par.buf[par.cursor];
+	par.cursor += 1;
+
+	const sz = match (utf8::utf8sz(b[0])) {
+	case let z: size =>
+		yield z;
+	case =>
+		return utf8::invalid;
+	};
+
+	if (sz == 1) {
+		return b[0]: u32: rune;
+	};
+	// A sequence ending exactly at the end of the buffer is still valid
+	if (par.cursor - 1 + sz > len(par.buf)) {
+		return utf8::invalid;
+	};
+	// Copy the sz - 1 continuation bytes; avoids memcpy call overhead
+	for (let i = 0z; i < sz - 1; i += 1) {
+		b[1 + i] = par.buf[par.cursor + i];
+	};
+	par.cursor += sz - 1;
+
+	let dec = utf8::decode(b[..sz]);
+	match (utf8::next(&dec)?) {
+	case let r: rune =>
+		return r;
+	case utf8::more =>
+		return utf8::invalid;
+	case =>
+		return io::EOF;
+	};
+};
+
+fn unreadrune(par: *parser, r: rune) void = {
+	assert(par.unread == types::RUNE_MAX, "Cannot unread more than 1 rune");
+	par.unread = r;
+};
diff --git a/format/fastxml/types.ha b/format/fastxml/types.ha
new file mode 100644
index 0000000..8dcf785
--- /dev/null
+++ b/format/fastxml/types.ha
@@ -0,0 +1,67 @@
+// License: MPL-2.0
+// (c) 2022 Alexey Yerin <yyp@disroot.org>
+// (c) 2022 Chris Palmer <chris@red-oxide.org>
+// (c) 2021 Drew DeVault <sir@cmpwn.com>
+// (c) 2021 Eyal Sawady <ecs@d2evs.net>
+use encoding::utf8;
+use errors;
+use fmt;
+use io;
+use memio;
+use os;
+
+export type parser = struct {
+	buf: []u8,
+	cursor: size,
+	unread: rune,
+	state: state,
+	tags: []str,
+	line: size,
+
+	// memio buffers:
+	namebuf: memio::stream,
+	entbuf: memio::stream,
+	textbuf: memio::stream,
+};
+
+export type state = enum {
+	ROOT,
+	ELEMENT,
+	ATTRS,
+};
+
+// The start of an XML element, e.g. <example
+export type elementstart = str;
+
+// The end of an XML element, e.g. /> or </example>
+export type elementend = str;
+
+// An attribute of an XML element, e.g. foo="bar"
+export type attribute = (str, str);
+
+// Text content of an XML element, e.g. baz or <![CDATA[baz]]>
+export type text = str;
+
+// Any valid XML token
+export type token = (elementstart | elementend | attribute | text);
+
+// A syntax error was encountered in the document.
+export type syntaxerr = !size;
+
+// Any error which can occur during XML parsing.
+export type error = !(syntaxerr | utf8::invalid | io::error | errors::error);
+
+// Converts an [[error]] to a user-friendly string representation.
+export fn strerror(err: error) const str = {
+	static let buf: [2048]u8 = [0...];
+	match (err) {
+	case let err: syntaxerr =>
+		return fmt::bsprintf(buf, "Syntax error on line {}", err: size);
+	case utf8::invalid =>
+		return "Document is not valid UTF-8";
+	case let err: io::error =>
+		return io::strerror(err);
+	case let err: errors::error =>
+		return errors::strerror(err);
+	};
+};