// License: MPL-2.0
// (c) 2022 Alexey Yerin
// (c) 2022 Chris Palmer
// (c) 2021 Drew DeVault
// (c) 2021 Eyal Sawady
// (c) 2022 Julian Hurst
// (c) 2022 Sebastian

// Are you an intrepid programmer seeking to fork this module to create a more
// sophisticated XML parser supporting a broader set of features? Good news: all
// of the features you need to implement are annotated throughout with
// "XXX: Deliberate omission" comments.

use ascii;
use encoding::utf8;
use io;
use memio;
use strconv;
use strings;
use types;

// Creates an XML parser. The caller must call [[parser_free]] when they are
// finished with it.
//
// Hare's XML parser only supports UTF-8 encoded input files.
//
// The whole file is mapped read-only into memory; its length is found by
// seeking to the end of "in", so the file offset is moved as a side effect.
//
// This function will attempt to read the XML prologue before returning, and
// will return an error if it is not valid.
export fn parse(in: io::file) (*parser | error) = {
	const length = io::seek(in, 0, io::whence::END)?: size;
	const mapped = io::mmap(null, length, io::prot::READ,
		io::mflag::PRIVATE, in, 0)?: *[*]u8;

	// XXX: alloc for API compatibility
	let par = match (alloc(parser {
		buf = mapped[..length],
		// RUNE_MAX marks the one-rune pushback slot as empty
		unread = types::RUNE_MAX,
		namebuf = memio::dynamic(),
		entbuf = memio::dynamic(),
		textbuf = memio::dynamic(),
		line = 1,
		...
	})) {
	case let par: *parser =>
		yield par;
	case nomem =>
		// Nothing owns the mapping yet, so release it here rather than
		// leaking it when the parser itself cannot be allocated.
		io::munmap(mapped, length)!;
		return nomem;
	};

	match (prolog(par)) {
	case void => void;
	case let err: error =>
		// parser_free also unmaps the file
		parser_free(par);
		return err;
	};
	return par;
};

// Frees the resources associated with this parser: the memory mapping of the
// input, the internal name/entity/text buffers, and the stack of open tag
// names. Does not close the underlying I/O handle.
export fn parser_free(par: *parser) void = {
	io::munmap(par.buf: *[*]u8, len(par.buf))!;
	io::close(&par.namebuf)!;
	io::close(&par.entbuf)!;
	io::close(&par.textbuf)!;
	for (let i = 0z; i < len(par.tags); i += 1) {
		free(par.tags[i]);
	};
	free(par.tags);
	free(par);
};

// Scans for and returns the next [[token]]. Tokens are borrowed from the parser
// and are not valid on subsequent calls to [[scan]]; use [[strings::dup]] on
// data you wish to use later.
export fn scan(par: *parser) (token | void | error) = {
	// Leading whitespace is insignificant before the root element and
	// between attributes, but is content inside an element.
	switch (par.state) {
	case state::ROOT, state::ATTRS =>
		want(par, OPTWS)?;
	case => void;
	};
	let rn: rune = match (scanrune(par)?) {
	case io::EOF =>
		// EOF before any root element was seen is an error; afterwards
		// it simply ends the token stream.
		if (par.state == state::ROOT) {
			return par.line: syntaxerr;
		} else {
			return;
		};
	case let rn: rune =>
		yield rn;
	};
	switch (par.state) {
	case state::ROOT, state::ELEMENT =>
		switch (rn) {
		case '<' =>
			// Peek at the rune following '<' to pick the construct
			const next = match (scanrune(par)?) {
			case io::EOF =>
				return par.line: syntaxerr;
			case let rn: rune =>
				unreadrune(par, rn);
				yield rn;
			};
			switch (next) {
			case '!' =>
				return scan_comment(par);
			case '?' =>
				return scan_pi(par);
			case => void;
			};
			let el = scan_element(par)?;
			par.state = state::ATTRS;
			return el;
		case =>
			// Character data is only allowed inside an element
			if (par.state == state::ROOT) {
				return par.line: syntaxerr;
			};
			unreadrune(par, rn);
			return scan_content(par)?;
		};
	case state::ATTRS =>
		if (rn == '/') {
			// Self-closing tag: "/>" ends the most recent element
			want(par, '>')?;
			par.state = state::ELEMENT;
			return poptag(par, "")?: elementend;
		} else if (rn == '>') {
			par.state = state::ELEMENT;
			return scan(par)?;
		} else if (!isnamestart(rn)) {
			return par.line: syntaxerr;
		};
		unreadrune(par, rn);
		return scan_attr(par)?;
	};
};

// Pops the innermost open tag name off the parser's tag stack. If "expect" is
// non-empty it must equal the popped name, otherwise a syntax error is
// returned. The popped name is copied into the parser's name buffer and the
// returned string borrows from that buffer.
fn poptag(par: *parser, expect: str) (str | error) = {
	if (len(par.tags) == 0) {
		return par.line: syntaxerr;
	};
	let pop = par.tags[len(par.tags) - 1];
	delete(par.tags[len(par.tags) - 1]);
	defer free(pop);
	if (expect != "" && expect != pop) {
		return par.line: syntaxerr;
	};
	memio::reset(&par.namebuf);
	memio::concat(&par.namebuf, pop)!;
	return memio::string(&par.namebuf)!;
};

// Scans one attribute of the form name = "value" (either quote style). The
// name borrows from the parser's name buffer and the value from its text
// buffer.
fn scan_attr(par: *parser) (token | error) = {
	let name = scan_name(par, &par.namebuf)?;
	want(par, OPTWS, '=', OPTWS)?;
	let quot = quote(par)?;
	memio::reset(&par.textbuf);
	for (true) match (scanrune(par)?) {
	case io::EOF =>
		return par.line: syntaxerr;
	case let rn: rune =>
		// Test for the closing quote on the raw input, before entity
		// expansion, so that &quot; and &apos; do not end the value.
		if (rn == quot) break;
		rn = switch (rn) {
		case '<' =>
			return par.line: syntaxerr;
		case '&' =>
			unreadrune(par, rn);
			yield scan_entity(par)?;
		case '\n' =>
			par.line += 1;
			yield rn;
		case =>
			yield rn;
		};
		memio::appendrune(&par.textbuf, rn)?;
	};
	return (name, memio::string(&par.textbuf)!): attribute;
};

// Handles input following "<" whose next rune is "!": either a comment, which
// is skipped before scanning continues with the next token, or a CDATA
// section, which is returned as text.
fn scan_comment(par: *parser) (token | void | error) = {
	want(par, "!")?;
	match (scanrune(par)?) {
	case io::EOF =>
		return par.line: syntaxerr;
	case let rn: rune =>
		switch (rn) {
		case '-' => // Comments
			want(par, '-')?;
		case '[' =>
			want(par, "CDATA[")?;
			// CDATA is character data, so only valid in an element
			if (par.state != state::ELEMENT) {
				return par.line: syntaxerr;
			};
			return scan_cdata(par)?;
		case =>
			return par.line: syntaxerr;
		};
	};
	// Skip runes until the "-->" terminator, keeping the line count
	for (true) {
		const rn = match (scanrune(par)?) {
		case io::EOF =>
			return par.line: syntaxerr;
		case let rn: rune =>
			if (rn == '\n') par.line += 1;
			yield rn;
		};
		if (rn != '-') continue;
		const rn = match (scanrune(par)?) {
		case io::EOF =>
			return par.line: syntaxerr;
		case let rn: rune =>
			if (rn == '\n') par.line += 1;
			yield rn;
		};
		if (rn != '-') continue;
		const rn = match (scanrune(par)?) {
		case io::EOF =>
			return par.line: syntaxerr;
		case let rn: rune =>
			yield rn;
		};
		switch (rn) {
		case '>' =>
			break;
		case '\n' =>
			par.line += 1;
		case => void;
		};
	};
	return scan(par);
};

// Scans the body of a CDATA section up to the "]]>" terminator and returns it
// verbatim (no entity expansion). The result borrows from the parser's text
// buffer. EOF before the terminator is a syntax error.
fn scan_cdata(par: *parser) (text | error) = {
	memio::reset(&par.textbuf);
	// Count of trailing ']' runes (at most 2) read but not yet written to
	// the text buffer, since they may turn out to be part of "]]>".
	let brackets = 0z;
	for (true) {
		const rn = match (scanrune(par)?) {
		case io::EOF =>
			return par.line: syntaxerr;
		case let rn: rune =>
			yield rn;
		};
		if (rn == ']') {
			if (brackets == 2) {
				// "]]]": the oldest bracket is content after all
				memio::appendrune(&par.textbuf, ']')!;
			} else {
				brackets += 1;
			};
			continue;
		};
		if (rn == '>' && brackets == 2) break;
		// Not a terminator: the held-back brackets are plain content
		for (brackets > 0; brackets -= 1) {
			memio::appendrune(&par.textbuf, ']')!;
		};
		if (rn == '\n') par.line += 1;
		memio::appendrune(&par.textbuf, rn)!;
	};
	return memio::string(&par.textbuf)!: text;
};

// Scans character data up to (not including) the next "<" or EOF, expanding
// entity references. The result borrows from the parser's text buffer.
fn scan_content(par: *parser) (text | error) = {
	memio::reset(&par.textbuf);
	for (true) match (scanrune(par)?) {
	case io::EOF =>
		break;
	case let rn: rune =>
		rn = switch (rn) {
		case '<' =>
			// Leave the '<' for the next call to scan
			unreadrune(par, rn);
			break;
		case '&' =>
			unreadrune(par, rn);
			yield scan_entity(par)?;
		case '\n' =>
			par.line += 1;
			yield rn;
		case =>
			yield rn;
		};
		memio::appendrune(&par.textbuf, rn)?;
	};
	return memio::string(&par.textbuf)!;
};

// Scans the name part of a start tag ("<name") or end tag ("</name"); the
// leading "<" has already been consumed. A start tag pushes a copy of the name
// onto the tag stack; an end tag must match the innermost open tag.
fn scan_element(par: *parser) (token | error) = {
	let close = false;
	match (scanrune(par)?) {
	case io::EOF =>
		return par.line: syntaxerr;
	case let rn: rune =>
		switch (rn) {
		case '/' =>
			close = true;
		case '\n' =>
			par.line += 1;
			unreadrune(par, rn);
		case =>
			unreadrune(par, rn);
		};
	};
	let name = scan_name(par, &par.namebuf)?;
	if (close) {
		poptag(par, name)?;
		return name: elementend;
	} else {
		append(par.tags, strings::dup(name)?)?;
		return name: elementstart;
	};
};

// Scans an entity reference starting at "&" and returns the rune it stands
// for: a numeric character reference if "#" follows, otherwise a named entity.
fn scan_entity(par: *parser) (rune | error) = {
	want(par, '&')?;
	let rn = match (scanrune(par)?) {
	case io::EOF =>
		return par.line: syntaxerr;
	case let rn: rune =>
		yield rn;
	};
	switch (rn) {
	case '#' =>
		return scan_charref(par);
	case '\n' =>
		return par.line: syntaxerr;
	case =>
		unreadrune(par, rn);
		return scan_namedent(par);
	};
};

// Consumes a "%" and then always fails with a syntax error.
fn scan_paramentity(par: *parser) (rune | error) = {
	want(par, '%')?;
	return par.line: syntaxerr; // XXX: Deliberate omission: PEReference
};

// Scans the remainder of a numeric character reference after "&#": decimal
// digits, or "x" followed by hexadecimal digits, terminated by ";".
fn scan_charref(par: *parser) (rune | error) = {
	let base = strconv::base::DEC;
	match (scanrune(par)?) {
	case io::EOF =>
		return par.line: syntaxerr;
	case let rn: rune =>
		if (rn == 'x') {
			base = strconv::base::HEX;
		} else {
			unreadrune(par, rn);
		};
	};
	const hex = base == strconv::base::HEX;
	memio::reset(&par.entbuf);
	for (true) {
		let rn = match (scanrune(par)?) {
		case io::EOF =>
			return par.line: syntaxerr;
		case let rn: rune =>
			yield rn;
		};
		// Hexadecimal references additionally allow a-f and A-F
		if (ascii::isdigit(rn) || (hex && ascii::isxdigit(rn))) {
			memio::appendrune(&par.entbuf, rn)?;
		} else if (rn == ';') {
			break;
		} else {
			return par.line: syntaxerr;
		};
	};
	// "&#;" and "&#x;" carry no digits at all
	if (len(memio::string(&par.entbuf)!) == 0) {
		return par.line: syntaxerr;
	};
	match (strconv::stou32(memio::string(&par.entbuf)!, base)) {
	case let u: u32 =>
		return u: rune;
	case (strconv::invalid | strconv::overflow) =>
		return par.line: syntaxerr;
	};
};

// Scans a named entity reference ("name;", the "&" already consumed) and
// returns the rune for one of the five pre-defined entities.
fn scan_namedent(par: *parser) (rune | error) = {
	const name = scan_name(par, &par.entbuf)?;
	want(par, ';')?;
	const map = [
		("lt", '<'),
		("gt", '>'),
		("amp", '&'),
		("apos", '\''),
		("quot", '"'),
	];
	for (let i = 0z; i < len(map); i += 1) {
		if (map[i].0 == name) {
			return map[i].1;
		};
	};
	// XXX: Deliberate omission: this only supports the pre-defined
	// entities as defined by XML 1.0 (Fifth Edition) section 4.6.
	return par.line: syntaxerr;
};

// Scans an XML name into "buf" (which is reset first) and returns a string
// borrowed from it. The first rune must satisfy isnamestart and the rest
// isname; the first rune that is not part of the name is pushed back. EOF in
// the middle of a name is a syntax error.
fn scan_name(par: *parser, buf: *memio::stream) (str | error) = {
	memio::reset(buf);
	const rn = match (scanrune(par)?) {
	case io::EOF =>
		return par.line: syntaxerr;
	case let rn: rune =>
		yield rn;
	};
	if (!isnamestart(rn)) {
		return par.line: syntaxerr;
	};
	memio::appendrune(buf, rn)!;
	for (true) match (scanrune(par)?) {
	case io::EOF =>
		return par.line: syntaxerr;
	case let rn: rune =>
		if (isname(rn)) {
			memio::appendrune(buf, rn)!;
		} else {
			unreadrune(par, rn);
			break;
		};
	};
	return memio::string(buf)!;
};

// Processing instructions are not implemented: reaching one aborts the
// program.
fn scan_pi(par: *parser) (void | error) = {
	abort(); // TODO: Processor instructions
};

// Parses the XML declaration: a mandatory version of the form 1.N, then an
// optional encoding (only "utf-8", compared case-insensitively) and an
// optional standalone (only "yes") attribute, in that order.
fn prolog(par: *parser) (void | error) = {
	// XXX: Deliberate omission(s):
	// - UTF-8 BOM detection
	// - UTF-16 support
	want(par, "<?xml", WS)?;

	want(par, "version", OPTWS, '=', OPTWS)?;
	let quot = quote(par)?;
	want(par, OPTWS, "1.")?;
	// Skip the minor version digits
	for (true) match (scanrune(par)?) {
	case io::EOF =>
		break;
	case let rn: rune =>
		if (!ascii::isdigit(rn)) {
			unreadrune(par, rn);
			break;
		};
	};
	want(par, quot)?;

	// An attribute is only recognised when separated by whitespace
	let hadws = want(par, OPTWS)?;
	let encoding = match (scanrune(par)?) {
	case io::EOF =>
		yield false;
	case let rn: rune =>
		unreadrune(par, rn);
		yield hadws && rn == 'e';
	};
	if (encoding) {
		let attr = scan_attr(par)? as attribute;
		if (attr.0 != "encoding") {
			return par.line: syntaxerr;
		};
		// XXX: Deliberate omission: all values other than utf-8
		if (!ascii::validstr(attr.1)) {
			return utf8::invalid;
		};
		if (ascii::strcasecmp(attr.1, "utf-8") != 0) {
			return utf8::invalid;
		};
	};

	let hadws = want(par, OPTWS)?;
	let standalone = match (scanrune(par)?) {
	case io::EOF =>
		yield false;
	case let rn: rune =>
		unreadrune(par, rn);
		yield hadws && rn == 's';
	};
	if (standalone) {
		let attr = scan_attr(par)? as attribute;
		if (attr.0 != "standalone") {
			return par.line: syntaxerr;
		};
		// XXX: Deliberate omission: non-standalone documents
		if (!ascii::validstr(attr.1)) {
			return par.line: syntaxerr;
		};
		if (ascii::strcasecmp(attr.1, "yes") != 0) {
			return par.line: syntaxerr;
		};
	};

	want(par, OPTWS, "?>", OPTWS)?;
	// TODO: Parse doctypedecl & misc
	return;
};

// Mandatory if true
type whitespace = bool;
def WS: whitespace = true;
def OPTWS: whitespace = false;

// Reads one rune and returns it if it is a single or double quote; anything
// else, including EOF, is a syntax error.
fn quote(par: *parser) (rune | error) = {
	match (scanrune(par)?) {
	case let rn: rune =>
		switch (rn) {
		case '"', '\'' =>
			return rn;
		case =>
			return par.line: syntaxerr;
		};
	case =>
		return par.line: syntaxerr;
	};
};

// Consumes the given sequence of expected tokens from the input: an exact
// rune, an exact string, or a run of whitespace (mandatory for [[WS]],
// optional for [[OPTWS]]). Any mismatch is a syntax error. Returns whether the
// last whitespace token processed consumed at least one rune.
fn want(par: *parser, tokens: (rune | str | whitespace)...) (bool | error) = {
	let hadws = false;
	for (let tok .. tokens) {
		match (tok) {
		case let x: rune =>
			let have = match (scanrune(par)?) {
			case io::EOF =>
				return par.line: syntaxerr;
			case let rn: rune =>
				yield rn;
			};
			if (have != x) {
				return par.line: syntaxerr;
			};
			if (x == '\n') {
				par.line += 1;
			};
		case let x: str =>
			// Match the string one rune at a time
			let iter = strings::iter(x);
			for (true) match (strings::next(&iter)) {
			case let rn: rune =>
				want(par, rn)?;
			case done =>
				break;
			};
		case let ws: whitespace =>
			// n counts the whitespace runes consumed
			let n = 0;
			for (true; n += 1) match (scanrune(par)?) {
			case io::EOF =>
				break;
			case let rn: rune =>
				if (!ascii::isspace(rn)) {
					unreadrune(par, rn);
					break;
				};
				if (rn == '\n') {
					par.line += 1;
				};
			};
			if (ws && n < 1) {
				return par.line: syntaxerr;
			};
			hadws = n >= 1;
		};
	};
	return hadws;
};

// Returns the next rune of input: the pushed-back rune if there is one,
// otherwise the next UTF-8 sequence decoded from the buffer at the cursor.
// Returns io::EOF once the buffer is exhausted and utf8::invalid for a
// malformed or truncated sequence.
fn scanrune(par: *parser) (rune | io::EOF | error) = {
	// types::RUNE_MAX is the "nothing pushed back" sentinel
	if (par.unread != types::RUNE_MAX) {
		const rn = par.unread;
		par.unread = types::RUNE_MAX;
		return rn;
	};

	let b: [4]u8 = [0...];
	if (par.cursor >= len(par.buf)) {
		return io::EOF;
	};
	b[0] = par.buf[par.cursor];
	par.cursor += 1;

	const sz = match (utf8::utf8sz(b[0])) {
	case let z: size =>
		yield z;
	case =>
		return utf8::invalid;
	};
	if (sz == 1) {
		return b[0]: u32: rune;
	};

	// The lead byte is already consumed; the remaining sz - 1 continuation
	// bytes start at the cursor and must all lie inside the buffer.
	if (par.cursor + sz - 1 > len(par.buf)) {
		return utf8::invalid;
	};
	// avoid memcpy call overhead
	for (let i = 1z; i < sz; i += 1) {
		b[i] = par.buf[par.cursor + i - 1];
	};
	par.cursor += sz - 1;

	let dec = utf8::decode(b[..sz]);
	match (utf8::next(&dec)?) {
	case let r: rune =>
		return r;
	case utf8::more =>
		return utf8::invalid;
	case =>
		return io::EOF;
	};
};

// Pushes one rune back so that the next [[scanrune]] returns it. Only a single
// rune of pushback is supported; a second unread before the next scanrune
// fails the assertion.
fn unreadrune(par: *parser, r: rune) void = {
	assert(par.unread == types::RUNE_MAX,
		"Cannot unread more than 1 rune");
	par.unread = r;
};