about summary refs log tree commit diff
path: root/format/fastxml/parser.ha
diff options
context:
space:
mode:
Diffstat (limited to 'format/fastxml/parser.ha')
-rw-r--r--format/fastxml/parser.ha616
1 files changed, 616 insertions, 0 deletions
diff --git a/format/fastxml/parser.ha b/format/fastxml/parser.ha
new file mode 100644
index 0000000..8f7705a
--- /dev/null
+++ b/format/fastxml/parser.ha
@@ -0,0 +1,616 @@
+// License: MPL-2.0
+// (c) 2022 Alexey Yerin <yyp@disroot.org>
+// (c) 2022 Chris Palmer <chris@red-oxide.org>
+// (c) 2021 Drew DeVault <sir@cmpwn.com>
+// (c) 2021 Eyal Sawady <ecs@d2evs.net>
+// (c) 2022 Julian Hurst <ark@mansus.space>
+// (c) 2022 Sebastian <sebastian@sebsite.pw>
+
+// Are you an intrepid programmer seeking to fork this module to create a more
+// sophisticated XML parser supporting a broader set of features? Good news: all
+// of the features you need to implement are annotated throughout with
+// "XXX: Deliberate omission" comments.
+use ascii;
+use encoding::utf8;
+use io;
+use memio;
+use strconv;
+use strings;
+use types;
+
+// Creates an XML parser over a memory mapping of the given file. The caller
+// must call [[parser_free]] when they are finished with it.
+//
+// Hare's XML parser only supports UTF-8 encoded input files.
+//
+// The XML prologue is read before this function returns; if it is not valid,
+// the parser is freed again and the error is returned instead.
+export fn parse(in: io::file) (*parser | error) = {
+	// Seeking to the end of the file yields the length to map.
+	const length = io::seek(in, 0, io::whence::END)?: size;
+	const mapped = io::mmap(null, length, io::prot::READ,
+		io::mflag::PRIVATE, in, 0)?: *[*]u8;
+
+	// XXX: alloc for API compatibility
+	let par = alloc(parser {
+		buf = mapped[..length],
+		line = 1,
+		// RUNE_MAX marks the one-rune pushback slot as empty.
+		unread = types::RUNE_MAX,
+		textbuf = memio::dynamic(),
+		entbuf = memio::dynamic(),
+		namebuf = memio::dynamic(),
+		...
+	});
+	match (prolog(par)) {
+	case let err: error =>
+		parser_free(par);
+		return err;
+	case void =>
+		return par;
+	};
+};
+
+// Releases everything owned by the parser: the stack of open tag names, the
+// three scratch buffers, the file mapping and the parser object itself. Does
+// not close the underlying I/O handle.
+export fn parser_free(par: *parser) void = {
+	// Each entry on the tag stack is an individually allocated string.
+	for (let i = 0z; i < len(par.tags); i += 1) {
+		free(par.tags[i]);
+	};
+	free(par.tags);
+	io::close(&par.textbuf)!;
+	io::close(&par.entbuf)!;
+	io::close(&par.namebuf)!;
+	io::munmap(par.buf: *[*]u8, len(par.buf))!;
+	free(par);
+};
+
+// Scans for and returns the next [[token]]. Tokens are borrowed from the parser
+// and are not valid on subsequent calls to [[scan]]; use [[strings::dup]] on
+// data you wish to use later.
+//
+// Returns void when the end of the input is reached outside of the ROOT state;
+// reaching the end of the input while still in the ROOT state is a syntax
+// error.
+export fn scan(par: *parser) (token | void | error) = {
+	// Leading whitespace is skipped in the ROOT and ATTRS states only; in
+	// the ELEMENT state it is left in place so that it becomes text.
+	switch (par.state) {
+	case state::ROOT, state::ATTRS => want(par, OPTWS)?;
+	case => void;
+	};
+	let rn: rune = match (scanrune(par)?) {
+	case io::EOF =>
+		if (par.state == state::ROOT) {
+			return par.line: syntaxerr;
+		} else {
+			return;
+		};
+	case let rn: rune =>
+		yield rn;
+	};
+	switch (par.state) {
+	case state::ROOT, state::ELEMENT =>
+		switch (rn) {
+		case '<' =>
+			// Peek at the rune after '<' to tell "<!" and "<?"
+			// apart from element tags; it is pushed back so the
+			// sub-scanner sees it again.
+			const next = match (scanrune(par)?) {
+			case io::EOF =>
+				return par.line: syntaxerr;
+			case let rn: rune =>
+				unreadrune(par, rn);
+				yield rn;
+			};
+			switch (next) {
+			case '!' =>
+				return scan_comment(par);
+			case '?' =>
+				return scan_pi(par);
+			case => void;
+			};
+			// Start and end tags both switch to ATTRS; the closing
+			// '>' (or "/>") is consumed by the ATTRS branch on the
+			// next call.
+			let el = scan_element(par)?;
+			par.state = state::ATTRS;
+			return el;
+		case =>
+			// Anything other than markup is a syntax error in the
+			// ROOT state.
+			if (par.state == state::ROOT) {
+				return par.line: syntaxerr;
+			};
+			unreadrune(par, rn);
+			return scan_content(par)?;
+		};
+	case state::ATTRS =>
+		if (rn == '/') {
+			// "/>": the element closes itself, so pop the tag that
+			// was just pushed without checking its name.
+			want(par, '>')?;
+			par.state = state::ELEMENT;
+			return poptag(par, "")?: elementend;
+		} else if (rn == '>') {
+			// End of the tag: carry on with whatever follows it.
+			par.state = state::ELEMENT;
+			return scan(par)?;
+		} else if (!isnamestart(rn)) {
+			return par.line: syntaxerr;
+		};
+		unreadrune(par, rn);
+		return scan_attr(par)?;
+	};
+};
+
+// Pops the innermost open tag name off the tag stack and returns a copy of it
+// stored in the parser's name buffer. An empty stack is a syntax error, as is
+// a non-empty "expect" that differs from the popped name. The popped string
+// itself is always freed, including on the mismatch path.
+fn poptag(par: *parser, expect: str) (str | error) = {
+	const depth = len(par.tags);
+	if (depth == 0) {
+		return par.line: syntaxerr;
+	};
+	const top = par.tags[depth - 1];
+	defer free(top);
+	delete(par.tags[depth - 1]);
+	if (!(expect == "" || expect == top)) {
+		return par.line: syntaxerr;
+	};
+	// Copy the name into namebuf, as "top" is freed on return.
+	memio::reset(&par.namebuf);
+	memio::concat(&par.namebuf, top)!;
+	return memio::string(&par.namebuf)!;
+};
+
+// Scans a single attribute of the form name = "value" (or 'value') and returns
+// it as an [[attribute]] token. The name is borrowed from the parser's name
+// buffer and the value from its text buffer. Entity and character references
+// in the value are expanded; a literal '<' or the end of the input inside the
+// value is a syntax error.
+fn scan_attr(par: *parser) (token | error) = {
+	let name = scan_name(par, &par.namebuf)?;
+	want(par, OPTWS, '=', OPTWS)?;
+	let quot = quote(par)?;
+	memio::reset(&par.textbuf);
+	for (true) {
+		const rn = match (scanrune(par)?) {
+		case io::EOF =>
+			return par.line: syntaxerr;
+		case let rn: rune =>
+			yield rn;
+		};
+		// Only a literal quote terminates the value. This must be
+		// tested before references are expanded, otherwise "&quot;",
+		// "&apos;" or "&#34;" would end the value prematurely.
+		if (rn == quot) {
+			break;
+		};
+		const out = switch (rn) {
+		case '<' =>
+			return par.line: syntaxerr;
+		case '&' =>
+			unreadrune(par, rn);
+			yield scan_entity(par)?;
+		case '\n' =>
+			par.line += 1;
+			yield rn;
+		case =>
+			yield rn;
+		};
+		memio::appendrune(&par.textbuf, out)?;
+	};
+	return (name, memio::string(&par.textbuf)!): attribute;
+};
+
+// Handles markup that starts with "<!" (the '<' has already been consumed by
+// the caller). A comment is skipped and the token following it is returned by
+// calling [[scan]] again. "<![CDATA[" is only accepted in the ELEMENT state
+// and yields the section's text. Anything else is a syntax error.
+fn scan_comment(par: *parser) (token | void | error) = {
+	want(par, "!")?;
+	match (scanrune(par)?) {
+	case io::EOF =>
+		return par.line: syntaxerr;
+	case let rn: rune =>
+		switch (rn) {
+		case '-' => // Comments
+			want(par, '-')?;
+		case '[' =>
+			want(par, "CDATA[")?;
+			if (par.state != state::ELEMENT) {
+				return par.line: syntaxerr;
+			};
+			return scan_cdata(par)?;
+		case =>
+			return par.line: syntaxerr;
+		};
+	};
+	// Skip the comment body: look for '-', '-', '>' in sequence. Whenever
+	// the sequence is broken, the search restarts with the rune after the
+	// one that broke it. Newlines are counted along the way.
+	for (true) {
+		const rn = match (scanrune(par)?) {
+		case io::EOF =>
+			return par.line: syntaxerr;
+		case let rn: rune =>
+			if (rn == '\n') par.line += 1;
+			yield rn;
+		};
+		if (rn != '-') continue;
+		const rn = match (scanrune(par)?) {
+		case io::EOF =>
+			return par.line: syntaxerr;
+		case let rn: rune =>
+			if (rn == '\n') par.line += 1;
+			yield rn;
+		};
+		if (rn != '-') continue;
+		const rn = match (scanrune(par)?) {
+		case io::EOF =>
+			return par.line: syntaxerr;
+		case let rn: rune =>
+			yield rn;
+		};
+		switch (rn) {
+		case '>' =>
+			break;
+		case '\n' =>
+			par.line += 1;
+		case => void;
+		};
+	};
+	// Comments produce no token of their own.
+	return scan(par);
+};
+
+// Scans the body of a CDATA section (the opening "<![CDATA[" has already been
+// consumed) up to and including the terminating "]]>", and returns the body as
+// text borrowed from the parser's text buffer. No references are expanded.
+// Reaching the end of the input before "]]>" is a syntax error.
+fn scan_cdata(par: *parser) (text | error) = {
+	memio::reset(&par.textbuf);
+	// Number of consecutive ']' runes read but not yet written to the
+	// text buffer. They are held back because the last two of a run may
+	// turn out to belong to the "]]>" terminator.
+	let pending = 0z;
+	for (true) {
+		const rn = match (scanrune(par)?) {
+		case io::EOF =>
+			return par.line: syntaxerr;
+		case let rn: rune =>
+			yield rn;
+		};
+		if (rn == ']') {
+			pending += 1;
+			continue;
+		};
+		if (rn == '>' && pending >= 2) {
+			// Brackets beyond the final two are content, e.g. the
+			// first ']' of "]]]>".
+			for (pending > 2; pending -= 1) {
+				memio::appendrune(&par.textbuf, ']')!;
+			};
+			break;
+		};
+		// Not a terminator: the held-back brackets are content.
+		for (pending > 0; pending -= 1) {
+			memio::appendrune(&par.textbuf, ']')!;
+		};
+		if (rn == '\n') {
+			par.line += 1;
+		};
+		memio::appendrune(&par.textbuf, rn)!;
+	};
+	return memio::string(&par.textbuf)!: text;
+};
+
+// Collects character data into the parser's text buffer until the next '<'
+// (which is pushed back for the caller) or the end of the input, expanding
+// entity and character references along the way. The returned text is
+// borrowed from the text buffer.
+fn scan_content(par: *parser) (text | error) = {
+	memio::reset(&par.textbuf);
+	for (true) {
+		const rn = match (scanrune(par)?) {
+		case io::EOF =>
+			break;
+		case let rn: rune =>
+			yield rn;
+		};
+		if (rn == '<') {
+			unreadrune(par, rn);
+			break;
+		};
+		if (rn == '\n') {
+			par.line += 1;
+		};
+		const out = if (rn == '&') {
+			// Hand the '&' back so scan_entity sees the whole
+			// reference.
+			unreadrune(par, rn);
+			yield scan_entity(par)?;
+		} else {
+			yield rn;
+		};
+		memio::appendrune(&par.textbuf, out)?;
+	};
+	return memio::string(&par.textbuf)!;
+};
+
+// Scans the name part of a start tag ("<name") or end tag ("</name"); the
+// leading '<' has already been consumed. A start tag pushes a copy of its name
+// onto the tag stack and yields [[elementstart]]; an end tag must match the
+// innermost open tag and yields [[elementend]]. The returned name is borrowed
+// from the parser's name buffer.
+fn scan_element(par: *parser) (token | error) = {
+	const first = match (scanrune(par)?) {
+	case let rn: rune =>
+		yield rn;
+	case io::EOF =>
+		return par.line: syntaxerr;
+	};
+	const close = first == '/';
+	if (!close) {
+		if (first == '\n') {
+			par.line += 1;
+		};
+		// Not a '/', so this rune belongs to the name.
+		unreadrune(par, first);
+	};
+	let name = scan_name(par, &par.namebuf)?;
+	if (!close) {
+		append(par.tags, strings::dup(name));
+		return name: elementstart;
+	};
+	poptag(par, name)?;
+	return name: elementend;
+};
+
+// Scans a reference starting at '&' and returns the rune it stands for:
+// "&#...;" is handed to [[scan_charref]], anything else to [[scan_namedent]].
+// A newline or the end of the input directly after the '&' is a syntax error.
+fn scan_entity(par: *parser) (rune | error) = {
+	want(par, '&')?;
+	const next = match (scanrune(par)?) {
+	case let rn: rune =>
+		yield rn;
+	case io::EOF =>
+		return par.line: syntaxerr;
+	};
+	if (next == '#') {
+		return scan_charref(par);
+	};
+	if (next == '\n') {
+		return par.line: syntaxerr;
+	};
+	// First rune of an entity name; give it back to the name scanner.
+	unreadrune(par, next);
+	return scan_namedent(par);
+};
+
+// Consumes the '%' that introduces a parameter-entity reference and then
+// always fails with a syntax error, as such references are not supported.
+fn scan_paramentity(par: *parser) (rune | error) = {
+	want(par, '%')?;
+	return par.line: syntaxerr; // XXX: Deliberate omission: PEReference
+};
+
+// Scans the remainder of a character reference (the leading "&#" has already
+// been consumed) and returns the referenced rune. Both the decimal form
+// "&#NNN;" and the hexadecimal form "&#xHHH;" are accepted. An empty digit
+// string, a non-digit before the ';', a value which overflows u32, or a value
+// which is not a Unicode scalar value is a syntax error.
+fn scan_charref(par: *parser) (rune | error) = {
+	let base = strconv::base::DEC;
+	match (scanrune(par)?) {
+	case io::EOF =>
+		return par.line: syntaxerr;
+	case let rn: rune =>
+		if (rn == 'x') {
+			base = strconv::base::HEX;
+		} else {
+			unreadrune(par, rn);
+		};
+	};
+
+	memio::reset(&par.entbuf);
+	for (true) {
+		let rn = match (scanrune(par)?) {
+		case io::EOF =>
+			return par.line: syntaxerr;
+		case let rn: rune =>
+			yield rn;
+		};
+		if (rn == ';') {
+			break;
+		};
+		// Hexadecimal references may use a-f/A-F in addition to the
+		// decimal digits.
+		const valid = if (base == strconv::base::HEX)
+			ascii::isxdigit(rn) else ascii::isdigit(rn);
+		if (!valid) {
+			return par.line: syntaxerr;
+		};
+		memio::appendrune(&par.entbuf, rn)?;
+	};
+	if (len(memio::string(&par.entbuf)!) == 0) {
+		return par.line: syntaxerr;
+	};
+	match (strconv::stou32b(memio::string(&par.entbuf)!, base)) {
+	case let u: u32 =>
+		// Surrogates and values above U+10FFFF are not Unicode scalar
+		// values and cannot be represented as a valid rune.
+		if (u > 0x10FFFF || (u >= 0xD800 && u <= 0xDFFF)) {
+			return par.line: syntaxerr;
+		};
+		return u: rune;
+	case (strconv::invalid | strconv::overflow) =>
+		return par.line: syntaxerr;
+	};
+};
+
+// Scans the "name;" part of a named entity reference (the '&' has already been
+// consumed) and returns the rune it stands for. Any name other than the five
+// listed below is a syntax error.
+fn scan_namedent(par: *parser) (rune | error) = {
+	const name = scan_name(par, &par.entbuf)?;
+	want(par, ';')?;
+	switch (name) {
+	case "lt" =>
+		return '<';
+	case "gt" =>
+		return '>';
+	case "amp" =>
+		return '&';
+	case "apos" =>
+		return '\'';
+	case "quot" =>
+		return '"';
+	case =>
+		// XXX: Deliberate omission: this only supports the pre-defined
+		// entities as defined by XML 1.0 (Fifth Edition) section 4.6.
+		return par.line: syntaxerr;
+	};
+};
+
+// Scans a name into "buf" (which is reset first) and returns it as a string
+// borrowed from that buffer. The first rune must satisfy isnamestart and the
+// following ones isname; the first rune that does not is pushed back. Reaching
+// the end of the input at any point is a syntax error.
+fn scan_name(par: *parser, buf: *memio::stream) (str | error) = {
+	memio::reset(buf);
+
+	const first = match (scanrune(par)?) {
+	case let rn: rune =>
+		yield rn;
+	case io::EOF =>
+		return par.line: syntaxerr;
+	};
+	if (!isnamestart(first)) {
+		return par.line: syntaxerr;
+	};
+	memio::appendrune(buf, first)!;
+
+	for (true) {
+		const rn = match (scanrune(par)?) {
+		case let rn: rune =>
+			yield rn;
+		case io::EOF =>
+			return par.line: syntaxerr;
+		};
+		if (!isname(rn)) {
+			// The name ends here; leave the terminator for the
+			// caller.
+			unreadrune(par, rn);
+			break;
+		};
+		memio::appendrune(buf, rn)!;
+	};
+
+	return memio::string(buf)!;
+};
+
+// Called when "<?" is encountered after the prologue. Processing instructions
+// are not supported, so this always reports a syntax error at the current
+// line rather than aborting the whole program on such input.
+fn scan_pi(par: *parser) (void | error) = {
+	// XXX: Deliberate omission: processing instructions
+	return par.line: syntaxerr;
+};
+
+// Parses the XML declaration at the very start of the input:
+//
+//	<?xml version="1.N" [encoding="utf-8"] [standalone="yes"] ?>
+//
+// The version attribute is mandatory. The optional encoding must be UTF-8
+// (compared case-insensitively), otherwise utf8::invalid is returned; the
+// optional standalone declaration must be "yes". Any other deviation is a
+// syntax error. Whitespace following the declaration is consumed as well.
+fn prolog(par: *parser) (void | error) = {
+	// XXX: Deliberate omission(s):
+	// - UTF-8 BOM detection
+	// - UTF-16 support
+	want(par, "<?xml", WS)?;
+
+	want(par, "version", OPTWS, '=', OPTWS)?;
+	let quot = quote(par)?;
+	want(par, OPTWS, "1.")?;
+	// Skip the minor version digits.
+	for (true) match (scanrune(par)?) {
+	case io::EOF =>
+		break;
+	case let rn: rune =>
+		if (!ascii::isdigit(rn)) {
+			unreadrune(par, rn);
+			break;
+		};
+	};
+	want(par, quot)?;
+
+	// An attribute must be separated from the previous one by whitespace,
+	// so remember whether any was seen before peeking at the next rune.
+	const encws = want(par, OPTWS)?;
+	const encoding = match (scanrune(par)?) {
+	case io::EOF =>
+		yield false;
+	case let rn: rune =>
+		unreadrune(par, rn);
+		yield encws && rn == 'e';
+	};
+	if (encoding) {
+		let attr = scan_attr(par)? as attribute;
+		if (attr.0 != "encoding") {
+			return par.line: syntaxerr;
+		};
+		// XXX: Deliberate omission: all values other than utf-8
+		if (!ascii::validstr(attr.1)) {
+			return utf8::invalid;
+		};
+		if (ascii::strcasecmp(attr.1, "utf-8") != 0) {
+			return utf8::invalid;
+		};
+	};
+
+	// If no encoding declaration was read, the whitespace consumed above
+	// is the separator in front of a possible standalone declaration;
+	// without this, version="1.0" standalone="yes" would be rejected.
+	const sdws = want(par, OPTWS)? || (encws && !encoding);
+	const standalone = match (scanrune(par)?) {
+	case io::EOF =>
+		yield false;
+	case let rn: rune =>
+		unreadrune(par, rn);
+		yield sdws && rn == 's';
+	};
+	if (standalone) {
+		let attr = scan_attr(par)? as attribute;
+		if (attr.0 != "standalone") {
+			return par.line: syntaxerr;
+		};
+		// XXX: Deliberate omission: non-standalone documents
+		if (!ascii::validstr(attr.1)) {
+			return par.line: syntaxerr;
+		};
+		if (ascii::strcasecmp(attr.1, "yes") != 0) {
+			return par.line: syntaxerr;
+		};
+	};
+
+	want(par, OPTWS, "?>", OPTWS)?;
+	// TODO: Parse doctypedecl & misc
+	return;
+};
+
+// Whitespace token for [[want]]: the value states whether at least one
+// whitespace rune is mandatory (true) or whether whitespace is optional
+// (false).
+type whitespace = bool;
+// Mandatory whitespace: [[want]] fails if none is present.
+def WS: whitespace = true;
+// Optional whitespace: [[want]] skips any that is present.
+def OPTWS: whitespace = false;
+
+// Reads one rune and returns it if it is a single or double quotation mark.
+// Any other rune, or the end of the input, is a syntax error.
+fn quote(par: *parser) (rune | error) = {
+	const rn = match (scanrune(par)?) {
+	case io::EOF =>
+		return par.line: syntaxerr;
+	case let rn: rune =>
+		yield rn;
+	};
+	if (rn == '"' || rn == '\'') {
+		return rn;
+	};
+	return par.line: syntaxerr;
+};
+
+// Consumes the given sequence of expected tokens from the input, failing with
+// a syntax error at the first mismatch:
+//
+// - a rune must be exactly the next rune of the input;
+// - a string must match rune by rune;
+// - a [[whitespace]] token skips a run of ASCII whitespace, which must be at
+//   least one rune long if the token is true ([[WS]]).
+//
+// The line counter is advanced for every newline consumed. The result states
+// whether the last whitespace token processed consumed at least one rune; it
+// is false if no whitespace token was given.
+fn want(par: *parser, tok: (rune | str | whitespace)...) (bool | error) = {
+	let hadws = false;
+	for (let i = 0z; i < len(tok); i += 1) match (tok[i]) {
+	case let x: rune =>
+		let have = match (scanrune(par)?) {
+		case io::EOF =>
+			return par.line: syntaxerr;
+		case let rn: rune =>
+			yield rn;
+		};
+		if (have != x) {
+			return par.line: syntaxerr;
+		};
+		if (x == '\n') {
+			par.line += 1;
+		};
+	case let x: str =>
+		// Match the string one rune at a time via the rune case.
+		let iter = strings::iter(x);
+		for (true) match (strings::next(&iter)) {
+		case let rn: rune =>
+			want(par, rn)?;
+		case void =>
+			break;
+		};
+	case let ws: whitespace =>
+		// n counts the whitespace runes consumed: the afterthought is
+		// skipped when the loop is left via break.
+		let n = 0;
+		for (true; n += 1) match (scanrune(par)?) {
+		case io::EOF =>
+			break;
+		case let rn: rune =>
+			if (!ascii::isspace(rn)) {
+				unreadrune(par, rn);
+				break;
+			};
+			if (rn == '\n') {
+				par.line += 1;
+			};
+		};
+		if (ws && n < 1) {
+			return par.line: syntaxerr;
+		};
+		hadws = n >= 1;
+	};
+	return hadws;
+};
+
+// Returns the next rune of the input, or io::EOF once the buffer is
+// exhausted. A rune pushed back with [[unreadrune]] is returned first. Fails
+// with utf8::invalid if the buffer does not hold a complete, valid UTF-8
+// sequence at the cursor.
+fn scanrune(par: *parser) (rune | io::EOF | error) = {
+	// RUNE_MAX marks the pushback slot as empty.
+	if (par.unread != types::RUNE_MAX) {
+		const rn = par.unread;
+		par.unread = types::RUNE_MAX;
+		return rn;
+	};
+
+	let b: [4]u8 = [0...];
+
+	if (par.cursor >= len(par.buf)) {
+		return io::EOF;
+	};
+	b[0] = par.buf[par.cursor];
+	par.cursor += 1;
+
+	const sz = match (utf8::utf8sz(b[0])) {
+	case let z: size =>
+		yield z;
+	case =>
+		return utf8::invalid;
+	};
+
+	// Fast path for ASCII.
+	if (sz == 1) {
+		return b[0]: u32: rune;
+	};
+
+	// The cursor already points past the lead byte, so sz - 1 continuation
+	// bytes must remain. A sequence ending exactly at the end of the
+	// buffer is complete.
+	if (par.cursor + (sz - 1) > len(par.buf)) {
+		return utf8::invalid;
+	};
+	// avoid memcpy call overhead
+	// Copy only the continuation bytes: b[0] is already set, and b holds
+	// at most 4 bytes.
+	for (let i = 1z; i < sz; i += 1) {
+		b[i] = par.buf[par.cursor + i - 1];
+	};
+	par.cursor += sz - 1;
+
+	let dec = utf8::decode(b[..sz]);
+	match (utf8::next(&dec)?) {
+	case let r: rune =>
+		return r;
+	case utf8::more =>
+		return utf8::invalid;
+	case =>
+		return io::EOF;
+	};
+};
+
+// Pushes a rune back so that the next call to [[scanrune]] returns it again.
+// Only a single rune can be pending at a time: pushing back a second one
+// before the first has been re-read fails the assertion.
+fn unreadrune(par: *parser, r: rune) void = {
+	assert(par.unread == types::RUNE_MAX, "Cannot unread more than 1 rune");
+	par.unread = r;
+};