initial commit

author: equa <equaa@protonmail.com> 2023-07-27 16:08:28 -0400
committer: equa <equaa@protonmail.com> 2023-07-27 16:16:37 -0400
commit: 77ec331c7dacad6b5028783288eb705bdb9ad22e (patch)
tree: da634e1e17752a34e0898592f7735fb1679066e7
6 files changed, 252 insertions, 0 deletions
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..806cb1d
--- /dev/null
+++ b/README.md
@@ -0,0 +1,48 @@
+# csv.sh
+
+parse CSV files with pure POSIX shell!
+
+## the short way
+
+(see `csv-min.sh`)
+
+to convert from csv to tab-separated strings:
+
+```
+LC_ALL=C sed -n 's/'"$(printf "\r")"'$//;s/\\/\\\\/g;s/'"$(printf "\t")"'/\\t/g;H;x;h;s/^\n//;s/\n/\\n/g;s/,/,,/g;s/$/,/;s/^/,/;s/,\([^",]*\("[^"]*\(""[^"]*\)*"[^",]*\)*\),/\1'"$(printf "\t")"'/g;/,$/d;s/.$//;s/,,/,/g;s/"\([^"]*\(""[^"]*\)*\)"/\1/g;s/""/"/g;p;s/.*//;h'
+```
+
+tabs, newlines, and backslashes are escaped into `\t`, `\n`, and `\\`, respectively.
+
+to convert back to CSV:
+
+```
+LC_ALL=C sed 's/"/""/g;s/'"$(printf "\t")"'/","/g;s/^/"/;s/$/"/;s/\\\\/& /g;s/\\n/\n/g;s/\\t/'"$(printf "\t")"'/g;s/\\\\ /\\/g;'
+```
+
+(this program doesn't output CR LFs, but you can modify it to! `sed 's/$/'"$(printf "\r")"'/'`)
+
+## what?
+
+see `csv.sh`.
+
+## disclaimer
+
+you shouldn't trust yourself to verify a CSV parser, let alone trust me to write one!
+
+CSV is an amalgamation of formats, loosely described by [RFC 4180](https://www.rfc-editor.org/rfc/rfc4180). i try to be slightly more lenient than RFC 4180, and i tried my parser on output from a variety of programs, but i don't guarantee correctness for weird files.
+
+## license
+
+made by [Natalia Posting](https://equa.space/) in 2023.
+
+Permission to use, copy, modify, and/or distribute this software for
+any purpose with or without fee is hereby granted.
+
+THE SOFTWARE IS PROVIDED “AS IS” AND THE AUTHOR DISCLAIMS ALL
+WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES
+OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE
+FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY
+DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
+OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
diff --git a/csv-min.sh b/csv-min.sh
new file mode 100755
index 0000000..3a12dcd
--- /dev/null
+++ b/csv-min.sh
@@ -0,0 +1,23 @@
+#!/bin/sh
+
+# Parsing CSV files with sed
+
+# we convert csv files into a simplified interchange format:
+# text characters separated by tabs! we escape tabs, newlines, and backslashes
+# into \t, \n, and \\.
+
+# these functions process on standard input and standard output.
+# we allow CR LFs in input but we don't output them by default.
+
+# see csv.sh for an unminified version.
+
+# made by Natalia Posting in 2023
+
+from_csv () { # CSV on stdin -> one tab-separated record per line on stdout, with tabs, newlines and backslashes escaped as \t, \n and \\ (minified; csv.sh has the annotated sed program)
+	LC_ALL=C sed -n 's/'"$(printf "\r")"'$//;s/\\/\\\\/g;s/'"$(printf "\t")"'/\\t/g;H;x;h;s/^\n//;s/\n/\\n/g;s/,/,,/g;s/$/,/;s/^/,/;s/,\([^",]*\("[^"]*\(""[^"]*\)*"[^",]*\)*\),/\1'"$(printf "\t")"'/g;/,$/d;s/.$//;s/,,/,/g;s/"\([^"]*\(""[^"]*\)*\)"/\1/g;s/""/"/g;p;s/.*//;h'
+}
+
+
+to_csv () { # inverse direction: escaped, tab-separated records on stdin -> CSV on stdout with every field quoted and no CRs added (minified; csv.sh has the annotated sed program)
+	LC_ALL=C sed 's/"/""/g;s/'"$(printf "\t")"'/","/g;s/^/"/;s/$/"/;s/\\\\/& /g;s/\\n/\n/g;s/\\t/'"$(printf "\t")"'/g;s/\\\\ /\\/g;'
+}
diff --git a/csv.sh b/csv.sh
new file mode 100755
index 0000000..52b87b9
--- /dev/null
+++ b/csv.sh
@@ -0,0 +1,101 @@
+#!/bin/sh
+
+# Parsing CSV files with sed
+
+# we convert csv files into a simplified interchange format:
+# text characters separated by tabs! we escape tabs, newlines, and backslashes
+# into \t, \n, and \\.
+
+# these functions process on standard input and standard output.
+# we allow CR LFs in input but we don't output them by default.
+
+# made by Natalia Posting in 2023
+
+from_csv () {
+	# CSV on stdin -> one escaped, tab-separated record per line on stdout. LC_ALL=C sets the locale to be nice; you can mess with this if you know better
+	LC_ALL=C sed -n '
+		# (comments in here are sed comments inside a single-quoted shell string: never use an apostrophe)
+		# first we strip a trailing CR (normalizing CRLF) and escape backslashes and tabs.
+		# raw tab/CR characters would work but make copying and formatting nasty, so we splice in printf output
+		s/'"$(printf "\r")"'$//;
+		s/\\/\\\\/g;
+		s/'"$(printf "\t")"'/\\t/g;
+
+		# here begins the main loop of our process:
+		# we append the current line to the hold space and copy the result back, so that
+		# both spaces contain everything read since the last finished record. if it parses,
+		# we print the formatted version and empty the hold space; if not, we append the next line and repeat
+		H;x;h;
+		# H puts a newline in front of what it appends, so the text starts with a stray newline; delete it
+		s/^\n//;
+		# any newline left now sits between accumulated input lines, so escape those too
+		s/\n/\\n/g;
+
+		# suppose our input is:
+		# -> a,"b,c""e",e
+		# we double every comma and surround the line with commas.
+		# this also affects commas *inside* strings, but we can clean that up later
+		s/,/,,/g; s/$/,/;s/^/,/;
+
+		# -> ,a,,"b,,c""e",,e,
+
+		# now we do a global match for a valid CSV entry surrounded by commas
+		# we match:
+		# -> ,a,
+		# -> ,"b,,c""e",
+		# -> ,e,
+		#
+		# and then we replace each match with the entry followed by a tab.
+		# -> a[TAB]"b,,c""e"[TAB]e[TAB]
+		#
+		# we do this really leniently -- any sequence of unquoted literals
+		# and quoted strings in order is allowed (e.g. "a"b"efef"), even
+		# though it is not really RFC compliant or necessary.
+		s/,\([^",]*\("[^"]*\(""[^"]*\)*"[^",]*\)*\),/\1'"$(printf "\t")"'/g;
+
+		# now imagine we got an incomplete string or invalid CSV input.
+		# -> "a,b,c,""etc""
+		# which, just before the substitution above, looks like:
+		# -> ,"a,,b,,c,,""etc"",
+		# the match starting at the first comma fails since the quote is never closed.
+		# the next possible match is ",,", and since every comma is doubled, every match
+		# from there on is guaranteed to just consume a pair of commas,
+		# which means that we never manage to replace the final comma in the line.
+
+		# therefore, any line still ending in a comma is incomplete, so we keep the hold space
+		# and start a new cycle with the next input line:
+		/,$/d;
+
+		# remove the trailing tab and undo the comma doubling
+		s/.$//;
+		s/,,/,/g;
+
+		# strip the quotes around quoted strings, keeping in mind the double quotation mark rule
+		s/"\([^"]*\(""[^"]*\)*\)"/\1/g;
+		# turn the remaining doubled quotes into single ones
+		s/""/"/g;
+
+		# print the finished record, then empty the hold space so the next record starts fresh
+		p;s/.*//;h
+		'
+}
+
+to_csv () {
+	# escaped, tab-separated records on stdin -> CSV on stdout with every field quoted. LC_ALL=C sets the locale to be nice; you can mess with this if you know better
+	LC_ALL=C sed '
+		# escape existing quotes by doubling them
+		s/"/""/g;
+		# turn each separating tab into "," and wrap the line in quotes, so every field ends up quoted
+		s/'"$(printf "\t")"'/","/g;
+		s/^/"/;
+		s/$/"/;
+		# replace \\ with itself followed by a space. this marks every escaped backslash, so that
+		# its second half followed by n or t is not mistaken for one of the escapes below
+		s/\\\\/& /g;
+		# process other escapes (you could implement your own). NOTE(review): \n in a replacement is not specified by POSIX -- confirm on non-GNU sed
+		s/\\n/\n/g;s/\\t/'"$(printf "\t")"'/g;
+
+		# undo the spacing: drop each marker space and collapse the escaped backslash into a single one
+		s/\\\\ /\\/g
+	'
+}
diff --git a/fuzz.lua b/fuzz.lua
new file mode 100644
index 0000000..cbc4b43
--- /dev/null
+++ b/fuzz.lua
@@ -0,0 +1,22 @@
+print("CREATE TABLE stuff (x text, y text, z text);") -- text, not string: SQLite gives a column declared "string" NUMERIC affinity, which would turn an all-zero value into the integer 0
+
+local chars = { "\"", "0", ",", "\t", "\n", "\\" } -- alphabet for the fuzzed values: the characters the CSV code treats specially, plus a plain "0"
+
+-- Build one SQL string literal: 1 to 40 characters picked at random from `chars`, wrapped in single quotes.
+local function literal ()
+	local parts = {"'"}
+	for _ = 1, math.random(1, 40) do
+		parts[#parts + 1] = chars[math.random(#chars)]
+	end
+	parts[#parts + 1] = "'"
+
+	return table.concat(parts)
+end
+
+local rows = assert(tonumber(arg[1]), "usage: lua fuzz.lua ROW_COUNT") -- readable failure when the count is missing or not a number
+for _ = 1, rows do -- one INSERT per requested row, each with three fresh random literals
+	print(("INSERT INTO stuff(x, y, z) VALUES (%s, %s, %s);"):format(literal(), literal(), literal()))
+end
+
+print(".mode csv") -- sqlite3 dot-command: the SELECT below is printed as CSV
+print("SELECT * from stuff;")
diff --git a/test.sh b/test.sh
new file mode 100755
index 0000000..a9b4e44
--- /dev/null
+++ b/test.sh
@@ -0,0 +1,41 @@
+#!/bin/sh
+
+# basic testing framework!
+# we use lua, sqlite3 and sqldiff to generate and compare csv files.
+#
+# TODO: test the behavior of malformed csv files properly
+
+TARGETS="./csv.sh ./csv-min.sh" # implementations under test; the loop below word-splits this list and sources each file in turn
+FUZZ=./fuzz.lua # lua script that prints the SQL used to generate random CSV input
+
+assert () { # usage: assert NAME COMMAND [ARG...] -- runs COMMAND and reports NAME if it exits non-zero
+	test_name=$1
+	shift
+	# TODO make this nicer somehow. on failure we only print the name and at most 100 characters of the command (%.100s)
+	"$@" || printf "  test '%s' failed:\n    %.100s\n" "$test_name" "$*"
+}
+
+for target in $TARGETS; do
+	printf "testing %s\n" "$target"
+	. "$target" # (re)defines from_csv and to_csv with this target's implementation
+	# start every target from an empty scratch directory
+	rm -rf tmp/
+	mkdir -p tmp/
+	# from_csv must emit exactly one line per row, even when fields contain newlines
+	assert 'fuzz: correct number of entries' \
+		[ "$(lua "$FUZZ" 1000 | sqlite3 | from_csv | wc -l)" -eq 1000 ]
+	# round trip: importing a.csv and its from_csv | to_csv copy must give identical tables
+	lua "$FUZZ" 1000 | sqlite3 > "tmp/a.csv"
+	from_csv < "tmp/a.csv" | to_csv > "tmp/b.csv"
+	for x in a b; do
+		# printf, not echo: whether echo expands \n differs between shells
+		printf 'create table stuff(a, b, c);\n.import tmp/%s.csv stuff --csv\n' "$x" | sqlite3 "tmp/$x.sqlite3"
+	done
+	assert "processed CSV is the same" [ -z "$(sqldiff tmp/a.sqlite3 tmp/b.sqlite3)" ]
+	# hand-written cases: "> " lines are CSV input, "< " lines are the expected from_csv output
+	sed -n '/^< /s///p' tests.txt > tmp/a.txt
+	sed -n '/^> /s///p' tests.txt | from_csv > tmp/b.txt
+	assert "tests.txt" diff tmp/a.txt tmp/b.txt
+
+	rm -r tmp/
+done
diff --git a/tests.txt b/tests.txt
new file mode 100644
index 0000000..6b29d03
--- /dev/null
+++ b/tests.txt
@@ -0,0 +1,17 @@
+# your text editor or git configuration might break this. be careful!
+# (we have weird line endings.)
+> a,b,c
+< a	b	c
+> a,b,c
+< a	b	c
+> "\n"
+< \\n
+> "\
+> "
+< \\\n
+> a	b	c,e	f	g,
+< a\tb\tc	e\tf\tg	
+> """"
+< "
+> "a,b,","e"f"g",a"b" ""
+< a,b,	efg	ab
author	equa <equaa@protonmail.com>	2023-07-27 16:08:28 -0400
committer	equa <equaa@protonmail.com>	2023-07-27 16:16:37 -0400
commit	77ec331c7dacad6b5028783288eb705bdb9ad22e (patch)
tree	da634e1e17752a34e0898592f7735fb1679066e7