From 77ec331c7dacad6b5028783288eb705bdb9ad22e Mon Sep 17 00:00:00 2001 From: equa Date: Thu, 27 Jul 2023 16:08:28 -0400 Subject: initial commit --- README.md | 48 +++++++++++++++++++++++++++++ csv-min.sh | 23 ++++++++++++++ csv.sh | 101 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ fuzz.lua | 22 ++++++++++++++ test.sh | 41 +++++++++++++++++++++++++ tests.txt | 17 +++++++++++ 6 files changed, 252 insertions(+) create mode 100644 README.md create mode 100755 csv-min.sh create mode 100755 csv.sh create mode 100644 fuzz.lua create mode 100755 test.sh create mode 100644 tests.txt diff --git a/README.md b/README.md new file mode 100644 index 0000000..806cb1d --- /dev/null +++ b/README.md @@ -0,0 +1,48 @@ +# csv.sh + +parse CSV files with pure POSIX shell! + +## the short way + +(see `csv-min.sh`) + +to convert from csv to tab-separated strings: + +``` +LC_ALL=C sed -n 's/'"$(printf "\r")"'$//;s/\\/\\\\/g;s/'"$(printf "\t")"'/\\t/g;H;x;h;s/^\n//;s/\n/\\n/g;s/,/,,/g;s/$/,/;s/^/,/;s/,\([^",]*\("[^"]*\(""[^"]*\)*"[^",]*\)*\),/\1'"$(printf "\t")"'/g;/,$/d;s/.$//;s/,,/,/g;s/"\([^"]*\(""[^"]*\)*\)"/\1/g;s/""/"/g;p;s/.*//;h' +``` + +tabs, newlines, and backslashes are escaped into `\t`, `\n`, and `\\`, respectively. + +to convert back to CSV: + +``` +LC_ALL=C sed 's/"/""/g;s/'"$(printf "\t")"'/","/g;s/^/"/;s/$/"/;s/\\\\/& /g;s/\\n/\n/g;s/\\t/'"$(printf "\t")"'/g;s/\\\\ /\\/g;' +``` + +(this program doesn't output CR LFs, but you can modify it to! `sed 's/$/'"$(printf "\r")"'/'`) + +## what? + +see `csv.sh`. + +## disclaimer + +you shouldn't trust yourself to verify a CSV parser, let alone trust me to write one! + +CSV is an amalgamation of formats, loosely described by [RFC 4180](https://www.rfc-editor.org/rfc/rfc4180). i try to be slightly more lenient than RFC 4180, and i tried my parser on output from a variety of programs, but i don't guarantee correctness for weird files. + +## license + +made by [Natalia Posting](https://equa.space/) in 2023. 
+ +Permission to use, copy, modify, and/or distribute this software for +any purpose with or without fee is hereby granted. + +THE SOFTWARE IS PROVIDED “AS IS” AND THE AUTHOR DISCLAIMS ALL +WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES +OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE +FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY +DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN +AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT +OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. diff --git a/csv-min.sh b/csv-min.sh new file mode 100755 index 0000000..3a12dcd --- /dev/null +++ b/csv-min.sh @@ -0,0 +1,23 @@ +#!/bin/sh + +# Parsing CSV files with sed + +# we convert csv files into a simplified interchange format: +# text fields separated by tabs! we escape tabs, newlines, and backslashes +# into \t, \n, and \\. + +# these functions process on standard input and standard output. +# we allow CR LFs in input but we don't output them by default. + +# see csv.sh for an unminified version. + +# made by Natalia Posting in 2023 + +from_csv () { + LC_ALL=C sed -n 's/'"$(printf "\r")"'$//;s/\\/\\\\/g;s/'"$(printf "\t")"'/\\t/g;H;x;h;s/^\n//;s/\n/\\n/g;s/,/,,/g;s/$/,/;s/^/,/;s/,\([^",]*\("[^"]*\(""[^"]*\)*"[^",]*\)*\),/\1'"$(printf "\t")"'/g;/,$/d;s/.$//;s/,,/,/g;s/"\([^"]*\(""[^"]*\)*\)"/\1/g;s/""/"/g;p;s/.*//;h' +} + + +to_csv () { + LC_ALL=C sed 's/"/""/g;s/'"$(printf "\t")"'/","/g;s/^/"/;s/$/"/;s/\\\\/& /g;s/\\n/\n/g;s/\\t/'"$(printf "\t")"'/g;s/\\\\ /\\/g;' +} diff --git a/csv.sh b/csv.sh new file mode 100755 index 0000000..52b87b9 --- /dev/null +++ b/csv.sh @@ -0,0 +1,101 @@ +#!/bin/sh + +# Parsing CSV files with sed + +# we convert csv files into a simplified interchange format: +# text fields separated by tabs! we escape tabs, newlines, and backslashes +# into \t, \n, and \\. 
+ +# these functions process on standard input and standard output. +# we allow CR LFs in input but we don't output them by default. + +# made by Natalia Posting in 2023 + +from_csv () { + # set the locale to be nice. you can mess with this if you know better + LC_ALL=C sed -n ' + # immediately we escape backslashes and tabs and normalize CRLF + # while tab/CR characters can be input raw in a shell script, it makes + # copying and formatting nasty, so we use printf + s/'"$(printf "\r")"'$//; + s/\\/\\\\/g; + s/'"$(printf "\t")"'/\\t/g; + + # here begins the main loop of our process: + # we read a line and append it to the hold space (and give ourselves a copy) + # if it parses, we are going to destroy the hold space and print the + # formatted version, but if it does not parse we append the next line and repeat + H;x;h; + # because of the way H works we have to delete an initial newline from the pattern space + s/^\n//; + # escape newlines too + s/\n/\\n/g; + + # pretend our input is: + # -> a,"b,c""e",e + # we double every comma we use and surround the line with commas. + # this also affects commas *inside* strings, but we can clean it out later + s/,/,,/g; s/$/,/;s/^/,/; + + # -> ,a,,"b,,c""e",,e, + + # now we do a global match for a valid CSV entry surrounded by commas + # we match: + # -> ,a, + # -> ,"b,,c""e", + # -> ,e, + # + # and then we replace them with the parameters separated by tabs. + # -> a[TAB]"b,,c""e"[TAB]e[TAB] + # + # we do this really leniently -- any sequence of unquoted literals + # and quoted strings in order are allowed (e.g. "a"b"efef"), even + # though it is not really RFC compliant or necessary. + s/,\([^",]*\("[^"]*\(""[^"]*\)*"[^",]*\)*\),/\1'"$(printf "\t")"'/g; + + # now imagine we got an incomplete string or invalid CSV input. + # -> "a,b,c,""etc"" + # and then process it up to before the last command: + # -> ,"a,,b,,c,,""etc"", + # the match starting with the first comma fails since the quote is not finished. 
+ # the next possible match is ",,", and since every comma is doubled, every match + # from there on is guaranteed to just match the pairs of commas + # which means that we never manage to replace the final comma in the line. + + # therefore, any line ending in a comma is incomplete, so we keep the hold space + # and start parsing a new line: + /,$/d; + + # remove trailing tab and fix the commas + s/.$//; + s/,,/,/g; + + # remove extraneous quotes, keeping in mind the double quotation mark rule + s/"\([^"]*\(""[^"]*\)*\)"/\1/g; + # turn double quotes into normal ones + s/""/"/g; + + # print the finished formatted line, and make sure we clear the hold space + p;s/.*//;h + ' +} + +to_csv () { + # set the locale to be nice. you can mess with this if you know better + LC_ALL=C sed ' + # escape existing quotes + s/"/""/g; + # surround with quotes + s/'"$(printf "\t")"'/","/g; + s/^/"/; + s/$/"/; + # replace \\ with itself followed by space. we do this so that we can + # cleanly separate any pair of backslashes before processing other escapes + s/\\\\/& /g; + # process other escapes (you could implement your own) + s/\\n/\n/g;s/\\t/'"$(printf "\t")"'/g; + + # undo the spacing + s/\\\\ /\\/g + ' +} diff --git a/fuzz.lua b/fuzz.lua new file mode 100644 index 0000000..cbc4b43 --- /dev/null +++ b/fuzz.lua @@ -0,0 +1,22 @@ +print("CREATE TABLE stuff (x string, y string, z string);") + +local chars = { "\"", "0", ",", "\t", "\n", "\\" } + +local function literal () + local out = {"'"} + for i = 1, math.random(1, 40) do + table.insert(out, chars[math.random(#chars)]) + end + + table.insert(out, "'") + + return table.concat(out) +end + +for i = 1, tonumber(arg[1]) do + print(string.format("INSERT INTO stuff(x, y, z) VALUES (%s, %s, %s);", + literal(), literal(), literal())) +end + +print(".mode csv") +print("SELECT * from stuff;") diff --git a/test.sh b/test.sh new file mode 100755 index 0000000..a9b4e44 --- /dev/null +++ b/test.sh @@ -0,0 +1,41 @@ +#!/bin/sh + +# basic testing 
framework! +# we use lua, sqlite3 and sqldiff to generate and compare csv files. +# +# TODO: test the behavior of malformed csv files properly + +TARGETS="./csv.sh ./csv-min.sh" +FUZZ=./fuzz.lua + +assert () { + test_name=$1 + shift + # TODO make this nicer somehow + "$@" || printf " test '%s' failed:\n %.100s\n" "$test_name" "$*" +} + +for target in $TARGETS; do + printf "testing %s\n" "$target" + . "$target" + + rm -rf tmp/ + mkdir -p tmp/ + + assert 'fuzz: correct number of entries' \ + [ "$(lua "$FUZZ" 1000 | sqlite3 | from_csv | wc -l)" -eq 1000 ] + + lua "$FUZZ" 1000 | sqlite3 > "tmp/a.csv" + from_csv < "tmp/a.csv" | to_csv > "tmp/b.csv" + for x in a b; do + printf 'create table stuff(a, b, c);\n.import tmp/%s.csv stuff --csv\n' "$x" | sqlite3 "tmp/$x.sqlite3" + done + + assert "processed CSV is the same" [ -z "$(sqldiff tmp/a.sqlite3 tmp/b.sqlite3)" ] + + sed -n '/^< /s///p' tests.txt > tmp/a.txt + sed -n '/^> /s///p' tests.txt | from_csv > tmp/b.txt + assert "tests.txt" diff tmp/a.txt tmp/b.txt + + rm -r tmp/ +done diff --git a/tests.txt b/tests.txt new file mode 100644 index 0000000..6b29d03 --- /dev/null +++ b/tests.txt @@ -0,0 +1,17 @@ +# your text editor or git configuration might break this. be careful! +# (we have weird line endings.) +> a,b,c +< a b c +> a,b,c +< a b c +> "\n" +< \\n +> "\ +> " +< \\\n +> a b c,e f g, +< a\tb\tc e\tf\tg +> """" +< " +> "a,b,","e"f"g",a"b" "" +< a,b, efg ab -- cgit 1.3.0-6-gf8a5