about summary refs log tree commit diff
path: root/csv.sh
diff options
context:
space:
mode:
Diffstat (limited to 'csv.sh')
-rwxr-xr-x csv.sh 101
1 files changed, 101 insertions, 0 deletions
diff --git a/csv.sh b/csv.sh
new file mode 100755
index 0000000..52b87b9
--- /dev/null
+++ b/csv.sh
@@ -0,0 +1,101 @@
+#!/bin/sh
+
+# Parsing CSV files with sed
+
+# we convert csv files into a simplified interchange format:
+# text characters separated by tabs! we escape tabs, newlines, and backslashes
+# into \t, \n, and \\.
+
+# these functions read from standard input and write to standard output.
+# we accept CRLF line endings in input but we don't output them by default.
+
+# made by Natalia Posting in 2023
+
+from_csv () {
+	# Convert CSV on standard input into the interchange format on standard
+	# output: one output line per CSV record, fields separated by tabs, with
+	# backslash, tab, and newline inside a field written as \\, \t, and \n.
+	# Takes no arguments; everything is done by a single sed program.
+	#
+	# A CR at the end of an input line is dropped, including the CR of a line
+	# break inside a quoted field. Because sed runs with -n and unfinished
+	# records are discarded with d, a record whose quotes never close before
+	# end of input produces no output at all.
+	#
+	# set the locale to be nice. you can mess with this if you know better
+	LC_ALL=C sed -n '
+		# immediately we escape backslashes and tabs and normalize CRLF
+		# (at this point the pattern space holds only the freshly read line).
+		# while tab/CR characters can be input raw in a shell script, it makes
+		# copying and formatting nasty, so we use printf
+		s/'"$(printf "\r")"'$//;
+		s/\\/\\\\/g;
+		s/'"$(printf "\t")"'/\\t/g;
+
+		# here begins the main loop of our process:
+		# we read a line and append it to the hold space (and give ourselves a copy)
+		# if it parses, we are going to destroy the hold space and print the
+		# formatted version, but if it does not parse we append the next line and repeat
+		# (H appends the line to the hold space, x swaps so that we work on the
+		# accumulated text, h copies it back so the hold space keeps it too)
+		H;x;h;
+		# because of the way H works we have to delete an initial newline from the space
+		s/^\n//;
+		# escape newlines too (these are line breaks inside a quoted field)
+		s/\n/\\n/g;
+
+		# suppose our input is:
+		# -> a,"b,c""e",e
+		# we double every comma we use and surround the line with commas.
+		# this also affects commas *inside* strings, but we can clean it out later
+		s/,/,,/g; s/$/,/;s/^/,/;
+
+		# -> ,a,,"b,,c""e",,e,
+
+		# now we do a global match for a valid CSV entry surrounded by commas
+		# we match:
+		# -> ,a,
+		# -> ,"b,,c""e",
+		# -> ,e,
+		#
+		# and then we replace them with the parameters separated by tabs.
+		# -> a[TAB]"b,,c""e"[TAB]e[TAB]
+		#
+		# we do this really leniently -- any sequence of unquoted literals
+		# and quoted strings in order are allowed (i.e. "a"b"efef"), even
+		# though it is not really RFC compliant or necessary.
+		s/,\([^",]*\("[^"]*\(""[^"]*\)*"[^",]*\)*\),/\1'"$(printf "\t")"'/g;
+
+		# now imagine we got an incomplete string or invalid CSV input.
+		# -> "a,b,c,""etc""
+		# and then process it up to before the last command:
+		# -> ,"a,,b,,c,,""etc"",
+		# the match starting with the first comma fails since the quote is not finished.
+		# the next possible match is ",,", and since every comma is doubled, every match
+		# from there on is guaranteed to just match the pairs of commas
+		# which means that we never manage to replace the final comma in the line.
+
+		# therefore, any line ending in a comma is incomplete, so we keep the hold space
+		# and start parsing a new line (d ends this cycle without printing; the hold
+		# space still has the unparsed text, so the next input line is appended to it):
+		/,$/d;
+
+		# remove trailing tab and fix the commas (undo the doubling from earlier)
+		s/.$//;
+		s/,,/,/g;
+
+		# remove extraneous quotes, keeping in mind the double quotation mark rule
+		s/"\([^"]*\(""[^"]*\)*\)"/\1/g;
+		# turn double quotes into normal ones
+		s/""/"/g;
+
+		# print the finished formatted line, and make sure we clear the hold space
+		# (empty the pattern space, then copy that emptiness over the hold space)
+		p;s/.*//;h
+		'
+}
+
+to_csv () {
+	# Convert the tab-separated interchange format on standard input back to
+	# CSV on standard output. Takes no arguments. Every field is wrapped in
+	# double quotes (an empty input line becomes ""), existing double quotes
+	# are doubled, and the escapes \\, \n, and \t are expanded. A backslash
+	# followed by any other character is passed through unchanged.
+	#
+	# NOTE(review): the \n in the replacement of the s/\\n/\n/ command below
+	# is not specified by POSIX sed; GNU sed emits a newline there -- confirm
+	# the behaviour on other sed implementations before relying on it.
+	#
+	# set the locale to be nice. you can mess with this if you know better
+	LC_ALL=C sed '
+		# escape existing quotes
+		s/"/""/g;
+		# surround with quotes (each tab becomes a closing quote, a comma,
+		# and an opening quote; then the line start and end get their quotes)
+		s/'"$(printf "\t")"'/","/g;
+		s/^/"/;
+		s/$/"/;
+		# replace \\ with itself followed by space. we do this so that we can
+		# cleanly separate any pair of backslashes before processing other escapes
+		# (otherwise the n in an escaped backslash followed by n would be read
+		# as the start of a \n escape)
+		s/\\\\/& /g;
+		# process other escapes (you could implement your own)
+		s/\\n/\n/g;s/\\t/'"$(printf "\t")"'/g;
+
+		# undo the spacing: each marked pair collapses to one real backslash.
+		# only the marker space is consumed, so a real space that followed
+		# the pair in the input is kept.
+		s/\\\\ /\\/g
+	'
+}