diff options
Diffstat (limited to 'csv.sh')
-rwxr-xr-x | csv.sh | 101 |
1 files changed, 101 insertions, 0 deletions
#!/bin/sh

# Parsing CSV files with sed
#
# We convert CSV files into a simplified interchange format: one record per
# line, fields separated by tabs. Tabs, newlines and backslashes inside a
# field are escaped as \t, \n and \\.
#
# These functions read standard input and write standard output.
# CR LF line endings are accepted on input but are not produced on output.
#
# made by Natalia Posting in 2023

# from_csv: convert CSV on stdin into the tab-separated interchange format.
#
# Input:   CSV text; fields may be quoted, quoted fields may contain commas,
#          doubled quotes ("") and line breaks. A trailing CR on a line is
#          stripped.
# Output:  one line per CSV record, fields joined by a tab, with backslash,
#          tab and newline inside fields written as \\, \t and \n.
# Note:    a record whose quoted field is never closed before end of input
#          is dropped without any output, because the script only prints a
#          record once it parses completely.
from_csv () {
  # Force the C locale so sed works on bytes; adjust if you know better.
  # Tab and CR are injected through printf so the script text itself stays
  # free of raw control characters.
  LC_ALL=C sed -n '
    # normalize CRLF, then escape backslashes and tabs of this input line
    s/'"$(printf "\r")"'$//;
    s/\\/\\\\/g;
    s/'"$(printf "\t")"'/\\t/g;

    # main loop: append the line to the hold space and work on a copy.
    # if the copy parses we print it and clear the hold space, otherwise
    # the hold space is kept and the next input line is appended to it
    H;x;h;
    # H always inserts a newline first, so drop the leading one
    s/^\n//;
    # remaining newlines are line breaks inside a quoted field: escape them
    s/\n/\\n/g;

    # example input:
    #   -> a,"b,c""e",e
    # double every comma and wrap the record in commas. commas inside
    # quoted strings get doubled as well, which is undone further down
    s/,/,,/g; s/$/,/;s/^/,/;
    #   -> ,a,,"b,,c""e",,e,

    # replace every comma-delimited valid field by the field plus a tab:
    #   -> a[TAB]"b,,c""e"[TAB]e[TAB]
    # matching is lenient: any run of unquoted text and quoted strings is
    # accepted as one field (e.g. "a"b"efef"), which is more than the RFC
    # allows
    s/,\([^",]*\("[^"]*\(""[^"]*\)*"[^",]*\)*\),/\1'"$(printf "\t")"'/g;

    # with an unfinished quoted string, e.g.
    #   -> ,"a,,b,,c,,""etc"",
    # the match at the first comma fails, and later matches can only
    # consume the doubled comma pairs, so the final comma is never
    # replaced. a record still ending in a comma is therefore incomplete:
    # keep the hold space and read more input
    /,$/d;

    # drop the trailing tab and restore the doubled commas
    s/.$//;
    s/,,/,/g;

    # strip the enclosing quotes of quoted strings, then collapse the
    # doubled quotes that were inside them
    s/"\([^"]*\(""[^"]*\)*\)"/\1/g;
    s/""/"/g;

    # print the finished record and clear the hold space for the next one
    p;s/.*//;h
  '
}

# to_csv: convert the tab-separated interchange format on stdin into CSV.
#
# Input:   lines of tab-separated fields using the \\, \t and \n escapes.
# Output:  CSV in which every field is enclosed in double quotes, embedded
#          quotes are doubled, and escaped tabs/newlines are written out as
#          the real characters inside the quoted field.
to_csv () {
  # Force the C locale so sed works on bytes; adjust if you know better.
  LC_ALL=C sed '
    # double the existing quotes
    s/"/""/g;
    # turn field separators into "," and quote both ends of the record
    s/'"$(printf "\t")"'/","/g;
    s/^/"/;
    s/$/"/;
    # put a space after every escaped backslash so that the n or t which
    # may follow it cannot be mistaken for an escape of its own
    s/\\\\/& /g;
    # expand the other escapes (more could be added here). the newline in
    # the replacement is written as backslash + real newline, which is the
    # form POSIX defines; \n in a replacement is unspecified and some sed
    # implementations output a plain n for it
    s/\\n/\
/g
    s/\\t/'"$(printf "\t")"'/g;

    # remove the marker spaces and unescape the backslashes
    s/\\\\ /\\/g
  '
}