From 1cdf6c38216e47efd2884cb43fcd5239a876e588 Mon Sep 17 00:00:00 2001 From: equa Date: Thu, 27 Jul 2023 16:45:21 -0400 Subject: better docs --- README.md | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'README.md') diff --git a/README.md b/README.md index 806cb1d..91972e3 100644 --- a/README.md +++ b/README.md @@ -2,11 +2,14 @@ parse CSV files with pure POSIX shell! +CSV files have weird quoting rules, so parsing them with `awk` or `cut` +won't cut it on its own. but we can convert them to a format that shell utilities handle well. + ## the short way (see `csv-min.sh`) -to convert from csv to tab-separated strings: +to convert from a csv file to lines of tab-separated values: ``` LC_ALL=C sed -n 's/'"$(printf "\r")"'$//;s/\\/\\\\/g;s/'"$(printf "\t")"'/\\t/g;H;x;h;s/^\n//;s/\n/\\n/g;s/,/,,/g;s/$/,/;s/^/,/;s/,\([^",]*\("[^"]*\(""[^"]*\)*"[^",]*\)*\),/\1'"$(printf "\t")"'/g;/,$/d;s/.$//;s/,,/,/g;s/"\([^"]*\(""[^"]*\)*\)"/\1/g;s/""/"/g;p;s/.*//;h' @@ -14,6 +17,16 @@ LC_ALL=C sed -n 's/'"$(printf "\r")"'$//;s/\\/\\\\/g;s/'"$(printf "\t")"'/\\t/g; tabs, newlines, and backslashes are escaped into `\t`, `\n`, and `\\`, respectively. +> → `foo,bar,"baz ""quuz"" \etc"` + +> ← `foo[TAB]bar[TAB]baz "quuz" \\etc` + +now you can parse with regular shell tools: + +- `cut -f2` +- `awk -F'\t' '{print $2 + $3}'` +- etc. + to convert back to CSV: ``` -- cgit 1.3.0-6-gf8a5