(fn fv [...] (print ((require :fennel.view) ...))) ;; this program optimizes regex using a few fixed patterns ;; (that can be adjusted) ;; its input is a list of possible strings we want to match (specifically as a ;; unicode data file: read emoji-test.txt to see how that looks) ;; this program is specifically optimized to make a template for the YAGPDB ;; discord bot, which is why i'm trying to make it shorter in the first place. ;; there are a few mismatches between things that discord thinks are emoji and ;; things that unicode thinks are emoji, like regional indicators. i add these ;; outside of the program to save myself the thought. ;; a replacement has its id, which it is replaced with in the string ;; it has to be something that doesn't appear in the regex. thankfully, we ;; escape all of the ascii characters from the emoji, so only \, x, hex ;; numbers, and regex characters are unsfae. here we'll use capital letters ;; from g to w ;; then it has the Set of possibilities that we want to replace, and the regex ;; for those ;; we do a replacement when all of the possibilities can be collapsed into one ;; we also have (local replacements ;; gender [{:id "g" :options ["\u{200d}\u{2640}" "\u{200d}\u{2642}"] :regex "[\u{2640}\u{2642}]"} ;; skin tone (including null) {:id "s" :options ["" "\u{1f3fb}" "\u{1f3fc}" "\u{1f3fd}" "\u{1f3fe}" "\u{1f3ff}"] :regex "[\u{1f3fb}\u{1f3fc}\u{1f3fd}\u{1f3fe}\u{1f3ff}]?"} ;; skin tone (excluding null) {:id "S" :options ["\u{1f3fb}" "\u{1f3fc}" "\u{1f3fd}" "\u{1f3fe}" "\u{1f3ff}"] :regex "[\u{1f3fb}\u{1f3fc}\u{1f3fd}\u{1f3fe}\u{1f3ff}]"} ;; variant selector thing {:id "v" :options ["" "\u{fe0f}"] :regex "\u{fe0f}?"} ;; man, woman ;; there's so much gender weirdness in unicode combinations that including ;; "person" here makes things longer :/ {:id "p" :options ["\u{1f468}" "\u{1f469}"] :regex "[\u{1f468}\u{1f469}]"} ;; just person ;; we do this because yagpdb's interface large codepoints as 2 chars {:id "P" :options ["\u{1f9d1}"] :regex "\u{1f9d1}"} ]) (local emoji (icollect [line (io.lines "emoji-test.txt")] (do (var x "") (each [guy (string.gmatch line "[^ ]+") :until (. {";" true "#" true} (string.sub guy 1 1))] (let [num (tonumber guy 16)] (set x (.. x (if (< num 128) (string.format "\\x%02x" num) (utf8.char num)))))) (if (> (length x) 0) x nil)))) ;; find possible matches of options. returns list of prefix-suffix pairs to put ;; around possibilities etc (fn find-matches [str replacement] (local out []) (each [_ option (ipairs replacement.options)] (var start 1) (while (string.find str option start true) (let [(x y) (string.find str option start true)] (set start (+ x 1)) (table.insert out [(string.sub str 1 (- x 1)) (string.sub str (+ y 1))])))) out) ;; with a list of pairs and a list of options, generate a list of reductions, ;; ignoring whether they are actually valid (fn iterate-options [pattern-pairs replacement] (icollect [_ pair (ipairs pattern-pairs)] {:pattern (.. (. pair 1) replacement.id (. pair 2)) :matches (icollect [_ option (ipairs replacement.options)] (.. (. pair 1) option (. pair 2)))})) ;; reduce set of patterns via one replacement. only does one round, which is to ;; say that if there are multiple of the same replacement that can be done on ;; one string (i.e. two different skin tones in the kissing emoji sequences), ;; we'll have to run it twice. for this reason we also return a boolean saying ;; if we actually modified the data at all. (fn reduce-set [patterns replacement] (var out []) (var dirty? false) (local map (collect [_ k (ipairs patterns)] (values k true))) (each [_ pattern (ipairs patterns)] (local possibilities (iterate-options (find-matches pattern replacement) replacement)) (each [_ reduction (ipairs possibilities)] (when (accumulate [ok true _ v (ipairs reduction.matches)] (and ok (. map v))) (table.insert out reduction.pattern) (each [_ v (ipairs reduction.matches)] (tset map v false)))) (if (. map pattern) (do (table.insert out pattern) (tset map pattern false)) (set dirty? true))) (values out dirty?)) (fn fully-reduce-set [patterns replacement] (var out patterns) (var dirty? true) (while dirty? (set (out dirty?) (reduce-set out replacement))) out) (fn reduce-all [patterns replacements] (accumulate [out patterns _ r (ipairs replacements)] (fully-reduce-set out r))) ;; TODO: this is a bad name ;; we find the longest possible string for any set of replacements ;; for greedy matching/sorting weirdness (fn process-replacements [pattern replacements] (accumulate [out pattern _ r (ipairs replacements)] (string.gsub pattern r.id (accumulate [x "" _ v (ipairs r.options)] (if (< (utf8.len x) (utf8.len v)) v x))))) ;; sort by length and then by unicode value (longest first) (fn utf8-comp [a b replacements] (local a (process-replacements a replacements)) (local b (process-replacements b replacements)) (if (not= (utf8.len a) (utf8.len b)) (> (utf8.len a) (utf8.len b)) (not= (utf8.codepoint a) (utf8.codepoint b)) (< (utf8.codepoint a) (utf8.codepoint b)) (= (utf8.len a) 1) false true (utf8-comp (string.sub a (utf8.offset a 2)) (string.sub b (utf8.offset b 2)) []))) ;; inner unescaped regex with ranges etc ;; we do the longest patterns first since yagpdb's regex engine wants to ;; match the first allowed OR clause (fn inner-regex [final-set replacements] (local things (icollect [_ v (ipairs final-set)] v)) (table.sort things #(utf8-comp $1 $2 replacements)) (var map (collect [_ v (ipairs final-set)] (values v true))) (var replacement-map (collect [_ v (ipairs replacements)] (values v.id v))) (var out [""]) (each [_ v (ipairs things)] (when (or (< 1 (utf8.len v)) (. replacement-map v)) (table.insert out (.. v "|")))) (table.insert out "[") (each [_ v (ipairs things)] (when (and (= 1 (utf8.len v)) (not (. replacement-map v))) (let [min-codepoint (utf8.codepoint v) max-codepoint (do (var x min-codepoint) (while (. map (utf8.char x)) (set x (+ x 1))) (- x 1))] (if (< 2 (- max-codepoint min-codepoint)) (do (table.insert out (.. (utf8.char min-codepoint) "-" (utf8.char max-codepoint))) (for [i min-codepoint max-codepoint] (tset map (utf8.char i) false))) (do (table.insert out v) (tset map v false)))))) (table.insert out "]") (table.concat out)) ;;; yagpdb-specific stuff from here on out! (fn yagpdb-escape [str] (-> str (string.gsub "\\" "\\\\") (string.gsub "\"" "\\\""))) (fn yagpdb-output [patterns replacements] (var out []) (table.insert out (string.format "{{ $regex := \"%s\" }}" (yagpdb-escape (inner-regex (reduce-all emoji replacements) replacements)))) (icollect [_ replacement (pairs replacements) :into out] (string.format "{{ $regex = joinStr \"%s\" (split $regex \"%s\") }}" (yagpdb-escape replacement.regex) (yagpdb-escape replacement.id))) (table.concat out "\n")) (io.write (yagpdb-output emoji replacements))