From 7487469113eed2daea6c46d2dc50818f4cd62951 Mon Sep 17 00:00:00 2001 From: equa Date: Fri, 24 Dec 2021 22:59:02 -0600 Subject: init --- make-regex.fnl | 209 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 209 insertions(+) create mode 100644 make-regex.fnl (limited to 'make-regex.fnl') diff --git a/make-regex.fnl b/make-regex.fnl new file mode 100644 index 0000000..b0e03f3 --- /dev/null +++ b/make-regex.fnl @@ -0,0 +1,209 @@ +(fn fv [...] (print ((require :fennel.view) ...))) + +;; this program optimizes regex using a few fixed patterns +;; (that can be adjusted) +;; its input is a list of possible strings we want to match (specifically as a +;; unicode data file: read emoji-test.txt to see how that looks) + +;; this program is specifically optimized to make a template for the YAGPDB +;; discord bot, which is why i'm trying to make it shorter in the first place. +;; there are a few mismatches between things that discord thinks are emoji and +;; things that unicode thinks are emoji, like regional indicators. i add these +;; outside of the program to save myself the thought. + +;; a replacement has its id, which it is replaced with in the string +;; it has to be something that doesn't appear in the regex. thankfully, we +;; escape all of the ascii characters from the emoji, so only \, x, hex +;; numbers, and regex characters are unsfae. here we'll use capital letters +;; from g to w +;; then it has the Set of possibilities that we want to replace, and the regex +;; for those +;; we do a replacement when all of the possibilities can be collapsed into one +;; we also have +(local replacements + ;; gender + [{:id "g" + :options ["\u{200d}\u{2640}" + "\u{200d}\u{2642}"] + :regex "[\u{2640}\u{2642}]"} + ;; skin tone (including null) + {:id "s" + :options ["" "\u{1f3fb}" "\u{1f3fc}" "\u{1f3fd}" "\u{1f3fe}" "\u{1f3ff}"] + :regex "[\u{1f3fb}\u{1f3fc}\u{1f3fd}\u{1f3fe}\u{1f3ff}]?"} + ;; skin tone (excluding null) + {:id "S" + :options ["\u{1f3fb}" "\u{1f3fc}" "\u{1f3fd}" "\u{1f3fe}" "\u{1f3ff}"] + :regex "[\u{1f3fb}\u{1f3fc}\u{1f3fd}\u{1f3fe}\u{1f3ff}]"} + ;; variant selector thing + {:id "v" + :options ["" "\u{fe0f}"] + :regex "\u{fe0f}?"} + ;; man, woman + ;; there's so much gender weirdness in unicode combinations that including + ;; "person" here makes things longer :/ + {:id "p" + :options ["\u{1f468}" "\u{1f469}"] + :regex "[\u{1f468}\u{1f469}]"} + ;; just person + ;; we do this because yagpdb's interface large codepoints as 2 chars + {:id "P" + :options ["\u{1f9d1}"] + :regex "\u{1f9d1}"} + ]) + +(local emoji + (icollect [line (io.lines "emoji-test.txt")] + (do + (var x "") + (each [guy (string.gmatch line "[^ ]+") :until (. {";" true + "#" true} + (string.sub guy 1 1))] + (let [num (tonumber guy 16)] + (set x (.. x (if (< num 128) + (string.format "\\x%02x" num) + (utf8.char num)))))) + + (if (> (length x) 0) + x + nil)))) + +;; find possible matches of options. returns list of prefix-suffix pairs to put +;; around possibilities etc +(fn find-matches [str replacement] + (local out []) + (each [_ option (ipairs replacement.options)] + (var start 1) + (while (string.find str option start true) + (let [(x y) (string.find str option start true)] + (set start (+ x 1)) + (table.insert out [(string.sub str 1 (- x 1)) (string.sub str (+ y 1))])))) + out) + +;; with a list of pairs and a list of options, generate a list of reductions, +;; ignoring whether they are actually valid +(fn iterate-options [pattern-pairs replacement] + (icollect [_ pair (ipairs pattern-pairs)] + {:pattern (.. (. pair 1) replacement.id (. pair 2)) + :matches + (icollect [_ option (ipairs replacement.options)] + (.. (. pair 1) option (. pair 2)))})) + +;; reduce set of patterns via one replacement. only does one round, which is to +;; say that if there are multiple of the same replacement that can be done on +;; one string (i.e. two different skin tones in the kissing emoji sequences), +;; we'll have to run it twice. for this reason we also return a boolean saying +;; if we actually modified the data at all. +(fn reduce-set [patterns replacement] + (var out []) + (var dirty? false) + (local map (collect [_ k (ipairs patterns)] (values k true))) + (each [_ pattern (ipairs patterns)] + (local possibilities (iterate-options (find-matches pattern replacement) + replacement)) + (each [_ reduction (ipairs possibilities)] + (when (accumulate [ok true _ v (ipairs reduction.matches)] + (and ok (. map v))) + (table.insert out reduction.pattern) + (each [_ v (ipairs reduction.matches)] + (tset map v false)))) + (if (. map pattern) + (do + (table.insert out pattern) + (tset map pattern false)) + (set dirty? true))) + (values out dirty?)) + +(fn fully-reduce-set [patterns replacement] + (var out patterns) + (var dirty? true) + (while dirty? + (set (out dirty?) (reduce-set out replacement))) + out) + +(fn reduce-all [patterns replacements] + (accumulate [out patterns _ r (ipairs replacements)] + (fully-reduce-set out r))) + +;; TODO: this is a bad name +;; we find the longest possible string for any set of replacements +;; for greedy matching/sorting weirdness +(fn process-replacements [pattern replacements] + (accumulate [out pattern _ r (ipairs replacements)] + (string.gsub pattern r.id + (accumulate [x "" _ v (ipairs r.options)] + (if (< (utf8.len x) (utf8.len v)) + v + x))))) + +;; sort by length and then by unicode value (longest first) +(fn utf8-comp [a b replacements] + (local a (process-replacements a replacements)) + (local b (process-replacements b replacements)) + (if (not= (utf8.len a) (utf8.len b)) + (> (utf8.len a) (utf8.len b)) + (not= (utf8.codepoint a) (utf8.codepoint b)) + (< (utf8.codepoint a) (utf8.codepoint b)) + (= (utf8.len a) 1) + false + true + (utf8-comp (string.sub a (utf8.offset a 2)) + (string.sub b (utf8.offset b 2)) + []))) + +;; inner unescaped regex with ranges etc +;; we do the longest patterns first since yagpdb's regex engine wants to +;; match the first allowed OR clause +(fn inner-regex [final-set replacements] + (local things (icollect [_ v (ipairs final-set)] v)) + (table.sort things #(utf8-comp $1 $2 replacements)) + (var map (collect [_ v (ipairs final-set)] (values v true))) + (var replacement-map (collect [_ v (ipairs replacements)] + (values v.id v))) + (var out [""]) + (each [_ v (ipairs things)] + (when (or (< 1 (utf8.len v)) + (. replacement-map v)) + (table.insert out (.. v "|")))) + (table.insert out "[") + (each [_ v (ipairs things)] + (when (and (= 1 (utf8.len v)) (not (. replacement-map v))) + (let [min-codepoint (utf8.codepoint v) + max-codepoint (do + (var x min-codepoint) + (while (. map (utf8.char x)) + (set x (+ x 1))) + (- x 1))] + (if (< 2 (- max-codepoint min-codepoint)) + (do + (table.insert out (.. (utf8.char min-codepoint) + "-" + (utf8.char max-codepoint))) + (for [i min-codepoint max-codepoint] + (tset map (utf8.char i) false))) + (do + (table.insert out v) + (tset map v false)))))) + (table.insert out "]") + (table.concat out)) + +;;; yagpdb-specific stuff from here on out! + +(fn yagpdb-escape [str] + (-> str + (string.gsub "\\" "\\\\") + (string.gsub "\"" "\\\""))) + +(fn yagpdb-output [patterns replacements] + (var out []) + (table.insert out + (string.format + "{{ $regex := \"%s\" }}" + (yagpdb-escape + (inner-regex (reduce-all emoji replacements) replacements)))) + (icollect [_ replacement (pairs replacements) :into out] + (string.format "{{ $regex = joinStr \"%s\" (split $regex \"%s\") }}" + (yagpdb-escape replacement.regex) + (yagpdb-escape replacement.id))) + (table.concat out "\n")) + +(io.write (yagpdb-output emoji replacements)) -- cgit 1.3.0-6-gf8a5