about summary refs log tree commit diff
path: root/make-regex.fnl
blob: 6e73ede7a0ffedc4a9a700f0085622f7c39d57a9 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
(fn fv [...] (print ((require :fennel.view) ...)))

;; this program optimizes regex using a few fixed patterns
;; (that can be adjusted)
;; its input is a list of possible strings we want to match (specifically as a
;; unicode data file: read emoji-test.txt to see how that looks)

;; this program is specifically optimized to make a template for the YAGPDB
;; discord bot, which is why i'm trying to make it shorter in the first place.
;; there are a few mismatches between things that discord thinks are emoji and
;; things that unicode thinks are emoji, like regional indicators. i add these
;; outside of the program to save myself the thought.

;; a replacement has its id, which it is replaced with in the string
;; it has to be something that doesn't appear in the regex. thankfully, we
;; escape all of the ascii characters from the emoji, so only \, x, hex
;; numbers, and regex characters are unsfae. here we'll use capital letters
;; from g to w
;; then it has the Set of possibilities that we want to replace, and the regex
;; for those
;; we do a replacement when all of the possibilities can be collapsed into one
;; we also have
(local replacements
  ;; gender
  [{:id "g"
    :options ["\u{200d}\u{2640}"
              "\u{200d}\u{2642}"]
    :regex "\u{200d}[\u{2640}\u{2642}]"}
   ;; skin tone (including null)
   {:id "s"
    :options ["" "\u{1f3fb}" "\u{1f3fc}" "\u{1f3fd}" "\u{1f3fe}" "\u{1f3ff}"]
    :regex "[\u{1f3fb}\u{1f3fc}\u{1f3fd}\u{1f3fe}\u{1f3ff}]?"}
   ;; skin tone (excluding null)
   {:id "S"
    :options ["\u{1f3fb}" "\u{1f3fc}" "\u{1f3fd}" "\u{1f3fe}" "\u{1f3ff}"]
    :regex "[\u{1f3fb}\u{1f3fc}\u{1f3fd}\u{1f3fe}\u{1f3ff}]"}
   ;; variant selector thing
   {:id "v"
    :options ["" "\u{fe0f}"]
    :regex "\u{fe0f}?"}
   ;; man, woman
   ;; there's so much gender weirdness in unicode combinations that including
   ;; "person" here makes things longer :/
   {:id "p"
    :options ["\u{1f468}" "\u{1f469}"]
    :regex "[\u{1f468}\u{1f469}]"}
   ;; just person
   ;; we do this because yagpdb's interface large codepoints as 2 chars
   {:id "P"
    :options ["\u{1f9d1}"]
    :regex "\u{1f9d1}"}
   ])

(local emoji
  (icollect [line (io.lines "emoji-test.txt")]
    (do
      (var x "")
      (each [guy (string.gmatch line "[^ ]+") :until (. {";" true
                                                         "#" true}
                                                        (string.sub guy 1 1))]
        (let [num (tonumber guy 16)]
          (set x (.. x (if (< num 128)
                           (string.format "\\x%02x" num)
                           (utf8.char num))))))

      (if (> (length x) 0)
          x
          nil))))

;; find possible matches of options. returns list of prefix-suffix pairs to put
;; around possibilities etc
(fn find-matches [str replacement]
  (local out [])
  (each [_ option (ipairs replacement.options)]
    (var start 1)
    (while (string.find str option start true)
      (let [(x y) (string.find str option start true)]
        (set start (+ x 1))
        (table.insert out [(string.sub str 1 (- x 1)) (string.sub str (+ y 1))]))))
  out)

;; with a list of pairs and a list of options, generate a list of reductions,
;; ignoring whether they are actually valid
(fn iterate-options [pattern-pairs replacement]
  (icollect [_ pair (ipairs pattern-pairs)]
    {:pattern (.. (. pair 1) replacement.id (. pair 2))
     :matches
     (icollect [_ option (ipairs replacement.options)]
       (.. (. pair 1) option (. pair 2)))}))

;; reduce set of patterns via one replacement. only does one round, which is to
;; say that if there are multiple of the same replacement that can be done on
;; one string (i.e. two different skin tones in the kissing emoji sequences),
;; we'll have to run it twice. for this reason we also return a boolean saying
;; if we actually modified the data at all.
(fn reduce-set [patterns replacement]
  (var out [])
  (var dirty? false)
  (local map (collect [_ k (ipairs patterns)] (values k true)))
  (each [_ pattern (ipairs patterns)]
    (local possibilities (iterate-options (find-matches pattern replacement)
                                          replacement))
    (each [_ reduction (ipairs possibilities)]
      (when (accumulate [ok true _ v (ipairs reduction.matches)]
              (and ok (. map v)))
        (table.insert out reduction.pattern)
        (each [_ v (ipairs reduction.matches)]
          (tset map v false))))
    (if (. map pattern)
        (do
          (table.insert out pattern)
          (tset map pattern false))
        (set dirty? true)))
  (values out dirty?))

(fn fully-reduce-set [patterns replacement]
  (var out patterns)
  (var dirty? true)
  (while dirty?
    (set (out dirty?) (reduce-set out replacement)))
  out)

(fn reduce-all [patterns replacements]
  (accumulate [out patterns _ r (ipairs replacements)]
    (fully-reduce-set out r)))

;; TODO: this is a bad name
;; we find the longest possible string for any set of replacements
;; for greedy matching/sorting weirdness
(fn process-replacements [pattern replacements]
  (accumulate [out pattern _ r (ipairs replacements)]
    (string.gsub pattern r.id
                 (accumulate [x "" _ v (ipairs r.options)]
                   (if (< (utf8.len x) (utf8.len v))
                       v
                       x)))))

;; sort by length and then by unicode value (longest first)
(fn utf8-comp [a b replacements]
  (local a (process-replacements a replacements))
  (local b (process-replacements b replacements))
  (if (not= (utf8.len a) (utf8.len b))
      (> (utf8.len a) (utf8.len b))
      (not= (utf8.codepoint a) (utf8.codepoint b))
      (< (utf8.codepoint a) (utf8.codepoint b))
      (= (utf8.len a) 1)
      false
      true
      (utf8-comp (string.sub a (utf8.offset a 2))
                 (string.sub b (utf8.offset b 2))
                 [])))

;; inner unescaped regex with ranges etc
;; we do the longest patterns first since yagpdb's regex engine wants to
;; match the first allowed OR clause
(fn inner-regex [final-set replacements]
  (local things (icollect [_ v (ipairs final-set)] v))
  (table.sort things #(utf8-comp $1 $2 replacements))
  (var map (collect [_ v (ipairs final-set)] (values v true)))
  (var replacement-map (collect [_ v (ipairs replacements)]
                         (values v.id v)))
  (var out [""])
  (each [_ v (ipairs things)]
    (when (or (< 1 (utf8.len v))
              (. replacement-map v))
      (table.insert out (.. v "|"))))
  (table.insert out "[")
  (each [_ v (ipairs things)]
    (when (and (= 1 (utf8.len v)) (not (. replacement-map v)))
      (let [min-codepoint (utf8.codepoint v)
            max-codepoint (do
                            (var x min-codepoint)
                            (while (. map (utf8.char x))
                              (set x (+ x 1)))
                            (- x 1))]
        (if (< 2 (- max-codepoint min-codepoint))
            (do
              (table.insert out (.. (utf8.char min-codepoint)
                                    "-"
                                    (utf8.char max-codepoint)))
              (for [i min-codepoint max-codepoint]
                (tset map (utf8.char i) false)))
            (do
              (table.insert out v)
              (tset map v false))))))
  (table.insert out "]")
  (table.concat out))

;;; yagpdb-specific stuff from here on out!

(fn yagpdb-escape [str]
  (-> str
      (string.gsub "\\" "\\\\")
      (string.gsub "\"" "\\\"")))

(fn yagpdb-output [patterns replacements]
  (var out [])
  (table.insert out
                (string.format
                  "{{ $regex := \"%s\" }}"
                  (yagpdb-escape
                    (inner-regex (reduce-all emoji replacements) replacements))))
  (icollect [_ replacement (pairs replacements) :into out]
    (string.format "{{ $regex = joinStr \"%s\" (split $regex \"%s\") }}"
                   (yagpdb-escape replacement.regex)
                   (yagpdb-escape replacement.id)))
  (table.concat out "\n"))

(io.write (yagpdb-output emoji replacements))