feat: enhance snippet highlighting and windowing for multi-term queries

This commit is contained in:
Mega Yu
2026-02-10 09:45:56 +08:00
parent 48f0a5ba88
commit 3f8be4f3ed
2 changed files with 148 additions and 64 deletions

View File

@@ -137,10 +137,13 @@ DROP TRIGGER IF EXISTS blocks_au;
(.exec db sql)))
;; NOTE(review): this span comes from a rendered diff, so `snippet-prefix-length`
;; and `snippet-ellipsis` each appear twice (once as the removed line, once as
;; the added line).
;; Length limit for snippets: passed to `snippet-by` for unhighlighted snippets
;; and used as the threshold above which a windowed snippet is built.
(def ^:private max-snippet-length 250)
;; Number of leading characters of the text kept as the snippet prefix.
(def ^:private snippet-prefix-length 50)
;; Two matches are shown in one merged window when the distance from the start
;; of the first to the end of the second is at most this many characters.
(def ^:private snippet-merge-distance 200)
;; Opening and closing markers wrapped around a highlighted term.
(def ^:private snippet-highlight-start "$pfts_2lqh>$")
(def ^:private snippet-highlight-end "$<pfts_2lqh$")
(def ^:private snippet-prefix-length 50)
(def ^:private snippet-ellipsis "\u00A0\u00A0\u00A0...\u00A0\u00A0\u00A0")
;; Separator placed between the prefix and a window, and between two windows.
(def ^:private snippet-ellipsis "\u00A0\u00A0\u00A0...\u00A0\u00A0\u00A0") ;; \u00A0 is No-Break Space (NBSP)
;; Query tokens removed by the term splitter (compared after lower-casing).
(def ^:private query-boolean-operators #{"and" "or" "not" "|" "&"})
;; Punctuation accepted as a snippet start boundary: ASCII , . ; ! ? plus the
;; fullwidth comma, semicolon, exclamation mark and question mark
;; (\uFF0C \uFF1B \uFF01 \uFF1F), the ideographic full stop (\u3002) and the
;; ideographic comma (\u3001).
(def ^:private query-break-chars #{\, \. \; \! \? \uFF0C \u3002 \uFF1B \uFF01 \uFF1F \u3001}) ;; , . ; ! ?
(defn- snippet-by
[content length]
@@ -149,8 +152,7 @@ DROP TRIGGER IF EXISTS blocks_au;
;; Returns `snippet` unchanged when it contains a highlight marker; otherwise
;; returns `(snippet-by snippet max-snippet-length)`.
;; NOTE(review): this span comes from a rendered diff. The `flag-highlight`
;; binding and the `includes?` test that uses it look like the removed lines;
;; the test against `snippet-highlight-start` looks like the added line.
(defn- get-snippet-result
[snippet]
(let [;; Cut snippet to limited size chars for non-matched results
flag-highlight (str snippet-highlight-start " ")
snippet (if (string/includes? snippet flag-highlight)
snippet (if (string/includes? snippet snippet-highlight-start)
snippet
(snippet-by snippet max-snippet-length))]
snippet))
@@ -159,89 +161,121 @@ DROP TRIGGER IF EXISTS blocks_au;
[q]
(->> (string/split (string/trim q) #"\s+")
(remove string/blank?)
(remove #(contains? #{"and" "or" "not"} (string/lower-case %)))))
(remove #(contains? query-boolean-operators (string/lower-case %)))))
;; Returns true when `ch` is whitespace (matches the regex `\s`) or is a member
;; of the punctuation set below.
;; NOTE(review): several character literals in the set appear as a bare
;; backslash followed by a space. That looks like fullwidth punctuation lost in
;; rendering; verify against the real file before relying on this text.
(defn- break-char?
[ch]
(or (boolean (re-find #"\s" (str ch)))
(contains? #{\, \. \! \? \ \。 \ \ \、} ch)))
(defn- find-matches
  "Locates the first case-insensitive occurrence of each term in `text`.

  Returns a seq of maps `{:term :idx :len :end}` sorted by `:idx` with
  `sort-fn` (ascending `<` when omitted). `:len` is the character count of the
  term and `:end` is `(+ idx len)`. Terms that do not occur are omitted.

  Blank terms, and terms equal (ignoring case) to an earlier term, are skipped
  so the same span is never reported twice; a caller that inserts markers per
  match would otherwise wrap one span two times. Returns an empty seq when
  `text` is nil."
  ([text terms]
   (find-matches text terms <))
  ([text terms sort-fn]
   (if (nil? text)
     ()
     (let [;; Lower-case the haystack once instead of once per term.
           text-lc (string/lower-case text)
           ;; Keep input order while dropping blank and case-insensitive
           ;; duplicate terms; each entry is [original-term lower-cased-term].
           unique-terms (first
                         (reduce (fn [[kept seen :as state] term]
                                   (if (string/blank? term)
                                     state
                                     (let [term-lc (string/lower-case term)]
                                       (if (contains? seen term-lc)
                                         state
                                         [(conj kept [term term-lc])
                                          (conj seen term-lc)]))))
                                 [[] #{}]
                                 terms))]
       (->> unique-terms
            (keep (fn [[term term-lc]]
                    (when-let [idx (string/index-of text-lc term-lc)]
                      (let [len (count term)]
                        {:term term :idx idx :len len :end (+ idx len)}))))
            (sort-by :idx sort-fn))))))
;; Scans `s` backwards from index `(dec end)` down to `start` (inclusive) and
;; returns the index of the first break character found, or nil when the range
;; contains none.
;; NOTE(review): this span comes from a rendered diff. The `break-char?` test
;; looks like the removed line and the `query-break-chars` membership test
;; looks like the added line; only one of the two `(if ...` lines exists in
;; the real file.
(defn- find-break-before
[s start end]
(loop [i (dec end)]
(if (< i start)
nil
(if (break-char? (.charAt s i))
(if (contains? query-break-chars (.charAt s i))
i
(recur (dec i))))))
;; Selects a window of `text` around the match at `idx` (length `match-len`).
;; NOTE(review): this span comes from a rendered diff, so two parameter vectors
;; and two return values are shown. The 3-arg vector, the `prefix-len` /
;; `window-len` / `prefix` bindings and the returned map look like the removed
;; version; the 4-arg vector (window length supplied by the caller) and the
;; bare `snippet` return look like the added version. Confirm against the file.
(defn- snippet-window-around
[text idx match-len]
[text idx match-len window-len]
(let [text-len (count text)
prefix-len (min snippet-prefix-length text-len)
window-len (max 0 (- max-snippet-length prefix-len (count snippet-ellipsis)))
;; Start roughly half a window before the match, clamped to the text start.
window-start (max 0 (- idx (quot window-len 2)))
;; The window must reach at least the end of the match.
min-end (min text-len (+ idx match-len))
window-end (min text-len (max min-end (+ window-start window-len)))
;; Prefer starting right after the nearest break character before the match.
break-idx (find-break-before text window-start idx)
snippet-start (min window-end (max window-start (if break-idx (inc break-idx) window-start)))
snippet-end (min text-len (+ snippet-start window-len))
prefix (subs text 0 prefix-len)
snippet (subs text snippet-start snippet-end)]
{:prefix prefix
:snippet snippet
:snippet-start snippet-start
:snippet-end snippet-end
:prefix-len prefix-len}))
;; (prn :debug {:snippet snippet :snippet-start snippet-start :snippet-end snippet-end})
snippet))
;; NOTE(review): this span comes from a rendered diff that interleaves two
;; definitions. `find-best-match` (its header and the `(fn [best term] ...)`
;; reducer ending in `nil terms`) looks like the removed code; `highlight-terms`
;; (its header and the `(fn [acc {:keys [idx len]}] ...)` reducer ending in
;; `clipped-text matches`) looks like the added code. Confirm against the file.
;;
;; find-best-match: returns `{:term :idx}` for the term whose first
;; case-insensitive occurrence is earliest in `text`, preferring the longer
;; term on equal index; nil when no term occurs.
(defn- find-best-match
[text terms]
(let [text-lc (string/lower-case text)]
;; highlight-terms: clips `text` to at most `max-len` characters, then wraps
;; each match in `snippet-highlight-start` / `snippet-highlight-end`. Matches are
;; requested in descending index order (`>`) so that inserting markers for a
;; later match does not shift the indices of earlier ones.
;; NOTE(review): overlapping matches do not appear to be handled; verify.
(defn- highlight-terms
[text terms max-len]
(let [clipped-text (if (> (count text) max-len)
(subs text 0 max-len)
text)
matches (find-matches clipped-text terms >)]
(reduce
(fn [best term]
(let [term-lc (string/lower-case term)
idx (string/index-of text-lc term-lc)]
(cond
(nil? idx) best
(nil? best) {:term term :idx idx}
(< idx (:idx best)) {:term term :idx idx}
(and (= idx (:idx best))
(> (count term) (count (:term best))))
{:term term :idx idx}
:else best)))
nil
terms)))
(fn [acc {:keys [idx len]}]
(str (subs acc 0 idx)
snippet-highlight-start
(subs acc idx (+ idx len))
snippet-highlight-end
(subs acc (+ idx len))))
clipped-text
matches)))
(defn enough-highlighted?
  "Returns true when `text` contains at least `num` occurrences of
  `snippet-highlight-start`, false otherwise.

  Scanning stops as soon as the `num`-th marker is found. A `num` of zero or
  less is trivially satisfied and yields true; a nil `text` with a positive
  `num` yields false."
  [text num]
  (cond
    ;; Nothing is required, so any text (even one without markers) qualifies.
    (not (pos? num)) true
    (nil? text) false
    :else
    (let [marker-len (count snippet-highlight-start)]
      (loop [from 0
             found 0]
        (if-let [idx (string/index-of text snippet-highlight-start from)]
          (let [found (inc found)]
            (if (>= found num)
              true
              ;; Resume just past this marker so it is not counted again.
              (recur (+ idx marker-len) found)))
          false)))))
;; NOTE(review): this span comes from a rendered diff, so removed and added
;; lines are interleaved. The `full-text` binding, the `string/includes?` cond
;; clause and the `(if-let [{:keys [term idx]} ... find-best-match ...` branch
;; look like the removed version; the `text` / `terms` / `expect-highlight-num`
;; bindings, the `enough-highlighted?` clause and the `(let [text (-> text ...`
;; branch look like the added version. Confirm against the real file.
(defn ensure-highlighted-snippet
"Ensure snippet includes SQLite-style highlight markers. Uses `title` as a fallback
when snippet is missing or unhighlighted."
[snippet title q]
(let [base (or snippet title)
full-text (or title snippet)]
text (or title snippet)
terms (query->terms q)
;; Expect one highlight per query term, capped at two.
expect-highlight-num (if (> (count terms) 2) 2 (count terms))]
;; (prn :debug {:snippet snippet :title title :q q :terms terms})
(cond
(string/blank? base) base
(string/blank? q) base
(string/includes? base snippet-highlight-start) base
(enough-highlighted? base expect-highlight-num) base
:else
(if-let [{:keys [term idx]} (and full-text (find-best-match full-text (query->terms q)))]
(let [use-window? (and (> (count full-text) max-snippet-length)
(>= idx max-snippet-length))
{:keys [prefix snippet snippet-start]} (when use-window?
(snippet-window-around full-text idx (count term)))
[target-text offset]
(if use-window?
[(str prefix snippet-ellipsis snippet)
(+ (count prefix) (count snippet-ellipsis) (- idx snippet-start))]
[base idx])
target-end (+ offset (count term))]
(if (and offset (<= 0 offset) (<= target-end (count target-text)))
(str (subs target-text 0 offset)
snippet-highlight-start
(subs target-text offset target-end)
snippet-highlight-end
(subs target-text target-end))
target-text))
base))))
;; Strip any existing markers so match indices refer to the plain text.
(let [text (-> text
(string/replace snippet-highlight-start "")
(string/replace snippet-highlight-end ""))
matches (and text (find-matches text terms))]
(if (seq matches)
(let [prefix (subs text 0 (min snippet-prefix-length (count text)))
;; Budget for a single window: max length minus the prefix and one ellipsis.
merged-window-len (max 0 (- max-snippet-length snippet-prefix-length (count snippet-ellipsis)))
;; Budget per window when two are shown: half of what remains after the
;; prefix and two ellipses.
split-window-len (max 0 (quot (- max-snippet-length snippet-prefix-length (* 2 (count snippet-ellipsis))) 2))
match-terms (map :term matches)
;; Only the two earliest matches (ascending :idx) are considered below.
{:keys [term idx len end]} (first matches)
match-2 (second matches)
term-2 (:term match-2)
idx-2 (:idx match-2)
len-2 (:len match-2)
end-2 (:end match-2)
;; Window only when the text exceeds the max length and the last considered
;; match ends at or beyond that length.
use-window? (and (> (count text) max-snippet-length)
(>= (or end-2 end) max-snippet-length))
;; The two matches are close when the span from the start of the first to the
;; end of the second fits within `snippet-merge-distance`.
close? (and end-2 (<= (- end-2 idx) snippet-merge-distance))
use-merge? (or (nil? idx-2) close?)]
;; (prn :debug {:matches matches :use-window? use-window? :close? close? :use-merge? use-merge?})
(if-not use-window?
(highlight-terms text terms max-snippet-length)
(if use-merge?
;; One window around the first match; every matched term is highlighted in it.
(let [snippet (snippet-window-around text idx len merged-window-len)]
(str prefix snippet-ellipsis (highlight-terms snippet match-terms merged-window-len)))
(let [snippet (snippet-window-around text idx len split-window-len)
snippet-2 (snippet-window-around text idx-2 len-2 split-window-len)
;; When the first match starts inside the prefix it is highlighted there and
;; no separate first window is emitted.
;; NOTE(review): a term straddling the prefix boundary would not be found by
;; `highlight-terms` on the prefix alone; verify.
prefix-hit? (< idx (count prefix))
highlighted-prefix (if prefix-hit?
(highlight-terms prefix [term] snippet-prefix-length)
prefix)]
(if prefix-hit?
(str highlighted-prefix
snippet-ellipsis
(highlight-terms snippet-2 [term-2] split-window-len))
(str highlighted-prefix
snippet-ellipsis
(highlight-terms snippet [term] split-window-len)
snippet-ellipsis
(highlight-terms snippet-2 [term-2] split-window-len)))))))
base)))))
(defn- get-match-input
[q]

View File

@@ -7,8 +7,12 @@
(testing "adds highlight markers for first matching term"
(is (= "今天学习$pfts_2lqh>$中文$<pfts_2lqh$"
(search/ensure-highlighted-snippet nil "今天学习中文" "中文")))
(is (= "$pfts_2lqh>$今天$<pfts_2lqh$学习$pfts_2lqh>$中文$<pfts_2lqh$"
(search/ensure-highlighted-snippet "今天学习$pfts_2lqh>$中文$<pfts_2lqh$" nil "今天 中文")))
(is (= "Hello $pfts_2lqh>$World$<pfts_2lqh$"
(search/ensure-highlighted-snippet nil "Hello World" "world")))))
(search/ensure-highlighted-snippet nil "Hello World" "world")))
(is (= "$pfts_2lqh>$Hello$<pfts_2lqh$ Clojure $pfts_2lqh>$World$<pfts_2lqh$"
(search/ensure-highlighted-snippet "$pfts_2lqh>$Hello$<pfts_2lqh$ Clojure World" nil "hello world")))))
(deftest ensure-highlighted-snippet-keeps-existing
(testing "keeps snippet when already highlighted"
@@ -22,16 +26,62 @@
;; Checks that a long text with a single late match yields
;; "<first 50 chars><ellipsis><window containing the highlighted term>".
;; NOTE(review): this span comes from a rendered diff. The `(repeat 260 ...)`
;; prefixes and the `(repeat 50 ...)` assertions look like the removed lines;
;; the punctuated prefixes and the `(subs prefix 0 50)` assertions look like the
;; added lines. The nested `(apply str (apply str ...))` in the first prefix is
;; redundant but harmless.
(deftest ensure-highlighted-snippet-windowed
(testing "keeps prefix and shows window around match"
(let [prefix (apply str (repeat 260 "甲"))
(let [prefix (apply str (apply str (repeat 10 "甲乙丙丁戊己庚辛壬癸,子丑寅卯辰巳午未申酉戌亥。")))
text (str prefix "Clojure是Lisp编程语言在Java平台上的现代、动态及函数式方言。")
result (search/ensure-highlighted-snippet nil text "函数式")
ellipsis "\u00A0\u00A0\u00A0...\u00A0\u00A0\u00A0"]
(is (string/starts-with? result (str (apply str (repeat 50 "甲")) ellipsis)))
(is (re-find #"\u00A0\u00A0\u00A0\.\.\.\u00A0\u00A0\u00A0动态及\$pfts_2lqh>\$函数式\$<pfts_2lqh\$" result))
(is (not (re-find #"\u00A0\u00A0\u00A0\.\.\.\u00A0\u00A0\u00A0函数式" result))))
(let [prefix (apply str (repeat 260 "A"))
(is (string/starts-with? result (str (subs prefix 0 50) ellipsis)))
;; The window starts right after the break character preceding the match.
(is (re-find #"\u00A0\u00A0\u00A0\.\.\.\u00A0\u00A0\u00A0动态及\$pfts_2lqh>\$函数式\$<pfts_2lqh\$" result)))
(let [prefix (apply str (repeat 10 "ABCDEFG, HIJKLMN, OPQRST, UVWXYZ."))
text (str prefix "Clojure is a dynamic and functional dialect of the programming language Lisp on the Java platform.")
result (search/ensure-highlighted-snippet nil text "functional")
ellipsis "\u00A0\u00A0\u00A0...\u00A0\u00A0\u00A0"]
(is (string/starts-with? result (str (apply str (repeat 50 "A")) ellipsis)))
(is (re-find #"\u00A0\u00A0\u00A0\.\.\.\u00A0\u00A0\u00A0\$pfts_2lqh>\$functional\$<pfts_2lqh\$" result)))))
(is (string/starts-with? result (str (subs prefix 0 50) ellipsis)))
(is (re-find #"\u00A0\u00A0\u00A0\.\.\.\u00A0\u00A0\u00A0Clojure is a dynamic and \$pfts_2lqh>\$functional\$<pfts_2lqh\$" result)))))
(deftest ensure-highlighted-snippet-multi-term-merged
  ;; Both cases put two query terms close together near the end of a long text
  ;; and expect: the 50-char prefix, exactly one ellipsis, and a single window
  ;; in which both terms carry highlight markers.
  (testing "two terms within distance merge into one window"
    (let [ellipsis "\u00A0\u00A0\u00A0...\u00A0\u00A0\u00A0"
          one-ellipsis #"\u00A0\u00A0\u00A0\.\.\.\u00A0\u00A0\u00A0"
          two-ellipses #"\u00A0\u00A0\u00A0\.\.\.\u00A0\u00A0\u00A0.*\u00A0\u00A0\u00A0\.\.\.\u00A0\u00A0\u00A0"]
      ;; CJK text
      (let [filler (string/join (repeat 20 "甲乙丙丁戊己庚辛壬癸,子丑寅卯辰巳午未申酉戌亥。"))
            body "Clojure是Lisp编程语言在Java平台上的现代、动态及函数式方言。"
            result (search/ensure-highlighted-snippet nil (str filler body) "编程 函数式")
            expected-head (str (subs filler 0 50) ellipsis)]
        (is (string/starts-with? result expected-head))
        (is (re-find #"\$pfts_2lqh>\$编程\$<pfts_2lqh\$语言在Java平台上的现代、动态及\$pfts_2lqh>\$函数式\$<pfts_2lqh\$" result))
        (is (re-find one-ellipsis result))
        (is (nil? (re-find two-ellipses result))))
      ;; English text
      (let [filler (string/join (repeat 20 "ABCDEFG, HIJKLMN, OPQRST, UVWXYZ."))
            body "Clojure is a dynamic and functional dialect of the programming language Lisp on the Java platform."
            result (search/ensure-highlighted-snippet nil (str filler body) "dynamic language")
            expected-head (str (subs filler 0 50) ellipsis)]
        (is (string/starts-with? result expected-head))
        (is (re-find #"\$pfts_2lqh>\$dynamic\$<pfts_2lqh\$ and functional dialect of the programming \$pfts_2lqh>\$language\$<pfts_2lqh\$" result))
        (is (re-find one-ellipsis result))
        (is (nil? (re-find two-ellipses result)))))))
(deftest ensure-highlighted-snippet-multi-term-split
  ;; The `^:focus` metadata and the debug `prn` that were left on this test are
  ;; removed: focus metadata is meant for local runs only, and the printed case
  ;; is now checked with assertions instead.
  (testing "short text is highlighted in place without windowing"
    ;; The text is shorter than the maximum snippet length, so both terms are
    ;; highlighted where they occur and no ellipsis is inserted.
    (let [text "Logseq starts as a PKM that works directly with plain-text files, a lot of users believe that it's the best format for both longevity and cooperation with other editors. They might be worried about the database version."
          result (search/ensure-highlighted-snippet nil text "Logseq database")
          ellipsis "\u00A0\u00A0\u00A0...\u00A0\u00A0\u00A0"]
      (is (string/starts-with? result "$pfts_2lqh>$Logseq$<pfts_2lqh$ starts as a PKM"))
      (is (string/includes? result "the $pfts_2lqh>$database$<pfts_2lqh$ version."))
      (is (not (string/includes? result ellipsis)))))
  (testing "two terms far apart split into two windows"
    ;; First term sits inside the prefix: it is highlighted there and only one
    ;; ellipsis (before the second window) is expected.
    (let [filler (apply str (repeat 20 "甲乙丙丁戊己庚辛壬癸,子丑寅卯辰巳午未申酉戌亥。"))
          text (str "君不见黄河之水天上来," filler "奔流到海不复回")
          result (search/ensure-highlighted-snippet nil text "黄河 到海")]
      (is (string/starts-with? result "君不见$pfts_2lqh>$黄河$<pfts_2lqh$之水天上来,"))
      (is (re-find #"\u00A0\u00A0\u00A0\.\.\.\u00A0\u00A0\u00A0奔流\$pfts_2lqh>\$到海\$<pfts_2lqh\$" result))
      (is (not (re-find #"\u00A0\u00A0\u00A0\.\.\.\u00A0\u00A0\u00A0.*\u00A0\u00A0\u00A0\.\.\.\u00A0\u00A0\u00A0" result))))
    ;; Both terms lie beyond the prefix: prefix, then two windows, each
    ;; introduced by an ellipsis.
    (let [prefix (apply str (repeat 20 "甲乙丙丁戊己庚辛壬癸,子丑寅卯辰巳午未申酉戌亥。"))
          far (apply str (repeat 20 "甲乙丙丁戊己庚辛壬癸,子丑寅卯辰巳午未申酉戌亥。"))
          text (str prefix "仙人抚我顶," far "结发受长生")
          result (search/ensure-highlighted-snippet nil text "仙人 长生")
          ellipsis "\u00A0\u00A0\u00A0...\u00A0\u00A0\u00A0"]
      (is (string/starts-with? result (str (subs prefix 0 50) ellipsis)))
      (is (re-find #"\u00A0\u00A0\u00A0\.\.\.\u00A0\u00A0\u00A0\$pfts_2lqh>\$仙人\$<pfts_2lqh\$" result))
      (is (re-find #"\u00A0\u00A0\u00A0\.\.\.\u00A0\u00A0\u00A0结发受\$pfts_2lqh>\$长生\$<pfts_2lqh\$" result))
      (is (re-find #"\u00A0\u00A0\u00A0\.\.\.\u00A0\u00A0\u00A0.*\u00A0\u00A0\u00A0\.\.\.\u00A0\u00A0\u00A0" result)))
    (let [prefix (apply str (repeat 20 "ABCDEFG, HIJKLMN, OPQRST, UVWXYZ."))
          far (apply str (repeat 20 "ABCDEFG, HIJKLMN, OPQRST, UVWXYZ."))
          text (str prefix "life it seems will fade away, " far "now i will just say good-bye")
          result (search/ensure-highlighted-snippet nil text "fade say")
          ellipsis "\u00A0\u00A0\u00A0...\u00A0\u00A0\u00A0"]
      (is (string/starts-with? result (str (subs prefix 0 50) ellipsis)))
      ;; Dots are escaped so the pattern matches a literal "..." rather than
      ;; any three characters, consistent with the other assertions.
      (is (re-find #"\u00A0\u00A0\u00A0\.\.\.\u00A0\u00A0\u00A0life it seems will \$pfts_2lqh>\$fade\$<pfts_2lqh\$" result))
      (is (re-find #"\u00A0\u00A0\u00A0\.\.\.\u00A0\u00A0\u00A0now i will just \$pfts_2lqh>\$say\$<pfts_2lqh\$" result))
      (is (re-find #"\u00A0\u00A0\u00A0\.\.\.\u00A0\u00A0\u00A0.*\u00A0\u00A0\u00A0\.\.\.\u00A0\u00A0\u00A0" result)))))