From 74fb1194e9afd9ee6c86c49d41796ceaedb47dff Mon Sep 17 00:00:00 2001 From: Tienson Qin Date: Tue, 29 Jul 2025 00:08:46 +0800 Subject: [PATCH] adjust weights for keyword search and semantic search --- src/main/frontend/common/search_fuzzy.cljs | 43 +++++++++++----------- src/main/frontend/worker/embedding.cljs | 7 ++-- src/main/frontend/worker/search.cljs | 41 +++++++++++++-------- 3 files changed, 51 insertions(+), 40 deletions(-) diff --git a/src/main/frontend/common/search_fuzzy.cljs b/src/main/frontend/common/search_fuzzy.cljs index 94935d424c..d9f5de6875 100644 --- a/src/main/frontend/common/search_fuzzy.cljs +++ b/src/main/frontend/common/search_fuzzy.cljs @@ -1,8 +1,8 @@ (ns frontend.common.search-fuzzy "fuzzy search. Used by frontend and worker namespaces" - (:require [clojure.string :as string] + (:require ["remove-accents" :as removeAccents] [cljs-bean.core :as bean] - ["remove-accents" :as removeAccents])) + [clojure.string :as string])) (def MAX-STRING-LENGTH 1000.0) @@ -28,10 +28,19 @@ (/ (- maxed mined) maxed))))) +(defn search-normalize + "Normalize string for searching (loose)" + [s remove-accents?] + (when s + (let [normalize-str (.normalize (string/lower-case s) "NFKC")] + (if remove-accents? + (removeAccents normalize-str) + normalize-str)))) + (defn score [oquery ostr] - (let [query (clean-str oquery) - original-s (clean-str ostr)] + (let [query (-> (clean-str oquery) (search-normalize true)) + original-s (-> (clean-str ostr) (search-normalize true))] (loop [q (seq (char-array query)) s (seq (char-array original-s)) mult 1 @@ -68,24 +77,14 @@ (dec idx) (- score' 0.1))))))) -(defn search-normalize - "Normalize string for searching (loose)" - [s remove-accents?] - (when s - (let [normalize-str (.normalize (string/lower-case s) "NFKC")] - (if remove-accents? - (removeAccents normalize-str) - normalize-str)))) - (defn fuzzy-search [data query & {:keys [limit extract-fn] :or {limit 20}}] - (let [query (search-normalize query true)] - (->> (take limit - (sort-by :score (comp - compare) - (filter #(< 0 (:score %)) - (for [item data] - (let [s (str (if extract-fn (extract-fn item) item))] - {:data item - :score (score query (search-normalize s true))}))))) - (map :data)))) + (->> (take limit + (sort-by :score (comp - compare) + (filter #(< 0 (:score %)) + (for [item data] + (let [s (str (if extract-fn (extract-fn item) item))] + {:data item + :score (score query s)}))))) + (map :data))) diff --git a/src/main/frontend/worker/embedding.cljs b/src/main/frontend/worker/embedding.cljs index 1ab93da469..ce5a91a820 100644 --- a/src/main/frontend/worker/embedding.cljs +++ b/src/main/frontend/worker/embedding.cljs @@ -257,10 +257,11 @@ (js->clj (c.m/> (map vector distances neighbors) (keep (fn [[distance label]] - (when-not (or (js/isNaN distance) (> distance 0.5)) + (when-not (or (js/isNaN distance) (> distance 0.35)) (when-let [block (d/entity @conn label)] - {:block block - :distance distance}))))))))))) + (when (:block/title block) + {:block block + :distance distance})))))))))))) (def ^:private vector-search-state-flow (m/eduction diff --git a/src/main/frontend/worker/search.cljs b/src/main/frontend/worker/search.cljs index 41017b9c33..105ada5b31 100644 --- a/src/main/frontend/worker/search.cljs +++ b/src/main/frontend/worker/search.cljs @@ -25,14 +25,21 @@ ;; Configuration for re-ranking (def config - {:keyword-weight 0.4 - :semantic-weight 0.6}) + {:keyword-weight 0.9 + :semantic-weight 0.1}) + +(defn- log-score + [score] + (if (> score 2) + (js/Math.log score) + score)) ;; Normalize scores to [0, 1] range using min-max normalization (defn normalize-score [score min-score max-score] (if (= min-score max-score) 0.0 - (let [normalized (/ (- score min-score) (- max-score min-score))] + (let [normalized (/ (log-score (- score min-score)) + (log-score (- max-score min-score)))] (max 0.0 (min 1.0 normalized))))) (defn- add-blocks-fts-triggers! @@ -184,15 +191,16 @@ DROP TRIGGER IF EXISTS blocks_au; :bind bind :rowMode "array"})) blocks (bean/->clj result)] - (map (fn [block] - (let [[id page title rank snippet] (if enable-snippet? - (update block 4 get-snippet-result) - block)] - {:id id - :keyword-score (when rank (Math/abs rank)) - :page page - :title title - :snippet snippet})) blocks)) + (keep (fn [block] + (let [[id page title rank snippet] (if enable-snippet? + (update block 4 get-snippet-result) + block)] + (when title + {:id id + :keyword-score (+ (fuzzy/score q title) (js/Math.abs rank)) + :page page + :title title + :snippet snippet}))) blocks)) (catch :default e (prn :debug "Search blocks failed: ") (js/console.error e)))) @@ -320,9 +328,9 @@ DROP TRIGGER IF EXISTS blocks_au; (* (:semantic-weight config) s-score) (cond (ldb/page? block) - 1 + 0.001 (:block/tags block) - 0.02 + 0.0005 :else 0))] (merge result @@ -368,7 +376,10 @@ DROP TRIGGER IF EXISTS blocks_au; non-match-result (when (and (not page-only?) non-match-input) (search-blocks-aux search-db non-match-sql q non-match-input page limit enable-snippet?)) ;; fuzzy is too slow for large graphs - fuzzy-result (when-not (or page large-graph?) (fuzzy-search repo @conn q option)) + fuzzy-result (when-not (or page large-graph?) + (->> (fuzzy-search repo @conn q option) + (map (fn [result] + (assoc result :keyword-score (fuzzy/score q (:title result))))))) semantic-search-result* (m/? (embedding/task--search repo q 10)) semantic-search-result (->> semantic-search-result* (map (fn [{:keys [block distance]}]