adjust weights for keyword search and semantic search

This commit is contained in:
Tienson Qin
2025-07-29 00:08:46 +08:00
parent 6e95bb1801
commit 74fb1194e9
3 changed files with 51 additions and 40 deletions

View File

@@ -1,8 +1,8 @@
(ns frontend.common.search-fuzzy
"fuzzy search. Used by frontend and worker namespaces"
(:require [clojure.string :as string]
(:require ["remove-accents" :as removeAccents]
[cljs-bean.core :as bean]
["remove-accents" :as removeAccents]))
[clojure.string :as string]))
(def MAX-STRING-LENGTH 1000.0)
@@ -28,10 +28,19 @@
(/ (- maxed mined)
maxed)))))
(defn search-normalize
  "Loose normalization of a string for search matching.
  Lower-cases `s`, applies Unicode NFKC normalization and, when
  `remove-accents?` is truthy, additionally passes the result through
  `removeAccents`. Returns nil when `s` is nil or false."
  [s remove-accents?]
  (when s
    (cond-> (.normalize (string/lower-case s) "NFKC")
      ;; accent removal is optional so callers can keep accents when needed
      remove-accents? (removeAccents))))
(defn score
[oquery ostr]
(let [query (clean-str oquery)
original-s (clean-str ostr)]
(let [query (-> (clean-str oquery) (search-normalize true))
original-s (-> (clean-str ostr) (search-normalize true))]
(loop [q (seq (char-array query))
s (seq (char-array original-s))
mult 1
@@ -68,24 +77,14 @@
(dec idx)
(- score' 0.1)))))))
(defn search-normalize
"Normalize string for searching (loose).
Lower-cases `s`, applies Unicode NFKC normalization and, when
`remove-accents?` is truthy, also passes the result through `removeAccents`.
Returns nil when `s` is nil or false."
[s remove-accents?]
;; nil/false input short-circuits to nil instead of throwing in lower-case
(when s
(let [normalize-str (.normalize (string/lower-case s) "NFKC")]
(if remove-accents?
;; accent removal runs on the already lower-cased, NFKC-normalized text
(removeAccents normalize-str)
normalize-str))))
(defn fuzzy-search
[data query & {:keys [limit extract-fn]
:or {limit 20}}]
(let [query (search-normalize query true)]
(->> (take limit
(sort-by :score (comp - compare)
(filter #(< 0 (:score %))
(for [item data]
(let [s (str (if extract-fn (extract-fn item) item))]
{:data item
:score (score query (search-normalize s true))})))))
(map :data))))
(->> (take limit
(sort-by :score (comp - compare)
(filter #(< 0 (:score %))
(for [item data]
(let [s (str (if extract-fn (extract-fn item) item))]
{:data item
:score (score query s)})))))
(map :data)))

View File

@@ -257,10 +257,11 @@
(js->clj (c.m/<? (.search infer-worker repo query-string nums-neighbors)) :keywordize-keys true))]
(->> (map vector distances neighbors)
(keep (fn [[distance label]]
(when-not (or (js/isNaN distance) (> distance 0.5))
(when-not (or (js/isNaN distance) (> distance 0.35))
(when-let [block (d/entity @conn label)]
{:block block
:distance distance})))))))))))
(when (:block/title block)
{:block block
:distance distance}))))))))))))
(def ^:private vector-search-state-flow
(m/eduction

View File

@@ -25,14 +25,21 @@
;; Configuration for re-ranking
(def config
{:keyword-weight 0.4
:semantic-weight 0.6})
{:keyword-weight 0.9
:semantic-weight 0.1})
(defn- log-score
  "Dampens large scores logarithmically while leaving small ones untouched.
  Scores up to 2 are returned unchanged; above 2 the result grows as
  2 + ln(score / 2), so the function is continuous at the threshold and
  strictly increasing. This keeps the ordering of the raw scores intact
  when the result is used for normalization."
  [score]
  (if (> score 2)
    ;; A bare (js/Math.log score) would drop below 2 for any score in
    ;; (2, e^2), ranking e.g. 2.5 lower than 2. Anchoring the log at the
    ;; threshold avoids that jump.
    (+ 2 (js/Math.log (/ score 2)))
    score))
;; Normalize scores to the [0, 1] range using min-max normalization; the
;; score and range differences are damped with `log-score` before dividing.
(defn normalize-score [score min-score max-score]
(if (= min-score max-score)
0.0
(let [normalized (/ (- score min-score) (- max-score min-score))]
(let [normalized (/ (log-score (- score min-score))
(log-score (- max-score min-score)))]
(max 0.0 (min 1.0 normalized)))))
(defn- add-blocks-fts-triggers!
@@ -184,15 +191,16 @@ DROP TRIGGER IF EXISTS blocks_au;
:bind bind
:rowMode "array"}))
blocks (bean/->clj result)]
(map (fn [block]
(let [[id page title rank snippet] (if enable-snippet?
(update block 4 get-snippet-result)
block)]
{:id id
:keyword-score (when rank (Math/abs rank))
:page page
:title title
:snippet snippet})) blocks))
(keep (fn [block]
(let [[id page title rank snippet] (if enable-snippet?
(update block 4 get-snippet-result)
block)]
(when title
{:id id
:keyword-score (+ (fuzzy/score q title) (js/Math.abs rank))
:page page
:title title
:snippet snippet}))) blocks))
(catch :default e
(prn :debug "Search blocks failed: ")
(js/console.error e))))
@@ -320,9 +328,9 @@ DROP TRIGGER IF EXISTS blocks_au;
(* (:semantic-weight config) s-score)
(cond
(ldb/page? block)
1
0.001
(:block/tags block)
0.02
0.0005
:else
0))]
(merge result
@@ -368,7 +376,10 @@ DROP TRIGGER IF EXISTS blocks_au;
non-match-result (when (and (not page-only?) non-match-input)
(search-blocks-aux search-db non-match-sql q non-match-input page limit enable-snippet?))
;; fuzzy is too slow for large graphs
fuzzy-result (when-not (or page large-graph?) (fuzzy-search repo @conn q option))
fuzzy-result (when-not (or page large-graph?)
(->> (fuzzy-search repo @conn q option)
(map (fn [result]
(assoc result :keyword-score (fuzzy/score q (:title result)))))))
semantic-search-result* (m/? (embedding/task--search repo q 10))
semantic-search-result (->> semantic-search-result*
(map (fn [{:keys [block distance]}]