mirror of
https://github.com/logseq/logseq.git
synced 2026-05-23 12:14:06 +00:00
feat(vec-search): debug ui support selecting model
This commit is contained in:
@@ -25,4 +25,8 @@ RTC won't start when major-schema-versions don't match"
|
||||
:logseq.kv/graph-backup-folder {:doc "Backup folder for automated backup feature"
|
||||
:rtc {:rtc/ignore-entity-when-init-upload true
|
||||
:rtc/ignore-entity-when-init-download true}}
|
||||
:logseq.kv/graph-initial-schema-version {:doc "Graph's schema version when created"})
|
||||
:logseq.kv/graph-initial-schema-version {:doc "Graph's schema version when created"}
|
||||
|
||||
:logseq.kv/graph-text-embedding-model-name {:doc "Graph's text-embedding model name"
|
||||
:rtc {:rtc/ignore-entity-when-init-upload true
|
||||
:rtc/ignore-entity-when-init-download true}})
|
||||
|
||||
@@ -15,6 +15,7 @@
|
||||
[]
|
||||
(let [repo (state/get-current-repo)
|
||||
^js worker @db-browser/*worker
|
||||
[model-info set-model-info] (hooks/use-state nil)
|
||||
[vec-search-state set-vec-search-state] (hooks/use-state nil)
|
||||
[query-string set-query-string] (hooks/use-state nil)
|
||||
[result set-result] (hooks/use-state nil)]
|
||||
@@ -26,6 +27,18 @@
|
||||
vector-search-flows/vector-search-state-flow)
|
||||
::update-vec-search-state :succ (constantly nil)))
|
||||
[])
|
||||
(hooks/use-effect!
|
||||
(fn []
|
||||
(c.m/run-task
|
||||
(m/reduce
|
||||
(constantly nil)
|
||||
(m/ap
|
||||
(m/?> vector-search-flows/infer-worker-ready-flow)
|
||||
(let [model-info (ldb/read-transit-str (c.m/<? (.vec-search-embedding-model-info worker repo)))]
|
||||
(prn :model-info model-info)
|
||||
(set-model-info model-info))))
|
||||
::fetch-model-info :succ (constantly nil)))
|
||||
[])
|
||||
(hooks/use-effect!
|
||||
(fn []
|
||||
(c.m/run-task
|
||||
@@ -36,28 +49,48 @@
|
||||
:update-search-result :succ (constantly nil)))
|
||||
[(hooks/use-debounced-value query-string 200)])
|
||||
[:div
|
||||
[:b "State"]
|
||||
(let [state-map (get-in vec-search-state [:repo->index-info repo])]
|
||||
[:pre.select-text
|
||||
(with-out-str
|
||||
(fipp/pprint state-map {:width 10}))])
|
||||
(shui/button
|
||||
{:size :sm
|
||||
:class "mx-2"
|
||||
:on-click (fn [_] (.vec-search-embedding-stale-blocks worker repo))}
|
||||
"embedding-stale-blocks")
|
||||
(shui/button
|
||||
{:size :sm
|
||||
:class "mx-2"
|
||||
:on-click (fn [_] (.vec-search-re-embedding-graph-data worker repo))}
|
||||
"force-embedding-all-graph-blocks")
|
||||
(when (get-in vec-search-state [:repo->index-info repo :indexing?])
|
||||
(shui/button
|
||||
{:size :sm
|
||||
:class "mx-2"
|
||||
:on-click (fn [_] (.vec-search-cancel-indexing worker repo))}
|
||||
"cancel-current-indexing"))
|
||||
[:hr]
|
||||
[:b "Search:"]
|
||||
[:b "Actions"]
|
||||
[:div
|
||||
(shui/button
|
||||
{:size :sm
|
||||
:class "mx-2"
|
||||
:on-click (fn [_] (.vec-search-embedding-stale-blocks worker repo))}
|
||||
"embedding-stale-blocks")
|
||||
(shui/button
|
||||
{:size :sm
|
||||
:class "mx-2"
|
||||
:on-click (fn [_] (.vec-search-re-embedding-graph-data worker repo))}
|
||||
"force-embedding-all-graph-blocks")
|
||||
(when (get-in vec-search-state [:repo->index-info repo :indexing?])
|
||||
(shui/button
|
||||
{:size :sm
|
||||
:class "mx-2"
|
||||
:on-click (fn [_] (.vec-search-cancel-indexing worker repo))}
|
||||
"cancel-current-indexing"))]
|
||||
[:hr]
|
||||
[:b "Settings"]
|
||||
(shui/select
|
||||
{:on-value-change (fn [model-name]
|
||||
(c.m/run-task
|
||||
(m/sp
|
||||
(c.m/<? (.vec-search-load-model worker repo model-name)))
|
||||
::load-model :succ (constantly nil)))}
|
||||
(shui/select-trigger
|
||||
(shui/select-value
|
||||
{:placeholder "Select a model(need force-embedding-all-graph-blocks again)"}))
|
||||
(shui/select-content
|
||||
(shui/select-group
|
||||
(let [graph-text-embedding-model-name (:graph-text-embedding-model-name model-info)]
|
||||
(for [model-name (:available-model-names model-info)]
|
||||
(shui/select-item {:value model-name :disabled? (= graph-text-embedding-model-name model-name)} model-name))))))
|
||||
[:hr]
|
||||
[:b "Search"]
|
||||
[:input.form-input.my-2.py-1
|
||||
{:on-change (fn [e] (set-query-string (util/evalue e)))}]
|
||||
[:b "Search Result:"]
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
[frontend.fs.sync :as sync]
|
||||
[frontend.handler :as handler]
|
||||
[frontend.handler.db-based.rtc-background-tasks]
|
||||
[frontend.handler.db-based.vector-search-background-tasks]
|
||||
[frontend.handler.plugin :as plugin-handler]
|
||||
[frontend.handler.route :as route-handler]
|
||||
[frontend.log]
|
||||
|
||||
@@ -16,6 +16,7 @@
|
||||
[frontend.db.restore :as db-restore]
|
||||
[frontend.error :as error]
|
||||
[frontend.handler.command-palette :as command-palette]
|
||||
[frontend.handler.db-based.vector-search-flows :as vector-search-flows]
|
||||
[frontend.handler.events :as events]
|
||||
[frontend.handler.file-based.events]
|
||||
[frontend.handler.file-based.file :as file-handler]
|
||||
@@ -180,7 +181,8 @@
|
||||
(log/info :webgpu-available? webgpu-available?)
|
||||
(when webgpu-available?
|
||||
(p/do! (db-browser/start-inference-worker!)
|
||||
(db-browser/<connect-db-worker-and-infer-worker!)))
|
||||
(db-browser/<connect-db-worker-and-infer-worker!)
|
||||
(reset! vector-search-flows/*infer-worker-ready true)))
|
||||
(when (util/electron?)
|
||||
(persist-db/run-export-periodically!))
|
||||
(when (mobile-util/native-platform?)
|
||||
|
||||
@@ -0,0 +1,22 @@
|
||||
(ns frontend.handler.db-based.vector-search-background-tasks
|
||||
"Background tasks for vector-search"
|
||||
(:require [frontend.common.missionary :as c.m]
|
||||
[frontend.config :as config]
|
||||
[frontend.flows :as flows]
|
||||
[frontend.persist-db.browser :as db-browser]
|
||||
[frontend.state :as state]
|
||||
[missionary.core :as m]))
|
||||
|
||||
(defn- run-background-task-when-not-publishing
  "Register `task` as a background task under `key'` via
  `c.m/run-background-task`. In publishing builds (`config/publishing?`)
  nothing is started and nil is returned."
  [key' task]
  (if config/publishing?
    nil
    (c.m/run-background-task key' task)))
|
||||
|
||||
;; Every time `flows/current-repo-flow` emits, ask the db worker to initialise
;; the embedding model for the current graph.
(run-background-task-when-not-publishing
 ::init-load-model-when-switch-graph
 (m/reduce
  (constantly nil)
  (m/ap
   ;; The emitted value itself is not used; it only acts as the trigger.
   (m/?> flows/current-repo-flow)
   ;; Skip when there is no current graph: the worker call needs a repo to
   ;; look up the graph's configured model name.
   (when-let [repo (state/get-current-repo)]
     (when-let [^js worker @db-browser/*worker]
       (c.m/<? (.vec-search-init-embedding-model worker repo)))))))
|
||||
@@ -3,6 +3,15 @@
|
||||
(:require [frontend.state :as state]
|
||||
[missionary.core :as m]))
|
||||
|
||||
;; input atoms
|
||||
;; Starts as nil; set to a non-nil value from outside this namespace once the
;; inference worker is usable.
(def *infer-worker-ready (atom nil))

(def infer-worker-ready-flow
  "Flow over `*infer-worker-ready` that lets through only the first non-nil
  value and then terminates."
  (->> (m/watch *infer-worker-ready)
       (m/eduction (comp (filter some?)
                         (take 1)))))
|
||||
|
||||
(def vector-search-state-flow
|
||||
(m/watch (:vector-search/state @state/state)))
|
||||
|
||||
|
||||
@@ -20,6 +20,14 @@
|
||||
[_this]
|
||||
(infer-worker.text-embedding/<init))
|
||||
|
||||
;; Delegates to `infer-worker.text-embedding/<load-model` with `model-name`.
(load-model
 [_ model-name]
 (infer-worker.text-embedding/<load-model model-name))
|
||||
|
||||
;; Returns the names (map keys) of the supported embedding models, converted
;; with `clj->js`.
(available-embedding-models
 [_this]
 (-> infer-worker.text-embedding/available-embedding-models
     keys
     clj->js))
|
||||
|
||||
(set-db-worker-proxy
|
||||
[_this proxy]
|
||||
(reset! infer-worker.state/*db-worker proxy)
|
||||
|
||||
@@ -9,3 +9,4 @@
|
||||
(defonce *hnsw-index (atom {}))
|
||||
|
||||
(defonce *extractor (atom nil))
|
||||
(defonce *model-name+config (atom nil))
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
"text embedding fns"
|
||||
(:require ["@huggingface/transformers" :refer [pipeline]]
|
||||
["hnswlib-wasm" :refer [loadHnswlib]]
|
||||
[clojure.data :as data]
|
||||
[frontend.common.missionary :as c.m]
|
||||
[frontend.inference-worker.state :as infer-worker.state]
|
||||
[frontend.worker-common.util :as worker-util]
|
||||
@@ -9,9 +10,16 @@
|
||||
[missionary.core :as m]
|
||||
[promesa.core :as p]))
|
||||
|
||||
;; Whenever an entry disappears from (or is replaced in) `*hnsw-index`, call
;; `.delete` on the index object that was stored under it.
(add-watch infer-worker.state/*hnsw-index :delete-obj-when-dissoc
           (fn [_key _ref old-value new-value]
             ;; First element of `data/diff` = things present only in the old value.
             (let [removed (first (data/diff old-value new-value))]
               (run! (fn [[repo ^js hnsw-index]]
                       (when hnsw-index
                         (log/info :delete-hnsw-index repo)
                         (.delete hnsw-index)))
                     removed))))
|
||||
|
||||
(def ^:private embedding-opts #js{"pooling" "mean" "normalize" true})
|
||||
|
||||
(def ^:private num-dimensions 384)
|
||||
(def ^:private init-max-elems 100)
|
||||
|
||||
(defn- split-into-chunks
|
||||
@@ -24,23 +32,37 @@
|
||||
(recur (+ i chunk-size))))
|
||||
result))
|
||||
|
||||
(defn- init-index
|
||||
(defn- init-index!
|
||||
[^js hnsw]
|
||||
(.initIndex hnsw init-max-elems 16 200 100)
|
||||
(.setEfSearch hnsw 64 ;;default 32
|
||||
))
|
||||
|
||||
(defn- ^js ensure-hnsw-index!
|
||||
(defn- ^js get-hnsw-index
|
||||
[repo]
|
||||
(or (@infer-worker.state/*hnsw-index repo)
|
||||
(let [hnsw-ctor (.-HierarchicalNSW ^js @infer-worker.state/*hnswlib)
|
||||
hnsw (new hnsw-ctor "cosine" num-dimensions "")
|
||||
hnsw (new hnsw-ctor "cosine" (or (:dims (:hnsw-config (second @infer-worker.state/*model-name+config))) 384) "")
|
||||
file-exists? (.checkFileExists (.-EmscriptenFileSystemManager ^js @infer-worker.state/*hnswlib) repo)]
|
||||
(if file-exists?
|
||||
(when file-exists?
|
||||
(.readIndex hnsw repo init-max-elems)
|
||||
(init-index hnsw))
|
||||
(swap! infer-worker.state/*hnsw-index assoc repo hnsw)
|
||||
(@infer-worker.state/*hnsw-index repo))))
|
||||
(swap! infer-worker.state/*hnsw-index assoc repo hnsw)
|
||||
hnsw))))
|
||||
|
||||
(defn- ^js new-hnsw-index!
  "Create a fresh, initialised HNSW index for `repo`, store it in
  `*hnsw-index` and return it.
  If `get-hnsw-index` returns an index for `repo`, that entry is dissoc'ed
  first. The index dimension comes from the loaded model's
  `[:hnsw-config :dims]`, falling back to 384."
  [repo]
  (when (get-hnsw-index repo)
    (swap! infer-worker.state/*hnsw-index dissoc repo))
  (let [ctor (.-HierarchicalNSW ^js @infer-worker.state/*hnswlib)
        [_model-name model-config] @infer-worker.state/*model-name+config
        dims (or (get-in model-config [:hnsw-config :dims]) 384)
        index (doto (new ctor "cosine" dims "")
                (init-index!))]
    (swap! infer-worker.state/*hnsw-index assoc repo index)
    index))
|
||||
|
||||
(defn- model-loaded?
  "Truthy when both `*extractor` and `*model-name+config` hold a value.
  Like `and`, returns the first falsy value, or else the value of
  `*model-name+config`."
  []
  (let [extractor @infer-worker.state/*extractor]
    (if extractor
      @infer-worker.state/*model-name+config
      extractor)))
|
||||
|
||||
(defn <text-embedding
|
||||
[text-array]
|
||||
@@ -64,20 +86,22 @@
|
||||
|
||||
(defn delete-items
|
||||
[repo labels]
|
||||
(.markDeleteItems ^js (ensure-hnsw-index! repo) (into-array labels)))
|
||||
(when-let [hnsw (get-hnsw-index repo)]
|
||||
(.markDeleteItems hnsw (into-array labels))))
|
||||
|
||||
(defn task--text-embedding&store!
|
||||
"return labels(js-array)"
|
||||
[repo text-array delete-labels replace-deleted?]
|
||||
(m/sp
|
||||
(let [{:keys [data _type dims _size]} (worker-util/profile :<text-embedding
|
||||
(c.m/<? (<text-embedding text-array)))
|
||||
data-coll (split-into-chunks data (last dims))
|
||||
_ (assert (= (count text-array) (count data-coll)))
|
||||
^js hnsw (ensure-hnsw-index! repo)]
|
||||
(when (seq delete-labels) (.markDeleteItems hnsw (into-array delete-labels)))
|
||||
(worker-util/profile (keyword "add-items" (str (alength data-coll)))
|
||||
(add-items hnsw data-coll replace-deleted?)))))
|
||||
(when (model-loaded?)
|
||||
(let [hnsw (or (get-hnsw-index repo) (new-hnsw-index! repo))
|
||||
{:keys [data _type dims _size]} (worker-util/profile :<text-embedding
|
||||
(c.m/<? (<text-embedding text-array)))
|
||||
data-coll (split-into-chunks data (last dims))
|
||||
_ (assert (= (count text-array) (count data-coll)))]
|
||||
(when (seq delete-labels) (.markDeleteItems hnsw (into-array delete-labels)))
|
||||
(worker-util/profile (keyword "add-items" (str (alength data-coll)))
|
||||
(add-items hnsw data-coll replace-deleted?))))))
|
||||
|
||||
(def ^:private write-index-wait-delays-flow
|
||||
(m/ap
|
||||
@@ -111,48 +135,62 @@
|
||||
Return synced? (bool)"
|
||||
[repo]
|
||||
(m/sp
|
||||
(let [hnsw (ensure-hnsw-index! repo)]
|
||||
(when-let [hnsw (get-hnsw-index repo)]
|
||||
(when-not (zero? (.getCurrentCount hnsw))
|
||||
(init-index hnsw)
|
||||
(init-index! hnsw)
|
||||
(m/? (task--write-index!* repo hnsw))))))
|
||||
|
||||
(defn task--write-index!
|
||||
[repo]
|
||||
(m/sp
|
||||
(let [hnsw (ensure-hnsw-index! repo)]
|
||||
(when-let [hnsw (get-hnsw-index repo)]
|
||||
(m/? (task--write-index!* repo hnsw)))))
|
||||
|
||||
(defn- search-knn
|
||||
[repo query-point num-neighbors]
|
||||
(let [^js hnsw (ensure-hnsw-index! repo)]
|
||||
(when-let [hnsw (get-hnsw-index repo)]
|
||||
(.searchKnn hnsw query-point num-neighbors nil)))
|
||||
|
||||
(defn <search-knn
|
||||
"return labels"
|
||||
[repo query-string num-neighbors]
|
||||
(p/let [query-embedding (<text-embedding #js[query-string])
|
||||
query-point (:data query-embedding)]
|
||||
(search-knn repo query-point num-neighbors)))
|
||||
(when (model-loaded?)
|
||||
(p/let [query-embedding (<text-embedding #js[query-string])
|
||||
query-point (:data query-embedding)]
|
||||
(search-knn repo query-point num-neighbors))))
|
||||
|
||||
(defn index-info
|
||||
[repo]
|
||||
(let [^js hnsw (ensure-hnsw-index! repo)]
|
||||
(when-let [hnsw (get-hnsw-index repo)]
|
||||
{:current-count (.getCurrentCount hnsw)
|
||||
:max-elements (.getMaxElements hnsw)
|
||||
:ef-search (.getEfSearch hnsw)
|
||||
:num-dims (.getNumDimensions hnsw)}))
|
||||
|
||||
(def available-embedding-models
  "Supported text-embedding models, keyed by model name.
  `:tf-config` holds options merged into the transformers pipeline options;
  `:hnsw-config` `:dims` is the vector dimension used when creating the HNSW index."
  {"Xenova/all-MiniLM-L6-v2"
   {:tf-config   {:dtype "fp32"}
    :hnsw-config {:dims 384}}

   "Xenova/jina-embeddings-v2-base-zh"
   {:tf-config   {:dtype "fp32"}
    :hnsw-config {:dims 768}}})
|
||||
|
||||
(defn <load-model
  "Load the feature-extraction pipeline for `model-name`.
  Returns nil when `model-name` is not a key of `available-embedding-models`.
  Otherwise returns a promise that, once the pipeline is created, stores it in
  `*extractor`, records `[model-name config]` in `*model-name+config` and
  resolves to true."
  [model-name]
  (when-let [model-config (available-embedding-models model-name)]
    (let [pipeline-opts (-> (:tf-config model-config)
                            (merge {"device" "webgpu"
                                    "progress_callback" (fn [progress]
                                                          (log/info :progress progress))})
                            clj->js)]
      (p/let [extractor (pipeline "feature-extraction" model-name pipeline-opts)]
        (reset! infer-worker.state/*extractor extractor)
        (reset! infer-worker.state/*model-name+config [model-name model-config])
        true))))
|
||||
|
||||
(defn <init
|
||||
[]
|
||||
(p/do!
|
||||
(p/let [hnswlib (loadHnswlib)]
|
||||
(reset! infer-worker.state/*hnswlib hnswlib)
|
||||
(.setDebugLogs (.-EmscriptenFileSystemManager ^js @infer-worker.state/*hnswlib) true)
|
||||
(log/info :loaded :hnswlib))
|
||||
(p/let [extractor (pipeline "feature-extraction" "Xenova/all-MiniLM-L6-v2" #js{"device" "webgpu" "dtype" "fp32"})]
|
||||
(reset! infer-worker.state/*extractor extractor)
|
||||
(log/info :loaded :extractor))))
|
||||
|
||||
(log/info :loaded :hnswlib))))
|
||||
|
||||
(comment
|
||||
(def repo "repo-1")
|
||||
|
||||
@@ -926,6 +926,20 @@
|
||||
dbs (ldb/read-transit-str r)]
|
||||
(p/all (map #(.unsafeUnlinkDB this (:name %)) dbs))))
|
||||
|
||||
;; vec-search ;;;
|
||||
;; Model info for `repo` (see `embedding/task--embedding-model-info`),
;; passed through `with-write-transit-str`.
(vec-search-embedding-model-info
 [_this repo]
 (with-write-transit-str
   (embedding/task--embedding-model-info repo)))
|
||||
|
||||
;; Runs `embedding/task--init-embedding-model` for `repo`; the task is handed
;; to the Promise constructor as its executor.
(vec-search-init-embedding-model
 [_this repo]
 (let [init-task (embedding/task--init-embedding-model repo)]
   (js/Promise. init-task)))
|
||||
|
||||
;; Runs `embedding/task--load-model` for `repo` and `model-name`; the task is
;; handed to the Promise constructor as its executor.
(vec-search-load-model
 [_this repo model-name]
 (let [load-task (embedding/task--load-model repo model-name)]
   (js/Promise. load-task)))
|
||||
|
||||
(vec-search-embedding-stale-blocks
|
||||
[this repo]
|
||||
(embedding/embedding-stale-blocks! repo))
|
||||
|
||||
@@ -6,20 +6,21 @@
|
||||
[frontend.common.missionary :as c.m]
|
||||
[frontend.worker-common.util :as worker-util]
|
||||
[frontend.worker.state :as worker-state]
|
||||
[lambdaisland.glogi :as log]
|
||||
[logseq.common.config :as common-config]
|
||||
[logseq.db :as ldb]
|
||||
[medley.core :as medley]
|
||||
[missionary.core :as m]))
|
||||
|
||||
;;; TODOs:
|
||||
;;; - [x] add :logseq.property/description into text-to-embedding
|
||||
;;; - [ ] add tags to text-to-embedding
|
||||
;;; - [x] check webgpu available, transformers.js is slow without webgpu(the difference is ~70 times)
|
||||
;;; - [ ] expose index-state to ui
|
||||
;;; - [x] expose index-state to ui
|
||||
;;; - [ ] show progress when loading/downloading models
|
||||
|
||||
(def ^:private empty-vector-search-state
|
||||
{:repo->index-info {} ;; repo->index-info
|
||||
:repo->canceler {} ;; repo->canceler
|
||||
:repo->canceler {} ;; repo->canceler
|
||||
})
|
||||
|
||||
(def ^:private vector-search-state-keys (set (keys empty-vector-search-state)))
|
||||
@@ -98,7 +99,7 @@
|
||||
|
||||
(defn- labels-update-tx-data
|
||||
[db e+updated-at-coll added-labels]
|
||||
(assert (= (count e+updated-at-coll) (count added-labels)))
|
||||
(assert (= (count e+updated-at-coll) (count added-labels)) [e+updated-at-coll added-labels])
|
||||
(let [es (map first e+updated-at-coll)
|
||||
exist-es (set (keep
|
||||
(fn [b] (when (:block/uuid b) (:db/id b)))
|
||||
@@ -179,6 +180,34 @@
|
||||
:re-embedding-graph-data! :succ (constantly nil))]
|
||||
(reset-*vector-search-state! repo :canceler canceler))))
|
||||
|
||||
(defn task--embedding-model-info
  "Task returning a map describing the embedding models for `repo`:
  - `:available-model-names`: model names reported by the inference worker
  - `:graph-text-embedding-model-name`: value stored in the graph under
    `:logseq.kv/graph-text-embedding-model-name`, or nil when it is unset or
    the graph has no db connection.
  Returns nil when no inference worker is set."
  [repo]
  (m/sp
    (when-let [^js infer-worker @worker-state/*infer-worker]
      (let [available-model-names (c.m/<? (.available-embedding-models infer-worker))
            ;; NOTE(review): assumes `get-datascript-conn` returns nil for a
            ;; repo without an open db; guarding avoids deref'ing nil so the
            ;; available models can still be reported.
            embedding-model-name (some-> (worker-state/get-datascript-conn repo)
                                         deref
                                         (ldb/get-key-value :logseq.kv/graph-text-embedding-model-name))]
        {:available-model-names available-model-names
         :graph-text-embedding-model-name embedding-model-name}))))
|
||||
|
||||
(defn task--init-embedding-model
  "Task that loads, in the inference worker, the embedding model whose name is
  stored in `repo` under `:logseq.kv/graph-text-embedding-model-name`.
  Does nothing (apart from logging) when the graph has no db connection or no
  model name has been stored yet. Returns nil when no inference worker is set."
  [repo]
  (m/sp
    (when-let [^js infer-worker @worker-state/*infer-worker]
      ;; NOTE(review): assumes `get-datascript-conn` returns nil for a repo
      ;; without an open db. Deref'ing nil would throw and fail the caller,
      ;; so that case is skipped instead.
      (if-let [conn (worker-state/get-datascript-conn repo)]
        (if-let [embedding-model-name (ldb/get-key-value @conn :logseq.kv/graph-text-embedding-model-name)]
          (c.m/<? (.load-model infer-worker embedding-model-name))
          (log/info :init-load-model "model-name has not been set yet, skip"))
        (log/info :init-load-model "db connection not found, skip")))))
|
||||
|
||||
(defn task--load-model
  "Task that loads `model-name` in the inference worker and, when the worker
  reports success, stores the name in `repo` under
  `:logseq.kv/graph-text-embedding-model-name`.
  Nothing is loaded when the graph has no db connection, so the loaded model
  and the persisted model name cannot diverge. Returns nil when no inference
  worker is set."
  [repo model-name]
  (m/sp
    (when-let [^js infer-worker @worker-state/*infer-worker]
      ;; NOTE(review): assumes `get-datascript-conn` returns nil for a repo
      ;; without an open db; checked up front so `d/transact!` never sees nil.
      (if-let [conn (worker-state/get-datascript-conn repo)]
        (when (c.m/<? (.load-model infer-worker model-name))
          (d/transact! conn [(ldb/kv :logseq.kv/graph-text-embedding-model-name model-name)])
          (log/info :loaded-model model-name))
        (log/info :load-model "db connection not found, skip")))))
|
||||
|
||||
(defn- remove-outdated-hnsw-label!
|
||||
[conn es]
|
||||
(when (seq es)
|
||||
|
||||
Reference in New Issue
Block a user