feat: init add frontend.inference-worker.text-embedding

This commit is contained in:
rcmerci
2025-03-13 22:07:30 +08:00
parent e2271dd03d
commit 95c0f2ef1c
23 changed files with 173 additions and 60 deletions

View File

@@ -584,7 +584,14 @@
:schema {:type :class
:cardinality :many
:public? true}
:queryable? true})))
:queryable? true}
:logseq.property.search/hnsw-label {:title "HNSW label"
:schema {:type :raw-number
:public? false
:hide? true}
:rtc {:rtc/ignore-attr-when-init-upload true
:rtc/ignore-attr-when-init-download true
:rtc/ignore-attr-when-syncing true}})))
(def built-in-properties
(->> built-in-properties*
@@ -647,7 +654,7 @@
"logseq.property.linked-references" "logseq.property.asset" "logseq.property.table" "logseq.property.node"
"logseq.property.code"
"logseq.property.journal" "logseq.property.class" "logseq.property.view"
"logseq.property.user" "logseq.property.history"})
"logseq.property.user" "logseq.property.history" "logseq.property.search"})
(defn logseq-property?
"Determines if keyword is a logseq property"

View File

@@ -37,7 +37,7 @@
(map (juxt :major :minor)
[(parse-schema-version x) (parse-schema-version y)])))
(def version (parse-schema-version "64.3"))
(def version (parse-schema-version "64.4"))
(defn major-version
"Return a number.

View File

@@ -144,6 +144,7 @@
"fuse.js": "6.4.6",
"grapheme-splitter": "1.0.4",
"graphology": "0.20.0",
"hnswlib-wasm": "^0.8.2",
"html2canvas": "^1.4.1",
"ignore": "5.1.8",
"inter-ui": "^3.19.3",

View File

@@ -1,10 +1,40 @@
(ns frontend.inference-worker.inference-worker
"Worker used for text embedding and vector-db"
(:require ["@huggingface/transformers" :refer [AutoTokenizer]]
[lambdaisland.glogi :as log]
[lambdaisland.glogi.console :as glogi-console]))
(:require ["comlink" :as Comlink]
[frontend.inference-worker.text-embedding :as infer-worker.text-embedding]
[lambdaisland.glogi.console :as glogi-console]
[logseq.db :as ldb]
[promesa.core :as p]
[shadow.cljs.modern :refer [defclass]]))
;; Worker-side object whose methods form the API of the inference worker.
;; Each method delegates to frontend.inference-worker.text-embedding and returns
;; a promise.
#_:clj-kondo/ignore
(defclass InferenceWorker
(extends js/Object)
(constructor
[this]
(super))
Object
;; Delegates to `<init` in the text-embedding namespace.
(init
[_this]
(infer-worker.text-embedding/<init))
;; Embeds `text-coll`; the resulting map is returned as a transit string.
(text-embedding
[_this text-coll]
(p/let [output (infer-worker.text-embedding/<text-embedding text-coll)]
(ldb/write-transit-str output)))
;; Embeds `text-coll` and stores the vectors for `repo` (passing `delete-labels`
;; through); the returned labels are encoded as a transit string.
(text-embedding+store!
[_this repo text-coll delete-labels]
(p/let [labels (infer-worker.text-embedding/<text-embedding&store! repo text-coll delete-labels)]
(ldb/write-transit-str labels)))
;; Returns the result of `<search-knn` as-is; unlike the methods above it is
;; NOT transit-encoded.
(search
[_this repo query-string nums-neighbors]
(infer-worker.text-embedding/<search-knn repo query-string nums-neighbors)))
(defn init
[]
(glogi-console/install!)
(log/info :init 1))
(let [^js obj #_{:clj-kondo/ignore [:unresolved-symbol]} (InferenceWorker.)]
(Comlink/expose obj)))

View File

@@ -0,0 +1,9 @@
(ns frontend.inference-worker.state
"State hub for inference-worker")
;; Holds the loaded hnswlib-wasm module; nil until
;; frontend.inference-worker.text-embedding/<init has stored it.
(defonce *hnswlib (atom nil))
;; Map of repo -> HNSW index instance, filled lazily per repo.
(defonce *hnsw-index (atom {}))
;; Holds the feature-extraction pipeline used to compute embeddings; nil until
;; frontend.inference-worker.text-embedding/<init has stored it.
(defonce *extractor (atom nil))

View File

@@ -0,0 +1,77 @@
(ns frontend.inference-worker.text-embedding
"text embedding fns"
(:require ["@huggingface/transformers" :refer [pipeline]]
["hnswlib-wasm" :refer [loadHnswlib]]
[frontend.inference-worker.state :as infer-worker.state]
[lambdaisland.glogi :as log]
[promesa.core :as p]))
;; Options passed to the extractor on every embedding call.
(def ^:private embedding-opts #js{"pooling" "mean" "normalize" true})
;; Vector dimension given to the HierarchicalNSW constructor.
;; NOTE(review): presumably the output size of Xenova/all-MiniLM-L6-v2 - confirm
;; if the model is ever changed.
(def ^:private num-dimensions 384)
;; Max element count passed to `initIndex` / `readIndex` when an index is
;; created or loaded.
(def ^:private init-max-elems 100)
(defn- split-into-chunks
  "Splits `js-array` (a JS array or typed array, anything supporting `.slice`)
  into consecutive slices of `chunk-size` elements and returns them in a new
  JS array. The final slice is shorter when the length is not a multiple of
  `chunk-size`.

  Throws when `chunk-size` is not a positive number: a nil, zero, negative or
  NaN step would never advance the index and the loop would not terminate."
  [js-array chunk-size]
  (when-not (and (number? chunk-size) (pos? chunk-size))
    (throw (ex-info "chunk-size must be a positive number"
                    {:chunk-size chunk-size})))
  (let [length (alength js-array)
        result (array)]
    (loop [i 0]
      (when (< i length)
        (.push result (.slice js-array i (+ i chunk-size)))
        (recur (+ i chunk-size))))
    result))
(defn- ensure-hnsw-index!
  "Returns the HNSW index registered for `repo` in
  `infer-worker.state/*hnsw-index`, creating and registering one first when
  none exists. A new index is read from the file named `repo` when
  `checkFileExists` reports it, otherwise initialised empty; in both cases
  ef-search is set to 32."
  [repo]
  (if-let [cached (get @infer-worker.state/*hnsw-index repo)]
    cached
    (let [hnsw-ctor (.-HierarchicalNSW ^js @infer-worker.state/*hnswlib)
          index (new hnsw-ctor "l2" num-dimensions repo)
          fs-manager (.-EmscriptenFileSystemManager ^js @infer-worker.state/*hnswlib)]
      (if (.checkFileExists fs-manager repo)
        (.readIndex index repo init-max-elems)
        (.initIndex index init-max-elems 16 200 100))
      (.setEfSearch index 32)
      ;; swap! returns the updated map, so look the index up in it directly.
      (-> (swap! infer-worker.state/*hnsw-index assoc repo index)
          (get repo)))))
(defn <text-embedding
  "Runs the extractor stored in `infer-worker.state/*extractor` over
  `text-coll` (converted to a JS array when it is not one already) with
  `embedding-opts`. Returns a promise of a map holding the `:data`, `:type`,
  `:dims` and `:size` fields of the extractor's result."
  [text-coll]
  (p/let [inputs (cond-> text-coll
                   (not (array? text-coll)) into-array)
          ^js output (._call ^js @infer-worker.state/*extractor inputs embedding-opts)]
    {:data (.-data output)
     :type (.-type output)
     :dims (.-dims output)
     :size (.-size output)}))
(defn- ensure-capacity!
  "Grows `hnsw` when `num-new-items` more points would exceed its max element
  count. The index is created with `init-max-elems`, so without this step
  inserts past that limit fail."
  [^js hnsw num-new-items]
  (let [max-elems (.getMaxElements hnsw)
        ;; NOTE(review): the current count presumably still includes points
        ;; marked as deleted, so this is an upper bound on what is needed.
        required (+ (.getCurrentCount hnsw) num-new-items)]
    (when (> required max-elems)
      ;; At least double to avoid resizing on every small batch.
      (.resizeIndex hnsw (max required (* 2 max-elems))))))

(defn <text-embedding&store!
  "Embeds every text in `text-coll` and adds the vectors to the HNSW index of
  `repo`, growing the index first when the batch would not fit.
  `delete-labels`, when non-empty, are marked as deleted before inserting;
  `addItems` is called with its replace-deleted flag set to true.
  Returns a promise of the value returned by `addItems` (the labels)."
  [repo text-coll delete-labels]
  (p/let [{:keys [data dims]} (<text-embedding text-coll)
          ;; `data` is flat; the last entry of `dims` is the per-text vector size.
          data-coll (split-into-chunks data (last dims))
          _ (assert (= (count text-coll) (count data-coll)))
          ^js hnsw (ensure-hnsw-index! repo)]
    (when (seq delete-labels) (.markDeleteItems hnsw (into-array delete-labels)))
    (ensure-capacity! hnsw (alength data-coll))
    (.addItems hnsw data-coll true)))
(defn- search-knn
  "Calls `.searchKnn` on the HNSW index of `repo` with `query-point` and
  `num-neighbors`, and returns its result."
  [repo query-point num-neighbors]
  (let [^js index (ensure-hnsw-index! repo)
        ;; NOTE(review): the trailing nil is presumably the optional filter
        ;; argument of searchKnn - confirm against hnswlib-wasm.
        extra-arg nil]
    (.searchKnn index query-point num-neighbors extra-arg)))
(defn <search-knn
  "Embeds `query-string` and searches the HNSW index of `repo` for its
  `num-neighbors` nearest neighbours. Returns a promise of the `search-knn`
  result."
  [repo query-string num-neighbors]
  (p/let [{query-point :data} (<text-embedding [query-string])]
    (search-knn repo query-point num-neighbors)))
(defn <init
  "Loads hnswlib-wasm and the feature-extraction pipeline, storing them in
  `infer-worker.state/*hnswlib` and `infer-worker.state/*extractor`.

  The pipeline is created on the WebGPU backend first; if that fails (e.g.
  WebGPU is not available in this environment) it is retried on the wasm
  backend so that initialisation does not reject outright.
  Returns a promise that settles once both are loaded."
  []
  (p/do!
   (p/let [hnswlib (loadHnswlib)]
     (reset! infer-worker.state/*hnswlib hnswlib)
     (.setDebugLogs (.-EmscriptenFileSystemManager ^js @infer-worker.state/*hnswlib) true)
     (log/info :loaded :hnswlib))
   (p/let [extractor (-> (pipeline "feature-extraction" "Xenova/all-MiniLM-L6-v2" #js{"device" "webgpu"})
                         (p/catch (fn [e]
                                    (log/warn :extractor-webgpu-failed e)
                                    (pipeline "feature-extraction" "Xenova/all-MiniLM-L6-v2" #js{"device" "wasm"}))))]
     (reset! infer-worker.state/*extractor extractor)
     (log/info :loaded :extractor))))

View File

@@ -13,6 +13,7 @@
[frontend.persist-db.protocol :as protocol]
[frontend.state :as state]
[frontend.util :as util]
[lambdaisland.glogi :as log]
[logseq.db :as ldb]
[promesa.core :as p]))
@@ -154,7 +155,13 @@
(let [worker-url (if (util/electron?)
"js/inference-worker.js"
"static/js/inference-worker.js")
_worker (js/Worker. (str worker-url "?electron=" (util/electron?) "&publishing=" config/publishing?))])))
worker (js/Worker. (str worker-url "?electron=" (util/electron?) "&publishing=" config/publishing?))
wrapped-worker (Comlink/wrap worker)
t1 (util/time-ms)]
(reset! state/*infer-worker wrapped-worker)
(p/do!
(.init wrapped-worker)
(log/info "init infer-worker spent:" (str (- (util/time-ms) t1) "ms"))))))
(defn <export-db!
[repo data]

View File

@@ -33,6 +33,8 @@
(defonce *db-worker (atom nil))
(defonce *infer-worker (atom nil))
;; Stores main application state
(defonce ^:large-vars/data-var state
(let [document-mode? (or (storage/get :document/mode?) false)
@@ -52,7 +54,7 @@
:nfs/user-granted? {}
:nfs/refreshing? nil
:instrument/disabled? (storage/get "instrument-disabled")
;; TODO: how to detect the network reliably?
;; TODO: how to detect the network reliably?
:network/online? true
:indexeddb/support? true
:me nil
@@ -80,7 +82,7 @@
:ui/navigation-item-collapsed? {}
:ui/recent-pages (or (storage/get :ui/recent-pages) {})
;; right sidebar
;; right sidebar
:ui/handbooks-open? false
:ui/help-open? false
:ui/fullscreen? false
@@ -142,7 +144,7 @@
:editor/on-paste? (atom false)
:editor/last-key-code (atom nil)
:ui/global-last-key-code (atom nil)
:editor/block-op-type nil ;; :cut, :copy
:editor/block-op-type nil ;; :cut, :copy
:editor/block-refs (atom #{})
;; Stores deleted refed blocks, indexed by repo
@@ -231,7 +233,7 @@
:plugin/navs-settings? true
:plugin/focused-settings nil ;; plugin id
;; pdf
;; pdf
:pdf/system-win? false
:pdf/current nil
:pdf/ref-highlight nil

View File

@@ -1112,7 +1112,7 @@
(defn keyname [key] (str (namespace key) "/" (name key)))
;; FIXME: drain-chan was copied from frontend.worker.util due to shadow-cljs compile bug
;; FIXME: drain-chan was copied from frontend.worker-common.util due to shadow-cljs compile bug
#?(:cljs
(defn drain-chan
"drop all stuffs in CH, and return all of them"

View File

@@ -757,7 +757,8 @@
["64.2" {:properties [:logseq.property.view/feature-type]
:fix migrate-views}]
["64.3" {:properties [:logseq.property/used-template :logseq.property/template-applied-to]
:classes [:logseq.class/Template]}]])
:classes [:logseq.class/Template]}]
["64.4" {:properties [:logseq.property.search/hnsw-label]}]])
(let [[major minor] (last (sort (map (comp (juxt :major :minor) db-schema/parse-schema-version first)
schema-version->updates)))

View File

@@ -1,6 +1,6 @@
(ns frontend.worker.db.validate
"Validate db"
(:require [frontend.worker.util :as worker-util]
(:require [frontend.worker-common.util :as worker-util]
[logseq.db.frontend.validate :as db-validate]))
(defn validate-db

View File

@@ -2,10 +2,10 @@
"Db listeners for worker-db."
(:require [cljs-bean.core :as bean]
[datascript.core :as d]
[frontend.worker-common.util :as worker-util]
[frontend.worker.pipeline :as worker-pipeline]
[frontend.worker.search :as search]
[frontend.worker.state :as worker-state]
[frontend.worker.util :as worker-util]
[logseq.common.util :as common-util]
[logseq.outliner.batch-tx :as batch-tx]
[promesa.core :as p]))

View File

@@ -1,6 +1,6 @@
(ns frontend.worker.db-metadata
"Fns to read/write metadata.edn file for db-based."
(:require [frontend.worker.util :as worker-util]
(:require [frontend.worker-common.util :as worker-util]
[promesa.core :as p]))
(defn <store

View File

@@ -9,6 +9,7 @@
[datascript.core :as d]
[datascript.storage :refer [IStorage] :as storage]
[frontend.common.file.core :as common-file]
[frontend.worker-common.util :as worker-util]
[frontend.worker.crypt :as worker-crypt]
[frontend.worker.db-listener :as db-listener]
[frontend.worker.db-metadata :as worker-db-metadata]
@@ -26,8 +27,8 @@
[frontend.worker.search :as search]
[frontend.worker.state :as worker-state] ;; [frontend.worker.undo-redo :as undo-redo]
[frontend.worker.undo-redo2 :as undo-redo]
[frontend.worker.util :as worker-util]
[goog.object :as gobj]
[lambdaisland.glogi :as log]
[lambdaisland.glogi.console :as glogi-console]
[logseq.common.config :as common-config]
[logseq.common.util :as common-util]
@@ -60,8 +61,8 @@
[graph]
(when-not @*publishing?
(or (worker-state/get-opfs-pool graph)
(p/let [^js pool (.installOpfsSAHPoolVfs @*sqlite #js {:name (worker-util/get-pool-name graph)
:initialCapacity 20})]
(p/let [^js pool (.installOpfsSAHPoolVfs ^js @*sqlite #js {:name (worker-util/get-pool-name graph)
:initialCapacity 20})]
(swap! *opfs-pools assoc graph pool)
pool))))
@@ -396,9 +397,7 @@
db-dirs (filter (fn [file]
(string/starts-with? (.-name file) ".logseq-pool-"))
current-dir-dirs)]
(prn :debug
:db-dirs (map #(.-name %) db-dirs)
:all-dirs (map #(.-name %) current-dir-dirs))
(log/info :db-dirs (map #(.-name %) db-dirs) :all-dirs (map #(.-name %) current-dir-dirs))
(p/all (map (fn [dir]
(p/let [graph-name (-> (.-name dir)
(string/replace-first ".logseq-pool-" "")

View File

@@ -9,8 +9,8 @@
[frontend.common.async-util :as async-util]
[frontend.common.file.core :as common-file]
[frontend.common.file.util :as wfu]
[frontend.worker-common.util :as worker-util]
[frontend.worker.state :as worker-state]
[frontend.worker.util :as worker-util]
[goog.object :as gobj]
[lambdaisland.glogi :as log]
[logseq.common.date :as common-date]

View File

@@ -1,11 +1,11 @@
(ns frontend.worker.pipeline
"Pipeline work after transaction"
(:require [datascript.core :as d]
[frontend.worker-common.util :as worker-util]
[frontend.worker.commands :as commands]
[frontend.worker.file :as file]
[frontend.worker.react :as worker-react]
[frontend.worker.state :as worker-state]
[frontend.worker.util :as worker-util]
[logseq.common.defkeywords :refer [defkeywords]]
[logseq.db :as ldb]
[logseq.db.frontend.validate :as db-validate]

View File

@@ -3,6 +3,7 @@
(:require [clojure.data :as data]
[datascript.core :as d]
[frontend.common.missionary :as c.m]
[frontend.worker-common.util :as worker-util]
[frontend.worker.device :as worker-device]
[frontend.worker.rtc.asset :as r.asset]
[frontend.worker.rtc.branch-graph :as r.branch-graph]
@@ -16,7 +17,6 @@
[frontend.worker.rtc.ws :as ws]
[frontend.worker.rtc.ws-util :as ws-util :refer [gen-get-ws-create-map--memoized]]
[frontend.worker.state :as worker-state]
[frontend.worker.util :as worker-util]
[lambdaisland.glogi :as log]
[logseq.common.config :as common-config]
[logseq.db :as ldb]

View File

@@ -5,6 +5,7 @@
[clojure.set :as set]
[datascript.core :as d]
[frontend.common.missionary :as c.m]
[frontend.worker-common.util :as worker-util]
[frontend.worker.crypt :as crypt]
[frontend.worker.db-listener :as db-listener]
[frontend.worker.rtc.client-op :as client-op]
@@ -12,7 +13,6 @@
[frontend.worker.rtc.log-and-state :as rtc-log-and-state]
[frontend.worker.rtc.ws-util :as ws-util]
[frontend.worker.state :as worker-state]
[frontend.worker.util :as worker-util]
[logseq.db :as ldb]
[logseq.db.frontend.malli-schema :as db-malli-schema]
[logseq.db.frontend.schema :as db-schema]

View File

@@ -1,7 +1,7 @@
(ns frontend.worker.rtc.log-and-state
"Fns to generate rtc related logs"
(:require [frontend.common.missionary :as c.m]
[frontend.worker.util :as worker-util]
[frontend.worker-common.util :as worker-util]
[lambdaisland.glogi :as log]
[logseq.common.defkeywords :refer [defkeywords]]
[malli.core :as ma]

View File

@@ -4,6 +4,7 @@
[clojure.set :as set]
[clojure.string :as string]
[datascript.core :as d]
[frontend.worker-common.util :as worker-util]
[frontend.worker.handler.page :as worker-page]
[frontend.worker.rtc.asset :as r.asset]
[frontend.worker.rtc.client-op :as client-op]
@@ -11,7 +12,6 @@
[frontend.worker.rtc.log-and-state :as rtc-log-and-state]
[frontend.worker.rtc.malli-schema :as rtc-schema]
[frontend.worker.state :as worker-state]
[frontend.worker.util :as worker-util]
[lambdaisland.glogi :as log]
[logseq.clj-fractional-indexing :as index]
[logseq.common.defkeywords :refer [defkeywords]]

View File

@@ -2,8 +2,8 @@
"Validate skeleton data between server and client"
(:require [clojure.data :as data]
[datascript.core :as d]
[frontend.worker-common.util :as worker-util]
[frontend.worker.rtc.ws-util :as ws-util]
[frontend.worker.util :as worker-util]
[lambdaisland.glogi :as log]
[logseq.db :as ldb]
[logseq.db.frontend.schema :as db-schema]

View File

@@ -1,6 +1,6 @@
(ns frontend.worker.util
(ns frontend.worker-common.util
"Worker utils"
#?(:cljs (:require-macros [frontend.worker.util]))
#?(:cljs (:require-macros [frontend.worker-common.util]))
#?(:cljs (:refer-clojure :exclude [format]))
#?(:cljs (:require [clojure.string :as string]
[goog.crypt :as crypt]

View File

@@ -4895,6 +4895,11 @@ hmac-drbg@^1.0.1:
minimalistic-assert "^1.0.0"
minimalistic-crypto-utils "^1.0.1"
hnswlib-wasm@^0.8.2:
version "0.8.2"
resolved "https://registry.yarnpkg.com/hnswlib-wasm/-/hnswlib-wasm-0.8.2.tgz#8b6a9534d99f23d30b1fd29ac7c45410ee5941c1"
integrity sha512-qEgKETj4rMOYRA1esP0bxVosw9Wrz5S/HvjI2FBWOXG5rf5/Es4OoEWGVvztFihDNU5if61l6QGhW5ILtt+PqA==
hoist-non-react-statics@^3.3.2:
version "3.3.2"
resolved "https://registry.yarnpkg.com/hoist-non-react-statics/-/hoist-non-react-statics-3.3.2.tgz#ece0acaf71d62c2969c2ec59feff42a4b1a85b45"
@@ -9022,7 +9027,7 @@ streamroller@^3.1.5:
debug "^4.3.4"
fs-extra "^8.1.0"
"string-width-cjs@npm:string-width@^4.2.0":
"string-width-cjs@npm:string-width@^4.2.0", "string-width@^1.0.2 || 2 || 3 || 4", string-width@^4.1.0, string-width@^4.2.0, string-width@^4.2.2, string-width@^4.2.3:
version "4.2.3"
resolved "https://registry.yarnpkg.com/string-width/-/string-width-4.2.3.tgz#269c7117d27b05ad2e536830a8ec895ef9c6d010"
integrity sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==
@@ -9040,15 +9045,6 @@ string-width@^1.0.1, string-width@^1.0.2:
is-fullwidth-code-point "^1.0.0"
strip-ansi "^3.0.0"
"string-width@^1.0.2 || 2 || 3 || 4", string-width@^4.1.0, string-width@^4.2.0, string-width@^4.2.2, string-width@^4.2.3:
version "4.2.3"
resolved "https://registry.yarnpkg.com/string-width/-/string-width-4.2.3.tgz#269c7117d27b05ad2e536830a8ec895ef9c6d010"
integrity sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==
dependencies:
emoji-regex "^8.0.0"
is-fullwidth-code-point "^3.0.0"
strip-ansi "^6.0.1"
string-width@^2.0.0, string-width@^2.1.1:
version "2.1.1"
resolved "https://registry.yarnpkg.com/string-width/-/string-width-2.1.1.tgz#ab93f27a8dc13d28cac815c462143a6d9012ae9e"
@@ -9116,7 +9112,7 @@ string_decoder@~1.1.1:
dependencies:
safe-buffer "~5.1.0"
"strip-ansi-cjs@npm:strip-ansi@^6.0.1":
"strip-ansi-cjs@npm:strip-ansi@^6.0.1", strip-ansi@^6.0.0, strip-ansi@^6.0.1:
version "6.0.1"
resolved "https://registry.yarnpkg.com/strip-ansi/-/strip-ansi-6.0.1.tgz#9e26c63d30f53443e9489495b2105d37b67a85d9"
integrity sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==
@@ -9137,13 +9133,6 @@ strip-ansi@^4.0.0:
dependencies:
ansi-regex "^3.0.0"
strip-ansi@^6.0.0, strip-ansi@^6.0.1:
version "6.0.1"
resolved "https://registry.yarnpkg.com/strip-ansi/-/strip-ansi-6.0.1.tgz#9e26c63d30f53443e9489495b2105d37b67a85d9"
integrity sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==
dependencies:
ansi-regex "^5.0.1"
strip-ansi@^7.0.1:
version "7.1.0"
resolved "https://registry.yarnpkg.com/strip-ansi/-/strip-ansi-7.1.0.tgz#d5b6568ca689d8561370b0707685d22434faff45"
@@ -10250,7 +10239,7 @@ wildcard@^2.0.1:
resolved "https://registry.yarnpkg.com/wildcard/-/wildcard-2.0.1.tgz#5ab10d02487198954836b6349f74fff961e10f67"
integrity sha512-CC1bOL87PIWSBhDcTrdeLo6eGT7mCFtrg0uIJtqJUFyK+eJnzl8A1niH56uu7KMa5XFrtiV+AQuHO3n7DsHnLQ==
"wrap-ansi-cjs@npm:wrap-ansi@^7.0.0":
"wrap-ansi-cjs@npm:wrap-ansi@^7.0.0", wrap-ansi@^7.0.0:
version "7.0.0"
resolved "https://registry.yarnpkg.com/wrap-ansi/-/wrap-ansi-7.0.0.tgz#67e145cff510a6a6984bdf1152911d69d2eb9e43"
integrity sha512-YVGIj2kamLSTxw6NsZjoBxfSwsn0ycdesmc4p+Q21c5zPuZ1pl+NfxVdxPtdHvmNVOQ6XSYG4AUtyt/Fi7D16Q==
@@ -10267,15 +10256,6 @@ wrap-ansi@^2.0.0:
string-width "^1.0.1"
strip-ansi "^3.0.1"
wrap-ansi@^7.0.0:
version "7.0.0"
resolved "https://registry.yarnpkg.com/wrap-ansi/-/wrap-ansi-7.0.0.tgz#67e145cff510a6a6984bdf1152911d69d2eb9e43"
integrity sha512-YVGIj2kamLSTxw6NsZjoBxfSwsn0ycdesmc4p+Q21c5zPuZ1pl+NfxVdxPtdHvmNVOQ6XSYG4AUtyt/Fi7D16Q==
dependencies:
ansi-styles "^4.0.0"
string-width "^4.1.0"
strip-ansi "^6.0.0"
wrap-ansi@^8.1.0:
version "8.1.0"
resolved "https://registry.yarnpkg.com/wrap-ansi/-/wrap-ansi-8.1.0.tgz#56dc22368ee570face1b49819975d9b9a5ead214"