enhance(mobile): auto-detect audio language

This commit is contained in:
Tienson Qin
2025-09-25 17:03:22 +08:00
parent 83474008ab
commit 13f510ca0c
4 changed files with 476 additions and 387 deletions

View File

@@ -8,6 +8,7 @@
import Capacitor
import Foundation
import Speech
import NaturalLanguage
func isDarkMode() -> Bool {
if #available(iOS 12.0, *) {
@@ -209,55 +210,185 @@ public class UILocalPlugin: CAPPlugin, CAPBridgedPlugin {
CAPPluginMethod(name: "transcribeAudio2Text", returnType: CAPPluginReturnPromise)
]
func recognizeSpeech(from file: URL, locale: String, completion: @escaping (String?, Error?) -> Void) {
if #available(iOS 26.0, *) {
// Modern API: SpeechTranscriber + SpeechAnalyzer
@available(iOS 26.0, *)
func recognizeWithAutoLocale(from file: URL,
completion: @escaping (String?, Error?) -> Void) {
Task {
do {
print("debug locale \(locale)")
// Step 1: pick supported locale
guard let supportedLocale = await SpeechTranscriber.supportedLocale(equivalentTo: Locale(identifier: locale)) else {
throw NSError(domain: "Speech", code: -1,
userInfo: [NSLocalizedDescriptionKey: "Unsupported locale"])
// ---------- STEP 1: Gather candidate locales ----------
let preferred = Array(Locale.preferredLanguages.prefix(3))
var candidateIDs = preferred
if !candidateIDs.contains(where: { $0.hasPrefix("en") }) {
candidateIDs.append("en-US")
}
if !candidateIDs.contains(where: { $0.hasPrefix("zh") }) {
candidateIDs.append("zh-CN")
}
// Step 2: transcriber with transcription preset
let transcriber = SpeechTranscriber(locale: supportedLocale, preset: .transcription)
// ---------- STEP 2: Probe candidates in parallel ----------
var results: [(Locale, String)] = []
// Ensure assets (downloads model if needed)
if let installRequest = try await AssetInventory.assetInstallationRequest(supporting: [transcriber]) {
try await installRequest.downloadAndInstall()
await withTaskGroup(of: (Locale, String).self) { group in
for id in candidateIDs {
let candidate = Locale(identifier: id)
if let supported = await SpeechTranscriber.supportedLocale(equivalentTo: candidate) {
group.addTask {
let text = (try? await self.quickSampleTranscription(file: file, locale: supported)) ?? ""
return (supported, text)
}
}
}
for await (locale, text) in group {
results.append((locale, text))
}
}
// Step 3: collect transcription results async
async let transcriptionFuture: String = try transcriber.results.reduce(into: "") { partial, result in
partial += String(result.text.characters) + " "
// ---------- STEP 3: Score results ----------
var bestLocale: Locale = Locale(identifier: "en-US")
var bestScore = Int.min
for (locale, text) in results {
let score = scoreTranscript(text, locale: locale)
print("📊 Candidate: \(locale.identifier), score: \(score), text: \(text)")
if score > bestScore {
bestScore = score
bestLocale = locale
}
}
print("🎙 Running full transcription with locale: \(bestLocale.identifier)")
// ---------- STEP 4: Full transcription ----------
let transcriber = SpeechTranscriber(locale: bestLocale, preset: .transcription)
if let req = try await AssetInventory.assetInstallationRequest(supporting: [transcriber]) {
try await req.downloadAndInstall()
print("✅ Model installed for \(bestLocale.identifier)")
}
let collectFullTask = Task { () -> String in
var full = ""
do {
for try await r in transcriber.results {
full += String(r.text.characters) + " "
}
} catch {}
return full
}
// Step 4: analyzer
let analyzer = SpeechAnalyzer(modules: [transcriber])
// Step 5/6: run analysis from file
let audioFile = try AVAudioFile(forReading: file)
if let lastSample = try await analyzer.analyzeSequence(from: audioFile) {
try await analyzer.finalizeAndFinish(through: lastSample)
let audio = try AVAudioFile(forReading: file)
if let last = try await analyzer.analyzeSequence(from: audio) {
try await analyzer.finalizeAndFinish(through: last)
} else {
try await analyzer.cancelAndFinishNow()
}
// Step 7/8: wait for transcription
let finalText = try await transcriptionFuture.trimmingCharacters(in: .whitespacesAndNewlines)
completion(finalText, nil)
let finalText = (await collectFullTask.value)
.trimmingCharacters(in: .whitespacesAndNewlines)
completion(finalText.isEmpty ? nil : finalText, nil)
} catch {
completion(nil, error)
}
}
}
/// Runs a short, partial transcription of `file` using `locale`, producing a
/// small text sample used to score candidate locales (see `scoreTranscript`).
/// Collects at most 3 result chunks, then cancels the analyzer so the probe
/// stays cheap; callers run several of these in parallel.
/// - Parameters:
///   - file: URL of the recorded audio file to probe.
///   - locale: The candidate locale to transcribe with (must already be a
///     `SpeechTranscriber.supportedLocale` — the caller checks this).
/// - Returns: The concatenated sample text, whitespace-trimmed (may be empty).
/// - Throws: Errors from model installation, `AVAudioFile` reading, or analysis.
@available(iOS 26.0, *)
private func quickSampleTranscription(file: URL, locale: Locale) async throws -> String {
    let transcriber = SpeechTranscriber(locale: locale, preset: .transcription)
    // Install models if needed (you could cache this across runs)
    if let req = try await AssetInventory.assetInstallationRequest(supporting: [transcriber]) {
        try await req.downloadAndInstall()
    }
    // NOTE(review): `sample` and `count` are mutable locals captured by the
    // collection Task below; this looks safe only because they are touched
    // solely inside that single Task — confirm under Swift 6 strict concurrency.
    var sample = ""
    var count = 0
    let analyzer = SpeechAnalyzer(modules: [transcriber])
    // Consume transcriber results concurrently while the analyzer (below)
    // feeds it audio from the file.
    let collectTask = Task { () -> String in
        do {
            for try await r in transcriber.results {
                sample += String(r.text.characters) + " "
                count += 1
                if count >= 3 {
                    // Early exit: stop once we have enough
                    // NOTE(review): cancelling here may make the concurrent
                    // analyzeSequence call below throw a cancellation error,
                    // which would propagate out of this function even though a
                    // usable sample exists — TODO confirm and handle if so.
                    try? await analyzer.cancelAndFinishNow()
                    break
                }
            }
        } catch {}  // best-effort probe: swallow stream errors, keep partial text
        return sample
    }
    // Feed the whole file through the analyzer; `last` is the final sample
    // position when any audio was processed.
    let audioFile = try AVAudioFile(forReading: file)
    if let last = try await analyzer.analyzeSequence(from: audioFile) {
        try await analyzer.finalizeAndFinish(through: last)
    } else {
        // Nothing analyzable: tear down so collectTask's result stream ends.
        try await analyzer.cancelAndFinishNow()
    }
    return await collectTask.value.trimmingCharacters(in: .whitespacesAndNewlines)
}
/// Heuristically scores how plausible `text` is as a transcription produced
/// with `locale`. The caller probes several candidate locales and keeps the
/// one with the highest score.
/// - Parameters:
///   - text: Sample transcript produced by `quickSampleTranscription` for this candidate.
///   - locale: The candidate locale that produced `text`.
/// - Returns: A score combining text length, script/locale agreement, Latin
///   ratio, and a bias toward the user's preferred languages.
private func scoreTranscript(_ text: String, locale: Locale) -> Int {
    // Normalize: keep only letters/digits/scripts (ignore punctuation)
    let normalized = text.unicodeScalars.filter {
        CharacterSet.letters.contains($0) ||
        CharacterSet.decimalDigits.contains($0) ||
        CharacterSet(charactersIn: "\u{4E00}"..."\u{9FFF}").contains($0) || // Han
        CharacterSet(charactersIn: "\u{3040}"..."\u{30FF}").contains($0) || // Kana
        CharacterSet(charactersIn: "\u{AC00}"..."\u{D7AF}").contains($0) || // Hangul
        CharacterSet(charactersIn: "\u{0400}"..."\u{04FF}").contains($0) || // Cyrillic
        CharacterSet(charactersIn: "\u{0600}"..."\u{06FF}").contains($0) || // Arabic
        CharacterSet(charactersIn: "\u{0590}"..."\u{05FF}").contains($0) || // Hebrew
        CharacterSet(charactersIn: "\u{0900}"..."\u{097F}").contains($0) // Devanagari
    }
    let coreText = String(String.UnicodeScalarView(normalized))
    // Base score: more recognized characters = more confident transcription.
    var score = coreText.count

    // Detect script presence via Unicode scalar ranges.
    // BUG FIX: the previous regexes (e.g. #"\u3040-\u30FF"#) were not character
    // classes — ICU reads that pattern as the literal 3-char sequence
    // U+3040, "-", U+30FF, and the "-" had already been stripped by the
    // normalization above, so hasKana/hasHangul/etc. could never be true.
    let scalars = coreText.unicodeScalars
    func containsScalar(in range: ClosedRange<UInt32>) -> Bool {
        scalars.contains { range.contains($0.value) }
    }
    let hasHan = containsScalar(in: 0x4E00...0x9FFF)
    let hasKana = containsScalar(in: 0x3040...0x30FF)
    let hasHangul = containsScalar(in: 0xAC00...0xD7AF)
    let hasCyrillic = containsScalar(in: 0x0400...0x04FF)
    let hasArabic = containsScalar(in: 0x0600...0x06FF)
    let hasHebrew = containsScalar(in: 0x0590...0x05FF)
    let hasDevanag = containsScalar(in: 0x0900...0x097F)

    // Latin ratio detection: mostly-ASCII text suggests English over CJK locales.
    let latinLetters = coreText.filter { $0.isASCII && $0.isLetter }.count
    let latinRatio = coreText.isEmpty ? 0.0 : Double(latinLetters) / Double(coreText.count)
    if latinRatio > 0.7 {
        if locale.identifier.hasPrefix("en") {
            score += 500
        } else if locale.identifier.hasPrefix("zh")
            || locale.identifier.hasPrefix("ja")
            || locale.identifier.hasPrefix("ko") {
            score -= 500
        }
    }

    // Reward a script that matches the candidate locale; penalize a mismatch.
    // Han is shared by Chinese and Japanese (kanji), hence the double prefix check.
    if hasHan { score += locale.identifier.hasPrefix("zh") || locale.identifier.hasPrefix("ja") ? 1000 : -500 }
    if hasKana { score += locale.identifier.hasPrefix("ja") ? 1000 : -500 }
    if hasHangul { score += locale.identifier.hasPrefix("ko") ? 1000 : -500 }
    if hasCyrillic { score += locale.identifier.hasPrefix("ru") ? 1000 : -500 }
    if hasArabic { score += locale.identifier.hasPrefix("ar") ? 1000 : -500 }
    if hasHebrew { score += locale.identifier.hasPrefix("he") ? 1000 : -500 }
    if hasDevanag { score += locale.identifier.hasPrefix("hi") ? 1000 : -500 }

    // Bias toward user-preferred languages
    if Locale.preferredLanguages.contains(where: { locale.identifier.hasPrefix($0.prefix(2)) }) {
        score += 200
    }
    return score
}
@available(iOS 26.0, *)
@objc func transcribeAudio2Text(_ call: CAPPluginCall) {
self.call = call
@@ -267,11 +398,6 @@ public class UILocalPlugin: CAPPlugin, CAPBridgedPlugin {
return
}
guard let locale = call.getString("locale") else {
call.reject("invalid locale")
return
}
let audioData = Data(audioArray)
let fileURL = FileManager.default.temporaryDirectory.appendingPathComponent("recordedAudio.m4a")
@@ -287,7 +413,7 @@ public class UILocalPlugin: CAPPlugin, CAPBridgedPlugin {
return
}
self.recognizeSpeech(from: fileURL, locale: locale) { result, error in
self.recognizeWithAutoLocale(from: fileURL) { result, error in
if let result = result {
call.resolve(["transcription": result])
} else if let error = error {

View File

@@ -6,7 +6,6 @@
[logseq.common.util :as common-util]
[promesa.core :as p])))
(comment
#?(:cljs
(defn throw-err
[v]
@@ -14,7 +13,7 @@
(defmacro <?
[port]
`(throw-err (cljs.core.async/<! ~port))))
`(throw-err (cljs.core.async/<! ~port)))
#?(:cljs
(defn c->p

View File

@@ -1,16 +1,25 @@
(ns frontend.db.transact
"Provides async transact for use with ldb/transact!"
(:require [frontend.state :as state]
(:require [clojure.core.async :as async]
[clojure.core.async.interop :refer [p->c]]
[frontend.common.async-util :include-macros true :refer [<?]]
[frontend.state :as state]
[frontend.util :as util]
[lambdaisland.glogi :as log]
[logseq.outliner.op :as outliner-op]
[promesa.core :as p]))
(defn worker-call
[request-f]
(p/let [result (request-f)]
;; yields to ensure ui db to be updated before resolved
(p/delay 0)
result))
(let [response (p/deferred)]
(async/go
(let [result (<? (p->c (request-f)))]
(if (:ex-data result)
(do
(log/error :worker-request-failed result)
(p/reject! response result))
(p/resolve! response result))))
response))
(defn transact [worker-transact repo tx-data tx-meta]
(let [tx-meta' (assoc tx-meta

View File

@@ -11,7 +11,7 @@
[frontend.state :as state]
[frontend.util :as util]
[goog.functions :as gfun]
[logseq.client.logging :as log]
[lambdaisland.glogi :as log]
[logseq.shui.hooks :as hooks]
[logseq.shui.ui :as shui]
[mobile.init :as init]
@@ -20,7 +20,9 @@
[rum.core :as rum]))
(defonce audio-file-format "yyyy-MM-dd HH:mm:ss")
(def audio-length-limit 10) ; 10 minutes
(defonce *transcribe? (atom false))
(def *last-edit-block (atom nil))
(defn set-last-edit-block! [block] (reset! *last-edit-block block))
@@ -32,18 +34,6 @@
(str (.padStart (str minutes) 2 "0") ":"
(.padStart (str seconds) 2 "0"))))
;; Resolves the device's language tag (via the Capacitor Device plugin) into
;; the underscore-separated locale string used by native speech transcription,
;; e.g. "en-US" -> "en_US". Falls back to "en_US" if the plugin call fails.
(defn- get-locale
  []
  (->
   (p/let [^js lang (.getLanguageTag ^js Device)
           value (.-value lang)]
     ;; NOTE(review): this compares against "en_CN" BEFORE the "-" -> "_"
     ;; replacement below; if getLanguageTag returns BCP-47 tags ("en-CN"),
     ;; this branch may never fire — verify against the Device plugin output.
     (if (= value "en_CN")
       "zh"
       (string/replace value "-" "_")))
   (p/catch (fn [e]
              (log/error :get-locale-error e)
              "en_US"))))
(defn- >ios-26
[]
(p/let [^js info (.getInfo ^js Device)
@@ -54,7 +44,7 @@
(and (= os "ios") (>= major 26))))
(defn save-asset-audio!
[blob locale]
[blob transcribe?]
(let [ext (some-> blob
(.-type)
(string/split ";")
@@ -76,11 +66,12 @@
[file]
{:last-edit-block @*last-edit-block})
asset-entity (first result)]
(when (and asset-entity (util/ios?))
(when (nil? asset-entity)
(log/error ::empty-asset-entity {}))
(when (and asset-entity transcribe?)
(p/let [buffer-data (.arrayBuffer blob)
unit8-data (js/Uint8Array. buffer-data)]
(-> (.transcribeAudio2Text mobile-util/ui-local #js {:audioData (js/Array.from unit8-data)
:locale locale})
(-> (.transcribeAudio2Text mobile-util/ui-local #js {:audioData (js/Array.from unit8-data)})
(p/then (fn [^js r]
(let [content (.-transcription r)]
(when-not (string/blank? content)
@@ -92,14 +83,16 @@
(p/catch #(log/error :transcribe-audio-error %)))))))))
(rum/defc record-button
[*locale]
[]
(let [*timer-ref (hooks/use-ref nil)
*save? (hooks/use-ref nil)
[*recorder _] (hooks/use-state (atom nil))
[locale set-locale!] (hooks/use-state nil)]
[*save? _] (hooks/use-state (atom nil))]
(hooks/use-effect!
(fn []
(when-not @*transcribe?
(p/let [transcribe? (>ios-26)]
(reset! *transcribe? transcribe?)))
(let [^js node (js/document.getElementById "wave-container")
^js wave-l (.querySelector node ".wave-left")
^js wave-r (.querySelector node ".wave-right")
@@ -121,8 +114,8 @@
(.start w1)
(.start w2)))
(.on "record-end" (fn [^js blob]
(when (true? (rum/deref *save?))
(save-asset-audio! blob @*locale))
(when @*save?
(save-asset-audio! blob @*transcribe?))
(mobile-state/close-popup!)))
(.on "record-progress" (gfun/throttle
(fn [time]
@@ -160,60 +153,22 @@
(shui/button {:variant :outline
:class "record-ctrl-btn rounded-full recording"
:on-click (fn []
(rum/set-ref! *save? true)
(reset! *save? true)
(.stopRecording ^js @*recorder))}
(shui/tabler-icon "player-stop" {:size 22}))]]
(when locale
(when-not (string/starts-with? locale "en_")
(shui/button {:variant :outline
:on-click (fn []
(reset! *locale "en_US")
(set-locale! "en_US"))}
"English transcribe")))]))
(shui/tabler-icon "player-stop" {:size 22}))]]]))
(rum/defc audio-recorder-aux < rum/static
[]
(let [[locale set-locale!] (hooks/use-state nil)
[system-locale set-system-locale!] (hooks/use-state nil)
[*locale] (hooks/use-state (atom nil))
[transcribe-supported? set-transcribe-supported!] (hooks/use-state false)]
(hooks/use-effect!
(fn []
(p/let [locale (get-locale)
transcribe-supported? >ios-26]
(set-transcribe-supported! transcribe-supported?)
(set-locale! locale)
(set-system-locale! locale)
(reset! *locale locale)))
[])
[:div.app-audio-recorder
[:div.flex.flex-row.justify-between.items-center.font-medium
[:div.opacity-70 (date/get-date-time-string (tl/local-now) {:formatter-str "yyyy-MM-dd"})]
(when transcribe-supported?
(if (and locale (not (string/starts-with? system-locale "en_")))
(let [en? (string/starts-with? locale "en_")]
(shui/button
{:variant (if en? :default :outline)
:class (str "rounded-full " (if en? "opacity-100" "opacity-70"))
:on-click (fn []
(reset! *locale "en_US")
(set-locale! "en_US"))}
"EN transcribe"))
;; hack: same height with en transcribe button
(shui/button
{:variant :outline
:class "rounded-full opacity-0"}
"EN transcribe")))]
[:div.opacity-70 (date/get-date-time-string (tl/local-now) {:formatter-str "yyyy-MM-dd"})]]
[:div#wave-container.app-wave-container
[:div.app-wave-needle]
[:div.wave-left]
[:div.wave-right.mirror]]
(record-button *locale)]))
(record-button)])
(defn- show-recorder
[]