mirror of
https://github.com/TiddlyWiki/TiddlyWiki5.git
synced 2026-04-29 18:46:52 +00:00
* Update aho-corasick.js fix transition logic; ensure complete outputs (via failure-output merge); clean up stats/build scoping; clarify CJK boundary behavior. * Update text.js implement global longest-match priority with overlap suppression; fix refresh invalidation to ignore $:/state and drafts; handle deletions precisely to avoid rebuilding on draft deletion; add defensive check for cached automaton presence. * Update text.js remove comment * Update aho-corasick.js remove comment * Create #9672.tid * Create #2026-0222.tid * Delete editions/tw5.com/tiddlers/releasenotes/5.4.0/#2026-0222.tid * Update text.js remove \" * Update and rename #9672.tid to #9676.tid change to right number * Update #9397.tid update the existing release note with the new PR link instead of creating a new release note. * Delete editions/tw5.com/tiddlers/releasenotes/5.4.0/#9676.tid update the existing release note with the new PR link instead of creating a new release note. * Rename #9397.tid to #9676.tid update the existing release note with the new PR link instead of creating a new release note. * Update and rename #9676.tid to #9397.tid add link * Rename #9397.tid to #9676.tid * Update tiddlywiki.info add plugin for test build * Update tiddlywiki.info reverse change, ready to be merge.
191 lines
4.3 KiB
JavaScript
191 lines
4.3 KiB
JavaScript
/*\
|
|
|
|
title: $:/core/modules/utils/aho-corasick.js
|
|
type: application/javascript
|
|
module-type: utils
|
|
|
|
Optimized Aho-Corasick string matching algorithm implementation with enhanced performance
|
|
and error handling for TiddlyWiki freelinking functionality.
|
|
|
|
- Uses WeakMap for failure links (required; plain object keys would collide).
|
|
- search() converts case per character to avoid Unicode index desync.
|
|
- Optional word boundary filtering: CJK always allowed; Latin requires non-word chars around.
|
|
|
|
\*/
|
|
|
|
"use strict";
|
|
|
|
function AhoCorasick() {
|
|
this.trie = {};
|
|
this.failure = new WeakMap();
|
|
this.maxFailureDepth = 100;
|
|
this.patternCount = 0;
|
|
}
|
|
|
|
AhoCorasick.prototype.addPattern = function(pattern, index) {
|
|
if(!pattern || typeof pattern !== "string" || pattern.length === 0) {
|
|
return;
|
|
}
|
|
var node = this.trie;
|
|
for(var i = 0; i < pattern.length; i++) {
|
|
var ch = pattern[i];
|
|
if(!node[ch]) {
|
|
node[ch] = {};
|
|
}
|
|
node = node[ch];
|
|
}
|
|
if(!node.$) {
|
|
node.$ = [];
|
|
}
|
|
node.$.push({
|
|
pattern: pattern,
|
|
index: index,
|
|
length: pattern.length
|
|
});
|
|
this.patternCount++;
|
|
};
|
|
|
|
AhoCorasick.prototype.buildFailureLinks = function() {
|
|
var queue = [];
|
|
var root = this.trie;
|
|
var self = this;
|
|
|
|
this.failure = new WeakMap();
|
|
this.failure.set(root, root);
|
|
|
|
for(var ch in root) {
|
|
if(ch === "$") continue;
|
|
if(root[ch] && typeof root[ch] === "object") {
|
|
this.failure.set(root[ch], root);
|
|
queue.push(root[ch]);
|
|
}
|
|
}
|
|
|
|
var processedNodes = 0;
|
|
var maxNodes = Math.max(100000, this.patternCount * 15);
|
|
|
|
while(queue.length > 0) {
|
|
if(processedNodes++ >= maxNodes) {
|
|
throw new Error("Aho-Corasick: buildFailureLinks exceeded maximum nodes (" + maxNodes + ")");
|
|
}
|
|
var node = queue.shift();
|
|
|
|
for(var edge in node) {
|
|
if(edge === "$") continue;
|
|
var child = node[edge];
|
|
if(!child || typeof child !== "object") continue;
|
|
|
|
var fail = self.failure.get(node) || root;
|
|
var depth = 0;
|
|
|
|
while(fail !== root && !fail[edge] && depth < self.maxFailureDepth) {
|
|
fail = self.failure.get(fail) || root;
|
|
depth++;
|
|
}
|
|
|
|
var nextFail = (fail[edge] && fail[edge] !== child) ? fail[edge] : root;
|
|
self.failure.set(child, nextFail);
|
|
|
|
if(nextFail.$) {
|
|
if(!child.$) child.$ = [];
|
|
child.$ = child.$.concat(nextFail.$);
|
|
}
|
|
|
|
queue.push(child);
|
|
}
|
|
}
|
|
};
|
|
|
|
AhoCorasick.prototype.search = function(text, useWordBoundary, ignoreCase) {
|
|
if(!text || typeof text !== "string" || text.length === 0) {
|
|
return [];
|
|
}
|
|
|
|
var matches = [];
|
|
var node = this.trie;
|
|
var root = this.trie;
|
|
var textLength = text.length;
|
|
|
|
var maxMatches = Math.min(textLength * 2, 10000);
|
|
|
|
for(var i = 0; i < textLength; i++) {
|
|
var ch = ignoreCase ? text[i].toLowerCase() : text[i];
|
|
|
|
while(node !== root && !node[ch]) {
|
|
node = this.failure.get(node) || root;
|
|
}
|
|
if(node[ch]) {
|
|
node = node[ch];
|
|
}
|
|
|
|
if(node.$) {
|
|
var outputs = node.$;
|
|
for(var j = 0; j < outputs.length && matches.length < maxMatches; j++) {
|
|
var out = outputs[j];
|
|
var matchStart = i - out.length + 1;
|
|
var matchEnd = i + 1;
|
|
if(matchStart < 0) continue;
|
|
|
|
if(useWordBoundary && !this.isWordBoundaryMatch(text, matchStart, matchEnd)) {
|
|
continue;
|
|
}
|
|
|
|
matches.push({
|
|
pattern: out.pattern,
|
|
index: matchStart,
|
|
length: out.length,
|
|
titleIndex: out.index
|
|
});
|
|
}
|
|
}
|
|
}
|
|
|
|
return matches;
|
|
};
|
|
|
|
AhoCorasick.prototype.isWordBoundaryMatch = function(text, start, end) {
|
|
var matchedText = text.substring(start, end);
|
|
|
|
if(/[\u3400-\u9FFF\uF900-\uFAFF]/.test(matchedText)) {
|
|
return true;
|
|
}
|
|
|
|
var beforeChar = start > 0 ? text[start - 1] : "";
|
|
var afterChar = end < text.length ? text[end] : "";
|
|
|
|
var isLatinWordChar = function(char) {
|
|
return /[a-zA-Z0-9_\u00C0-\u00FF]/.test(char);
|
|
};
|
|
|
|
return !isLatinWordChar(beforeChar) && !isLatinWordChar(afterChar);
|
|
};
|
|
|
|
AhoCorasick.prototype.clear = function() {
|
|
this.trie = {};
|
|
this.failure = new WeakMap();
|
|
this.patternCount = 0;
|
|
};
|
|
|
|
AhoCorasick.prototype.getStats = function() {
|
|
var nodeCount = 0;
|
|
function countNodes(node) {
|
|
if(!node) return;
|
|
nodeCount++;
|
|
for(var key in node) {
|
|
if(key === "$") continue;
|
|
if(node[key] && typeof node[key] === "object") {
|
|
countNodes(node[key]);
|
|
}
|
|
}
|
|
}
|
|
countNodes(this.trie);
|
|
|
|
return {
|
|
nodeCount: nodeCount,
|
|
patternCount: this.patternCount,
|
|
failureLinks: this.patternCount
|
|
};
|
|
};
|
|
|
|
exports.AhoCorasick = AhoCorasick;
|