Files
TiddlyWiki5/plugins/tiddlywiki/freelinks/aho-corasick.js
s793016 b0d99f3bd3 Fix Freelinks Aho-Corasick: failure links, cache invalidation, longest-match, and Unicode safety (#9676)
* Update aho-corasick.js

 fix transition logic; ensure complete outputs (via failure-output merge); clean up stats/build scoping; clarify CJK boundary behavior.

* Update text.js

implement global longest-match priority with overlap suppression; fix refresh invalidation to ignore $:/state and drafts; handle deletions precisely to avoid rebuilding on draft deletion; add defensive check for cached automaton presence.

* Update text.js

remove comment

* Update aho-corasick.js

remove comment

* Create #9672.tid

* Create #2026-0222.tid

* Delete editions/tw5.com/tiddlers/releasenotes/5.4.0/#2026-0222.tid

* Update text.js

remove \"

* Update and rename #9672.tid to #9676.tid

change to right number

* Update #9397.tid

update the existing release note with the new PR link instead of creating a new release note.

* Delete editions/tw5.com/tiddlers/releasenotes/5.4.0/#9676.tid

update the existing release note with the new PR link instead of creating a new release note.

* Rename #9397.tid to #9676.tid

update the existing release note with the new PR link instead of creating a new release note.

* Update and rename #9676.tid to #9397.tid

add link

* Rename #9397.tid to #9676.tid

* Update tiddlywiki.info

add plugin for test build

* Update tiddlywiki.info

Revert the change; ready to be merged.
2026-02-25 12:07:32 +01:00

191 lines
4.3 KiB
JavaScript

/*\
title: $:/core/modules/utils/aho-corasick.js
type: application/javascript
module-type: utils
Optimized Aho-Corasick string matching algorithm implementation with enhanced performance
and error handling for TiddlyWiki freelinking functionality.
- Uses WeakMap for failure links (required; plain object keys would collide).
- search() converts case per character to avoid Unicode index desync.
- Optional word boundary filtering: CJK always allowed; Latin requires non-word chars around.
\*/
"use strict";
/*
Regexes used by isWordBoundaryMatch(), hoisted so they are created once
instead of on every candidate match.
*/
var AHO_CJK_CHAR_REGEX = /[\u3400-\u9FFF\uF900-\uFAFF]/;
var AHO_LATIN_WORD_CHAR_REGEX = /[a-zA-Z0-9_\u00C0-\u00FF]/;

/*
Aho-Corasick multi-pattern matcher.

Data layout:
- trie: nested prototype-less objects; every own key is a single UTF-16 code
  unit edge leading to a child node. Nodes hold edges only, so any character
  (including "$") is a valid edge.
- outputs: WeakMap(node -> array of {pattern, index, length}) for the patterns
  that end exactly at that node.
- failure: WeakMap(node -> node reached by the longest proper suffix).
- outputLink: WeakMap(node -> nearest node on the failure chain that has
  outputs), so search() can report every pattern ending at a position
  without copying output arrays between nodes.
*/
function AhoCorasick() {
	this.trie = Object.create(null);
	this.failure = new WeakMap();
	this.outputs = new WeakMap();
	this.outputLink = new WeakMap();
	// Retained for backward compatibility only: failure chains always end at the
	// root, and truncating them dropped matches for long repetitive patterns.
	this.maxFailureDepth = 100;
	this.patternCount = 0;
	this.failureLinkCount = 0;
}

/*
Add a pattern to the trie.
pattern: non-empty string to match (anything else is ignored)
index: caller-defined identifier, reported back as `titleIndex` by search()
Call buildFailureLinks() after the last pattern has been added.
*/
AhoCorasick.prototype.addPattern = function(pattern, index) {
	if(typeof pattern !== "string" || pattern.length === 0) {
		return;
	}
	var node = this.trie;
	for(var i = 0; i < pattern.length; i++) {
		var ch = pattern[i];
		if(!node[ch]) {
			node[ch] = Object.create(null);
		}
		node = node[ch];
	}
	var terminal = this.outputs.get(node);
	if(!terminal) {
		terminal = [];
		this.outputs.set(node, terminal);
	}
	terminal.push({
		pattern: pattern,
		index: index,
		length: pattern.length
	});
	this.patternCount++;
};

/*
Compute failure links and output links with a breadth-first walk of the trie.
Safe to call repeatedly: both maps are rebuilt from scratch and the pattern
outputs themselves are never modified.
Throws an Error if more than max(100000, patternCount * 15) nodes would be processed.
*/
AhoCorasick.prototype.buildFailureLinks = function() {
	var root = this.trie,
		queue = [],
		head = 0,
		edge;
	this.failure = new WeakMap();
	this.outputLink = new WeakMap();
	this.failure.set(root, root);
	this.failureLinkCount = 1;
	for(edge in root) {
		this.failure.set(root[edge], root);
		this.failureLinkCount++;
		queue.push(root[edge]);
	}
	var maxNodes = Math.max(100000, this.patternCount * 15);
	// `head` walks the queue instead of shift(), which is O(n) per call
	while(head < queue.length) {
		if(head >= maxNodes) {
			throw new Error("Aho-Corasick: buildFailureLinks exceeded maximum nodes (" + maxNodes + ")");
		}
		var node = queue[head++];
		for(edge in node) {
			var child = node[edge];
			// Follow the parent's failure chain until a node with the same edge is
			// found. Each step moves to a strictly shallower node, so this terminates.
			var fail = this.failure.get(node) || root;
			while(fail !== root && !fail[edge]) {
				fail = this.failure.get(fail) || root;
			}
			var childFail = fail[edge] || root;
			this.failure.set(child, childFail);
			this.failureLinkCount++;
			// Shallower nodes are already finished (BFS order), so their output link is final
			var link = this.outputs.has(childFail) ? childFail : this.outputLink.get(childFail);
			if(link) {
				this.outputLink.set(child, link);
			}
			queue.push(child);
		}
	}
};

/*
Find all pattern occurrences in `text`.
text: string to scan (anything else, or an empty string, returns [])
useWordBoundary: when truthy, drop matches rejected by isWordBoundaryMatch()
ignoreCase: when truthy, lower-case each text character before matching
	(patterns are matched as stored, so the caller must add them lower-cased)
Returns an array of {pattern, index, length, titleIndex} ordered by end position;
for the same end position longer patterns come first. At most
min(text.length * 2, 10000) matches are returned.
*/
AhoCorasick.prototype.search = function(text, useWordBoundary, ignoreCase) {
	if(typeof text !== "string" || text.length === 0) {
		return [];
	}
	var matches = [],
		root = this.trie,
		node = root,
		textLength = text.length,
		maxMatches = Math.min(textLength * 2, 10000);
	// Stop scanning as soon as the match cap is reached
	for(var i = 0; i < textLength && matches.length < maxMatches; i++) {
		// Case is folded per character so that match indexes stay aligned with `text`
		var ch = ignoreCase ? text[i].toLowerCase() : text[i];
		while(node !== root && !node[ch]) {
			node = this.failure.get(node) || root;
		}
		if(node[ch]) {
			node = node[ch];
		}
		// Report the patterns ending at this node, then those ending at each suffix node
		var hit = this.outputs.has(node) ? node : this.outputLink.get(node);
		while(hit && matches.length < maxMatches) {
			var outputs = this.outputs.get(hit);
			for(var j = 0; j < outputs.length && matches.length < maxMatches; j++) {
				var out = outputs[j],
					matchStart = i - out.length + 1,
					matchEnd = i + 1;
				if(matchStart < 0) {
					continue;
				}
				if(useWordBoundary && !this.isWordBoundaryMatch(text, matchStart, matchEnd)) {
					continue;
				}
				matches.push({
					pattern: out.pattern,
					index: matchStart,
					length: out.length,
					titleIndex: out.index
				});
			}
			hit = this.outputLink.get(hit);
		}
	}
	return matches;
};

/*
Decide whether text[start, end) may be treated as a standalone word.
Returns true if the matched text contains a character in U+3400-U+9FFF or
U+F900-U+FAFF; otherwise true only if neither the character before `start`
nor the character at `end` is in [a-zA-Z0-9_] or U+00C0-U+00FF.
*/
AhoCorasick.prototype.isWordBoundaryMatch = function(text, start, end) {
	if(AHO_CJK_CHAR_REGEX.test(text.substring(start, end))) {
		return true;
	}
	var beforeChar = start > 0 ? text[start - 1] : "",
		afterChar = end < text.length ? text[end] : "";
	return !AHO_LATIN_WORD_CHAR_REGEX.test(beforeChar) && !AHO_LATIN_WORD_CHAR_REGEX.test(afterChar);
};

/*
Remove all patterns and links, returning the matcher to its initial state.
*/
AhoCorasick.prototype.clear = function() {
	this.trie = Object.create(null);
	this.failure = new WeakMap();
	this.outputs = new WeakMap();
	this.outputLink = new WeakMap();
	this.patternCount = 0;
	this.failureLinkCount = 0;
};

/*
Return size statistics:
nodeCount: number of trie nodes, including the root
patternCount: number of patterns added (duplicates are counted)
failureLinks: number of failure links created by the last buildFailureLinks() call
*/
AhoCorasick.prototype.getStats = function() {
	var nodeCount = 0,
		stack = [this.trie];
	// Explicit stack instead of recursion, so long patterns cannot exhaust the call stack
	while(stack.length > 0) {
		var node = stack.pop();
		nodeCount++;
		for(var key in node) {
			stack.push(node[key]);
		}
	}
	return {
		nodeCount: nodeCount,
		patternCount: this.patternCount,
		failureLinks: this.failureLinkCount
	};
};
// Attach the AhoCorasick constructor to this module's `exports` object
exports.AhoCorasick = AhoCorasick;