mirror of
https://github.com/TiddlyWiki/TiddlyWiki5.git
synced 2026-05-04 09:47:02 +00:00
* Update aho-corasick.js False positive matches Symptom: Words like "it is", "Choose", "Set up" are incorrectly linked to tiddler "FooBar" when a tiddler titled "xxx x FooBar" exists. Root cause: The Aho-Corasick algorithm's output merging mechanism in buildFailureLinks caused failure link outputs to be incorrectly merged into intermediate nodes, resulting in false matches. Fix: Remove incorrect output merging in buildFailureLinks Implement proper output collection during search by traversing the failure link chain Add exact match validation: verify that the matched text exactly equals the pattern before accepting it Add cycle detection to prevent infinite loops in failure link traversal * Update text.js First character disappearing Symptom: When freelinking is enabled, the first character of matched words disappears (e.g., "The" becomes "he", "Filter" becomes "ilter"). Root cause: When the current tiddler's title was being filtered out, it was done too late in the process (during parse tree construction), causing text rendering issues. Fix: Move the current tiddler title filtering to the match validation stage (in processTextWithMatches) Use substring instead of slice for better stability Add proper case-insensitive comparison for title matching * Update text.js add back description * Update aho-corasick.js add back description * Update tiddlywiki.info add freelinks plugin for testing * Update tiddlywiki.info restore * Update tiddlywiki.info add freelinks plugin for test * Update aho-corasick.js erase comment * Update text.js erase comment * Update aho-corasick.js add back some commets * Update aho-corasick.js clean comment * change note #9397 change note #9397 * Update tiddlywiki.info reversed to original * Update #9397.tid update detail * Update #9397.tid another link added * Update #9397.tid add "release: 5.4.0" * Update #9397.tid some format modified
254 lines
7.0 KiB
JavaScript
254 lines
7.0 KiB
JavaScript
/*\
|
|
title: $:/core/modules/utils/aho-corasick.js
|
|
type: application/javascript
|
|
module-type: utils
|
|
|
|
Optimized Aho-Corasick string matching algorithm implementation with enhanced performance and error handling for TiddlyWiki freelinking functionality.
|
|
|
|
Usage:
|
|
|
|
Initialization:
|
|
Create an AhoCorasick instance: var ac = new AhoCorasick();
|
|
After initialization, the trie and failure structures are automatically created to store patterns and failure links.
|
|
|
|
Adding Patterns:
|
|
Call addPattern(pattern, index) to add a pattern, e.g., ac.addPattern("[[Link]]", 0);.
|
|
pattern is the string to match, and index is an identifier for tracking results.
|
|
Multiple patterns can be added, stored in the trie structure.
|
|
|
|
Building Failure Links:
|
|
Call buildFailureLinks() to construct failure links for efficient multi-pattern matching.
|
|
Includes a maximum node limit (the larger of 100,000 and 15 times the pattern count) to prevent excessive computation; an Error is thrown if the limit is exceeded.
|
|
|
|
Performing Search:
|
|
Use search(text, useWordBoundary) to find pattern matches in the text.
|
|
text is the input string, and useWordBoundary (boolean) controls whether to enforce word boundary checks.
|
|
Returns an array of match results, each containing pattern (matched pattern), index (start position), length (pattern length), and titleIndex (pattern identifier).
|
|
|
|
Word Boundary Check:
|
|
If useWordBoundary is true, a match is returned only when the characters immediately before and after it are not word characters (letters, digits, or underscores); the start and end of the text also count as boundaries.
|
|
|
|
Cleanup and Statistics:
|
|
Use clear() to reset the trie and failure links, freeing memory.
|
|
Use getStats() to retrieve statistics, including node count (nodeCount), pattern count (patternCount), and failure link count (failureLinks).
|
|
|
|
Notes
|
|
Performance Considerations: The Aho-Corasick trie may consume significant memory with a large number of patterns. Limit the number of patterns (e.g., <10,000) for optimal performance.
|
|
Error Handling: The module includes maximum node and failure depth limits (maxFailureDepth) to prevent infinite loops or memory overflow.
|
|
Word Boundary: Enabling useWordBoundary ensures more precise matches, ideal for link detection scenarios.
|
|
Compatibility: Ensure compatibility with other TiddlyWiki modules (e.g., wikiparser.js) when processing WikiText.
|
|
Debugging: Use getStats() to inspect the trie structure's size and ensure it does not overload browser memory.
|
|
|
|
\*/
|
|
|
|
"use strict";
|
|
|
|
/*
Characters treated as part of a word by isWordBoundaryMatch: ASCII letters, digits,
underscore and the Latin-1 range U+00C0-U+00FF. Hoisted so the expression is only
created once instead of on every boundary check.
*/
var WORD_CHAR_PATTERN = /[a-zA-Z0-9_\u00C0-\u00FF]/;

/*
Create an empty trie node.

children: prototype-less object mapping a single character to the child node
outputs: null, or an array of {pattern, index, length} records for the patterns ending here

Keeping the outputs separate from the children means that no character (including "$")
can collide with the bookkeeping stored on a node.
*/
function createTrieNode() {
	return {
		children: Object.create(null),
		outputs: null
	};
}

/*
Multi-pattern string matcher.

trie: root node of the pattern trie
failure: Map from a node to the node for its longest proper suffix that is also in the trie
outputLinks: Map from a node to the nearest node on its failure chain that ends a pattern
maxFailureDepth: upper bound on the number of failure links followed for one character
patternCount: number of patterns accepted by addPattern

The link tables are Maps because plain object properties coerce their keys to strings,
which would make every node share the single key "[object Object]".
*/
function AhoCorasick() {
	this.trie = createTrieNode();
	this.failure = new Map();
	this.outputLinks = new Map();
	this.maxFailureDepth = 100;
	this.patternCount = 0;
}

/*
Add a pattern to the trie.

pattern: non-empty string to match; anything else is silently ignored
index: caller supplied identifier, reported as titleIndex in search results

buildFailureLinks() must be called (again) after the last pattern has been added.
*/
AhoCorasick.prototype.addPattern = function(pattern, index) {
	if(typeof pattern !== "string" || pattern.length === 0) {
		return;
	}
	var node = this.trie;
	for(var i = 0; i < pattern.length; i++) {
		var ch = pattern[i];
		if(!node.children[ch]) {
			node.children[ch] = createTrieNode();
		}
		node = node.children[ch];
	}
	if(!node.outputs) {
		node.outputs = [];
	}
	node.outputs.push({
		pattern: pattern,
		index: index,
		length: pattern.length
	});
	this.patternCount++;
};

/*
Compute the failure and output links for every node with a breadth-first walk of the trie.

Throws an Error if the trie holds more nodes than the limit (the larger of 100,000 and
15 times the pattern count) allows to be processed.
*/
AhoCorasick.prototype.buildFailureLinks = function() {
	var root = this.trie,
		failure = new Map(),
		outputLinks = new Map(),
		maxDepth = this.maxFailureDepth,
		maxNodes = Math.max(100000, this.patternCount * 15),
		queue = [],
		head = 0,
		ch;
	// Start from fresh tables so that links belonging to a previous build cannot linger
	this.failure = failure;
	this.outputLinks = outputLinks;
	// Nodes directly below the root always fall back to the root
	for(ch in root.children) {
		failure.set(root.children[ch],root);
		queue.push(root.children[ch]);
	}
	// The queue is consumed through an index because Array.prototype.shift is O(n)
	while(head < queue.length && head < maxNodes) {
		var node = queue[head++];
		for(ch in node.children) {
			var child = node.children[ch],
				fallback = failure.get(node),
				depth = 0;
			// Walk up the suffix chain until a node that can consume ch is found
			while(fallback !== root && !fallback.children[ch] && depth < maxDepth) {
				fallback = failure.get(fallback);
				depth++;
			}
			var target = fallback.children[ch] || root;
			failure.set(child,target);
			// target is shallower than child, so its own output link is already known
			var nearestOutput = target.outputs ? target : outputLinks.get(target);
			if(nearestOutput) {
				outputLinks.set(child,nearestOutput);
			}
			queue.push(child);
		}
	}
	// Only complain when nodes were actually left unprocessed
	if(head < queue.length) {
		throw new Error("Aho-Corasick: buildFailureLinks exceeded maximum nodes (" + maxNodes + ")");
	}
};

/*
Find all occurrences of the added patterns in a text.

text: string to scan; anything other than a non-empty string yields an empty array
useWordBoundary: when truthy, matches adjacent to a word character are dropped

Returns an array of {pattern, index, length, titleIndex} objects, where index is the start
position of the match in the text and titleIndex is the identifier passed to addPattern.
At most min(2 * text.length, 10000) matches are returned.
*/
AhoCorasick.prototype.search = function(text, useWordBoundary) {
	if(typeof text !== "string" || text.length === 0) {
		return [];
	}
	var root = this.trie,
		failure = this.failure,
		outputLinks = this.outputLinks,
		maxDepth = this.maxFailureDepth,
		textLength = text.length,
		maxMatches = Math.min(textLength * 2, 10000),
		matches = [],
		node = root;
	for(var i = 0; i < textLength; i++) {
		var ch = text[i],
			steps = 0;
		// Fall back along the failure links until the character can be consumed
		while(node !== root && !node.children[ch] && steps < maxDepth) {
			node = failure.get(node) || root;
			steps++;
		}
		// The root lookup also covers the case where the depth bound cut the walk short
		node = node.children[ch] || root.children[ch] || root;
		// Report the patterns ending here, longest first, by hopping along the output links
		var hit = node.outputs ? node : outputLinks.get(node);
		while(hit) {
			var outputs = hit.outputs;
			for(var j = 0; j < outputs.length; j++) {
				var output = outputs[j],
					matchStart = i - output.length + 1,
					matchEnd = i + 1;
				if(useWordBoundary && !this.isWordBoundaryMatch(text,matchStart,matchEnd)) {
					continue;
				}
				matches.push({
					pattern: output.pattern,
					index: matchStart,
					length: output.length,
					titleIndex: output.index
				});
				if(matches.length >= maxMatches) {
					return matches;
				}
			}
			hit = outputLinks.get(hit);
		}
	}
	return matches;
};

/*
Check whether the span text[start..end) is delimited by word boundaries.

Returns true when neither the character before start nor the character at end is a word
character. The beginning and the end of the text count as boundaries.
*/
AhoCorasick.prototype.isWordBoundaryMatch = function(text, start, end) {
	var beforeChar = start > 0 ? text[start - 1] : "",
		afterChar = end < text.length ? text[end] : "";
	var beforeIsWord = !!beforeChar && WORD_CHAR_PATTERN.test(beforeChar),
		afterIsWord = !!afterChar && WORD_CHAR_PATTERN.test(afterChar);
	return !beforeIsWord && !afterIsWord;
};

/*
Discard all patterns and links so that the instance can be reused.
*/
AhoCorasick.prototype.clear = function() {
	this.trie = createTrieNode();
	this.failure = new Map();
	this.outputLinks = new Map();
	this.patternCount = 0;
};

/*
Report the size of the automaton.

Returns {nodeCount, patternCount, failureLinks}: the number of trie nodes including the
root, the number of patterns added and the number of failure links currently stored.
*/
AhoCorasick.prototype.getStats = function() {
	var nodeCount = 0,
		pending = [this.trie];
	// Iterative walk, so that very long patterns cannot exhaust the call stack
	while(pending.length > 0) {
		var node = pending.pop();
		nodeCount++;
		for(var ch in node.children) {
			pending.push(node.children[ch]);
		}
	}
	return {
		nodeCount: nodeCount,
		patternCount: this.patternCount,
		failureLinks: this.failure.size
	};
};
|
|
|
|
exports.AhoCorasick = AhoCorasick;
|