diff --git a/bun.lock b/bun.lock index 76602e8852..e8ff7adafb 100644 --- a/bun.lock +++ b/bun.lock @@ -430,6 +430,7 @@ "glob": "13.0.5", "google-auth-library": "10.5.0", "gray-matter": "4.0.3", + "htmlparser2": "8.0.2", "ignore": "7.0.5", "immer": "11.1.4", "jsonc-parser": "3.3.1", diff --git a/packages/opencode/package.json b/packages/opencode/package.json index b0427f231b..48cb7b3450 100644 --- a/packages/opencode/package.json +++ b/packages/opencode/package.json @@ -141,6 +141,7 @@ "glob": "13.0.5", "google-auth-library": "10.5.0", "gray-matter": "4.0.3", + "htmlparser2": "8.0.2", "ignore": "7.0.5", "immer": "11.1.4", "jsonc-parser": "3.3.1", diff --git a/packages/opencode/src/tool/webfetch.ts b/packages/opencode/src/tool/webfetch.ts index 8c2be44e99..f8a4b6233a 100644 --- a/packages/opencode/src/tool/webfetch.ts +++ b/packages/opencode/src/tool/webfetch.ts @@ -1,5 +1,6 @@ import { Effect, Schema } from "effect" import { HttpClient, HttpClientRequest } from "effect/unstable/http" +import { Parser } from "htmlparser2" import * as Tool from "./tool" import TurndownService from "turndown" import DESCRIPTION from "./webfetch.txt" @@ -139,8 +140,7 @@ export const WebFetchTool = Tool.define( case "text": if (contentType.includes("text/html")) { - const text = yield* Effect.promise(() => extractTextFromHTML(content)) - return { output: text, title, metadata: {} } + return { output: extractTextFromHTML(content), title, metadata: {} } } return { output: content, title, metadata: {} } @@ -155,35 +155,27 @@ export const WebFetchTool = Tool.define( }), ) -async function extractTextFromHTML(html: string) { +function extractTextFromHTML(html: string) { let text = "" - let skipContent = false + let skipDepth = 0 - const rewriter = new HTMLRewriter() - .on("script, style, noscript, iframe, object, embed", { - element() { - skipContent = true - }, - text() { - // Skip text content inside these elements - }, - }) - .on("*", { - element(element) { - // Reset skip flag when entering other elements - if (!["script", "style", "noscript", "iframe", "object", "embed"].includes(element.tagName)) { - skipContent = false - } - }, - text(input) { - if (!skipContent) { - text += input.text - } - }, - }) - .transform(new Response(html)) + const parser = new Parser({ + onopentag(name) { + if (skipDepth > 0 || ["script", "style", "noscript", "iframe", "object", "embed"].includes(name)) { + skipDepth++ + } + }, + ontext(input) { + if (skipDepth === 0) text += input + }, + onclosetag() { + if (skipDepth > 0) skipDepth-- + }, + }) + + parser.write(html) + parser.end() - await rewriter.text() return text.trim() } diff --git a/packages/opencode/test/tool/webfetch.test.ts b/packages/opencode/test/tool/webfetch.test.ts index 804c6bde29..fdf5210b9c 100644 --- a/packages/opencode/test/tool/webfetch.test.ts +++ b/packages/opencode/test/tool/webfetch.test.ts @@ -91,4 +91,23 @@ describe("tool.webfetch", () => { }), ), ) + + it.instance("extracts text from html without scripts or styles", () => + withFetch( + () => + new Response( + "Hello world", + { + status: 200, + headers: { "content-type": "text/html; charset=utf-8" }, + }, + ), + (url) => + Effect.gen(function* () { + const result = yield* exec({ url: new URL("/page.html", url).toString(), format: "text" }) + expect(result.output).toBe("Hello world") + expect(result.attachments).toBeUndefined() + }), + ), + ) })