diff --git a/src/project/types/website/website-llms.ts b/src/project/types/website/website-llms.ts
index 443006501b..78e7fcaed6 100644
--- a/src/project/types/website/website-llms.ts
+++ b/src/project/types/website/website-llms.ts
@@ -8,7 +8,7 @@ import { basename, join, relative } from "../../../deno_ral/path.ts";
 import { existsSync } from "../../../deno_ral/fs.ts";
 import { pathWithForwardSlashes } from "../../../core/path.ts";
 
-import { Document, Element } from "../../../core/deno-dom.ts";
+import { Document, Element, Node } from "../../../core/deno-dom.ts";
 import { execProcess } from "../../../core/process.ts";
 import { pandocBinaryPath, resourcePath } from "../../../core/resources.ts";
 
@@ -83,9 +83,42 @@ export function llmsHtmlFinalizer(
 
     // Convert HTML to markdown using Pandoc with the llms.lua filter
     await convertHtmlToLlmsMarkdown(htmlContent, llmsOutputPath);
+
+    // Clean up conditional content markers from the original HTML doc
+    cleanupConditionalContent(doc);
   };
 }
 
+/**
+ * Strip the llms-txt conditional-content markers from the rendered page.
+ *
+ * - `.llms-conditional-content` elements are llms-only and are deleted.
+ * - `.llms-hidden-content` wrappers are dissolved; their children are
+ *   kept in place, in order.
+ */
+function cleanupConditionalContent(doc: Document): void {
+  // llms-only blocks must not survive into the HTML output
+  for (const node of doc.querySelectorAll(".llms-conditional-content")) {
+    const llmsOnly = node as Element;
+    llmsOnly.remove();
+  }
+
+  // Dissolve each llms-hidden wrapper, hoisting its children in order
+  for (const node of doc.querySelectorAll(".llms-hidden-content")) {
+    const wrapper = node as Element;
+    const host = wrapper.parentElement;
+    if (!host) {
+      continue;
+    }
+    let child = wrapper.firstChild;
+    while (child) {
+      host.insertBefore(child as Node, wrapper as Node);
+      child = wrapper.firstChild;
+    }
+    wrapper.remove();
+  }
+}
+
 /**
  * Extract the main content from an HTML document, removing navigation,
  * sidebars, footers, scripts, and styles.
@@ -104,6 +131,7 @@ function extractMainContent(doc: Document): string {
     ".sidebar",
     ".quarto-search",
     "nav.navbar",
+    ".quarto-page-breadcrumbs",
     "script",
     "style",
     "link[rel='stylesheet']",
@@ -128,6 +156,9 @@ function extractMainContent(doc: Document): string {
     return "";
   }
 
+  // Preprocess annotated code blocks before converting to markdown
+  preprocessAnnotatedCodeBlocks(clone, main as Element);
+
   // Return a minimal HTML document with just the content
   return `
@@ -138,6 +169,74 @@ ${main.innerHTML}
 `;
 }
 
+/**
+ * Preprocess annotated code blocks for llms output.
+ *
+ * Rewrites `container` in place:
+ * - restores the original code text (annotation markers included) from the
+ *   `data-llms-code-original` attribute and drops that attribute,
+ * - removes the annotation gutter elements,
+ * - replaces each annotation definition list with an ordered list.
+ *
+ * @param doc Document used to create the replacement <ol>/<li> elements
+ * @param container Element whose descendants are rewritten
+ */
+function preprocessAnnotatedCodeBlocks(doc: Document, container: Element): void {
+  // Restore original code text in annotated code blocks.
+  // The llms-code-annotations.lua filter saves the original text
+  // (before code-annotation.lua strips markers) as a data attribute.
+  const annotated = container.querySelectorAll("[data-llms-code-original]");
+  for (const node of annotated) {
+    const el = node as Element;
+    const originalText = el.getAttribute("data-llms-code-original");
+    if (!originalText) continue;
+
+    // The attribute is on the wrapper div; find the <code> element inside
+    const codeEl = el.tagName === "CODE"
+      ? el
+      : el.querySelector("code") as Element | null;
+    if (codeEl) {
+      // Replace content with original (removes syntax highlighting spans + annotation buttons)
+      codeEl.textContent = originalText;
+    }
+
+    el.removeAttribute("data-llms-code-original");
+  }
+
+  // Remove annotation gutter elements
+  const gutters = container.querySelectorAll(
+    ".code-annotation-gutter, .code-annotation-gutter-bg",
+  );
+  for (const gutter of gutters) {
+    (gutter as Element).remove();
+  }
+
+  // Convert annotation definition lists to ordered lists.
+  // The annotation text is in <dd> elements; <dt> elements have just the number.
+  const dls = container.querySelectorAll("dl.code-annotation-container-grid");
+  for (const dlNode of dls) {
+    const dl = dlNode as Element;
+    const ol = doc.createElement("ol");
+    for (const ddNode of dl.querySelectorAll("dd")) {
+      const dd = ddNode as Element;
+      // Skip <dd> elements of definition lists nested inside annotation text;
+      // their markup is already carried by the enclosing <dd>'s innerHTML.
+      if (dd.parentElement !== dl) continue;
+      const li = doc.createElement("li");
+      li.innerHTML = dd.innerHTML;
+      ol.appendChild(li);
+    }
+
+    // Replace the DL (and its cell-annotation wrapper div if present)
+    const parent = dl.parentElement;
+    if (parent && parent.classList.contains("cell-annotation")) {
+      parent.parentElement?.replaceChild(ol, parent);
+    } else {
+      dl.parentElement?.replaceChild(ol, dl);
+    }
+  }
+}
+
 /**
  * Convert HTML content to markdown using Pandoc with the llms.lua filter.
  */
diff --git a/src/project/types/website/website.ts b/src/project/types/website/website.ts
index 1e5a925221..56e24ac6c4 100644
--- a/src/project/types/website/website.ts
+++ b/src/project/types/website/website.ts
@@ -32,6 +32,7 @@ import { projectOffset, projectOutputDir } from "../../project-shared.ts";
 import { isHtmlFileOutput } from "../../../config/format.ts";
 
 import {
+  kFilterParams,
   kIncludeInHeader,
   kPageTitle,
   kTitle,
@@ -358,6 +359,8 @@ export const websiteProjectType: ProjectType = {
 
       // Add llms.txt finalizer if enabled
       if (websiteConfigBoolean(kLlmsTxt, false, project.config)) {
+        extras[kFilterParams] = extras[kFilterParams] || {};
+        extras[kFilterParams]["llms-txt"] = true;
         extras.html[kHtmlFinalizers]?.push(
           llmsHtmlFinalizer(source, project, format),
         );
diff --git a/src/resources/filters/llms/llms.lua b/src/resources/filters/llms/llms.lua
index 752fd8cf80..916457c505 100644
--- a/src/resources/filters/llms/llms.lua
+++ b/src/resources/filters/llms/llms.lua
@@ -15,6 +15,7 @@ local skippable_classes = {
   ["quarto-float"] = true,
   ["quarto-float-fig"] = true,
   ["figure"] = true,
+  ["llms-conditional-content"] = true,
 }
 local droppable_classes = {
   ["navbar-container"] = true,
@@ -25,6 +26,8 @@
local droppable_classes = {
   ["listing-categories"] = true,
   ["quarto-listing-category"] = true, -- category filter sidebar
   ["listing-category"] = true, -- individual category badges
+  ["quarto-page-breadcrumbs"] = true, -- breadcrumb navigation
+  ["llms-hidden-content"] = true,
 }
 local droppable_ids = {
   ["quarto-header"] = true,
@@ -70,6 +73,66 @@ local function clean_element(el)
   end
 end
 
+-- Flatten a `.panel-tabset` Div into a heading plus content per tab.
+-- Tab titles come from the Links in the nav BulletList (first one in the
+-- div); tab bodies come from the `.tab-pane` Divs inside the `.tab-content`
+-- Div. Returns the original content when nothing could be extracted.
+local function handle_tabset(div)
+  local titles = {}
+  local panes = {}
+
+  -- Title of one nav item: content of the first Link found in its
+  -- Plain/Para blocks, or nil when the item holds no Link.
+  local function item_title(item)
+    for _, inner_block in ipairs(item) do
+      if inner_block.t == "Plain" or inner_block.t == "Para" then
+        for _, inline in ipairs(inner_block.content) do
+          if inline.t == "Link" then
+            return inline.content
+          end
+        end
+      end
+    end
+    return nil
+  end
+
+  for _, block in ipairs(div.content) do
+    if block.t == "BulletList" and #titles == 0 then
+      for _, item in ipairs(block.content) do
+        -- At most one title per nav item, so a second linked block in an
+        -- item cannot shift later titles off their panes.
+        local title = item_title(item)
+        if title then
+          table.insert(titles, title)
+        end
+      end
+    elseif block.t == "Div" and block.classes:includes("tab-content") then
+      for _, inner in ipairs(block.content) do
+        if inner.t == "Div" and inner.classes:includes("tab-pane") then
+          table.insert(panes, inner.content)
+        end
+      end
+    end
+  end
+
+  -- Build output: heading + content for each tab
+  local result = pandoc.Blocks({})
+  for i = 1, math.max(#titles, #panes) do
+    if titles[i] then
+      result:insert(pandoc.Header(2, titles[i]))
+    end
+    if panes[i] then
+      result:extend(panes[i])
+    end
+  end
+
+  if #result > 0 then
+    return result
+  end
+  -- Fallback: return content as-is
+  return div.content
+end
+
 local function handle_callout(div)
   local kind = "NOTE" -- NOTE, TIP, IMPORTANT, WARNING, CAUTION
   div.classes:map(function(cls)
@@ -140,8 +203,10 @@ function Link(link)
     return link.content
   end
 
-  if link.target and link.target:match("%.html$") then
+  if link.target and (link.target:match("%.html$") or link.target:match("%.html#")) then
+    link.target = link.target:gsub("%.html#", ".llms.md#")
     link.target = link.target:gsub("%.html$", ".llms.md")
+    link.target = link.target:gsub("^%./", "")
     if link.classes:includes("btn") then
       link.attr = pandoc.Attr()
     end
@@ -174,6 +239,10 @@ end
 
 
 function Div(div)
+  if div.classes:includes("panel-tabset") then
+    return handle_tabset(div)
+  end
+
   if div.classes:includes("callout") then
     return handle_callout(div)
   end
diff --git a/src/resources/filters/main.lua b/src/resources/filters/main.lua
index 3363754667..c34e9b0a97 100644
--- a/src/resources/filters/main.lua
+++ b/src/resources/filters/main.lua
@@ -143,6 +143,8 @@ import("./quarto-pre/bibliography-formats.lua")
 import("./quarto-pre/book-links.lua")
 import("./quarto-pre/book-numbering.lua")
 import("./quarto-pre/code-annotation.lua")
+import("./quarto-pre/llms-code-annotations.lua")
+import("./quarto-pre/llms-conditional-content.lua")
 import("./quarto-pre/code-filename.lua")
 import("./quarto-pre/contentsshortcode.lua")
 import("./quarto-pre/engine-escape.lua")
@@ -321,6 +323,15 @@ local quarto_pre_filters = {
     traverser = 'jog',
   },
 
+  { name = "pre-llms-conditional-content",
+    filter = filterIf(
+      function() return param("llms-txt", false) end,
+      llms_resolve_conditional_content()
+    ),
+    flags = { "has_conditional_content" },
+    traverser = 'jog',
+  },
+
   { name = "pre-combined-hidden",
     filter = combineFilters({
       hidden(),
@@ -336,6 +347,15 @@ local quarto_pre_filters = {
     traverser = 'jog',
   },
 
+  { name = "pre-llms-save-code-annotations",
+    filter = filterIf(
+      function() return param("llms-txt", false) end,
+      llms_save_code_annotations()
+    ),
+    flags = { "has_code_annotations" },
+    traverser = 'jog',
+  },
+
   { name = "pre-code-annotations",
     filter = code_annotations(),
     flags = { "has_code_annotations" },
diff --git a/src/resources/filters/quarto-pre/llms-code-annotations.lua b/src/resources/filters/quarto-pre/llms-code-annotations.lua
new file mode 100644
index 0000000000..1dd858abf9
--- /dev/null
+++ b/src/resources/filters/quarto-pre/llms-code-annotations.lua
@@ -0,0 +1,16 @@
+-- 
llms-code-annotations.lua
+-- Copyright (C) 2020-2026 Posit Software, PBC
+--
+-- Saves original CodeBlock text before code-annotation.lua strips markers.
+-- Only runs when llms-txt is enabled (guarded by filterIf in main.lua).
+
+function llms_save_code_annotations()
+  return {
+    CodeBlock = function(el)
+      if el.text:match("<%d+>") then
+        el.attributes["data-llms-code-original"] = el.text
+      end
+      return el
+    end
+  }
+end
diff --git a/src/resources/filters/quarto-pre/llms-conditional-content.lua b/src/resources/filters/quarto-pre/llms-conditional-content.lua
new file mode 100644
index 0000000000..a07ffb4921
--- /dev/null
+++ b/src/resources/filters/quarto-pre/llms-conditional-content.lua
@@ -0,0 +1,59 @@
+-- llms-conditional-content.lua
+-- Copyright (C) 2020-2026 Posit Software, PBC
+--
+-- Pre-filter that intercepts ConditionalBlock nodes referencing llms-txt
+-- and replaces them with marker Divs so content can be included/excluded
+-- from llms.md output independently of the HTML format.
+-- Only runs when llms-txt is enabled (guarded by filterIf in main.lua).
+
+function llms_resolve_conditional_content()
+  -- Loaded once per filter construction rather than once per block
+  local constants = require("modules/constants")
+
+  -- True when `list` is non-nil and holds `value`.
+  local function list_contains(list, value)
+    if not list then return false end
+    for _, v in ipairs(list) do
+      if v == value then return true end
+    end
+    return false
+  end
+
+  -- Determine if a ConditionalBlock should be visible for llms-txt output.
+  -- Returns true (include), false (exclude), or nil (no llms-txt condition).
+  local function is_llms_visible(tbl)
+    local cond = tbl.condition
+    local has_when = list_contains(cond[constants.kWhenFormat], "llms-txt")
+    local has_unless = list_contains(cond[constants.kUnlessFormat], "llms-txt")
+
+    if not has_when and not has_unless then return nil end
+
+    if tbl.behavior == constants.kContentVisible then
+      -- content-visible when-format="llms-txt" -> include for llms
+      -- content-visible unless-format="llms-txt" -> exclude for llms
+      return has_when
+    else -- content-hidden
+      -- content-hidden when-format="llms-txt" -> exclude for llms
+      -- content-hidden unless-format="llms-txt" -> include for llms
+      return has_unless
+    end
+  end
+
+  return {
+    ConditionalBlock = function(tbl)
+      local llms_visible = is_llms_visible(tbl)
+      if llms_visible == nil then return nil end
+
+      local html_visible = is_visible(tbl) -- from content-hidden.lua
+      if llms_visible == html_visible then return nil end -- no intervention needed
+
+      local div = tbl.original_node:clone()
+      if llms_visible then
+        div.classes:insert("llms-conditional-content")
+      else
+        div.classes:insert("llms-hidden-content")
+      end
+      return div
+    end
+  }
+end
diff --git a/tests/docs/smoke-all/website/llms-txt/_quarto.yml b/tests/docs/smoke-all/website/llms-txt/_quarto.yml
index 04c824a03c..16837fcbd9 100644
--- a/tests/docs/smoke-all/website/llms-txt/_quarto.yml
+++ b/tests/docs/smoke-all/website/llms-txt/_quarto.yml
@@ -11,6 +11,11 @@ website:
       - href: index.qmd
         text: Home
       - about.qmd
+  sidebar:
+    contents:
+      - section: Info
+        contents:
+          - about.qmd
 
 format:
   html:
diff --git a/tests/docs/smoke-all/website/llms-txt/about.qmd b/tests/docs/smoke-all/website/llms-txt/about.qmd
index 7a3674f9cf..1da14ade37 100644
--- a/tests/docs/smoke-all/website/llms-txt/about.qmd
+++ 
b/tests/docs/smoke-all/website/llms-txt/about.qmd @@ -6,9 +6,9 @@ _quarto: ensureLlmsMdExists: true ensureLlmsMdRegexMatches: # First array: patterns that MUST match - - ["^# About", "> \\*\\*NOTE:\\*\\*", "> \\*\\*WARNING:\\*\\*", "This is a note", "``` python", "def hello", "\\| Feature", "\\|[-]+\\|", "\\[home page\\]\\(.*\\.llms\\.md\\)"] - # Second array: patterns that must NOT match (no .html links in llms.md) - - ["\\.html\\)"] + - ["^# About", "> \\*\\*NOTE:\\*\\*", "> \\*\\*WARNING:\\*\\*", "This is a note", "``` python", "def hello", "\\| Feature", "\\|[-]+\\|", "\\[home page\\]\\(.*\\.llms\\.md\\)", "\\[test site intro\\]\\(index\\.llms\\.md#test-content\\)", "## Alpha Tab", "Alpha content here", "## Beta Tab", "Beta content here"] + # Second array: patterns that must NOT match (no .html links, no breadcrumbs, no empty tab links) + - ["\\.html\\)", "\\.html#", "\\[Info\\]", "\\[Alpha Tab\\]\\(\\)", "\\[Beta Tab\\]\\(\\)"] --- About this test site. @@ -39,6 +39,22 @@ def hello(): | Code | Working | | Tables | Working | +## Tabset Example + +::: {.panel-tabset} + +## Alpha Tab + +Alpha content here. + +## Beta Tab + +Beta content here. + +::: + ## Link Example Go back to the [home page](index.qmd). + +Go to the [test site intro](index.qmd#test-content). 
diff --git a/tests/docs/smoke-all/website/llms-txt/index.qmd b/tests/docs/smoke-all/website/llms-txt/index.qmd index 82a461a5b0..ca33b482e8 100644 --- a/tests/docs/smoke-all/website/llms-txt/index.qmd +++ b/tests/docs/smoke-all/website/llms-txt/index.qmd @@ -11,8 +11,36 @@ _quarto: - ["^# llms-txt Test Site", "^## Pages", "\\[.*\\]\\(.*\\.llms\\.md\\)"] # Second array: patterns that must NOT match (empty) - [] + ensureLlmsMdRegexMatches: + # First array: patterns that MUST match - verify anchor links, code annotations, and conditional content + - ["\\[callout examples\\]\\(about\\.llms\\.md#callout-examples\\)", "# <1>", "# <2>", "Load tidyverse", "Open help for ggplot", "only for LLM consumption"] + # Second array: patterns that must NOT match (no .html links, no annotation UI, no hidden content) + - ["\\.html\\)", "\\.html#", "code-annotation-anchor", "should not appear in LLM output"] --- +## Test Content + This is a test website for the llms-txt feature. See the [about page](about.qmd) for more information. + +Also see the [callout examples](about.qmd#callout-examples). + +## Conditional Content + +::: {.content-visible when-format="llms-txt"} +This content is only for LLM consumption. +::: + +::: {.content-hidden when-format="llms-txt"} +This content should not appear in LLM output. +::: + +## Code Annotations + +```r +library(tidyverse) # <1> +?ggplot # <2> +``` +1. Load tidyverse +2. Open help for ggplot