pawamoy · pawamoy · Apr 8, 2025 · Mar 17, 2025 · Mar 19, 2025 · Mar 19, 2025
diff --git a/README.md b/README.md
@@ -22,30 +22,71 @@ pip install mkdocs-llmstxt
 Enable the plugin in `mkdocs.yml`:
 
 ```yaml title="mkdocs.yml"
+site_name: My project
+site_description: Description of my project.
+site_url: https://myproject.com/  # Required for the llmstxt plugin to work.
+
 plugins:
 - llmstxt:
-    files:
-    - output: llms.txt
-      inputs:
+    markdown_description: Long description of my project.
+    sections:
+      Usage documentation:
       - file1.md
-      - folder/file2.md
+      - file2.md
+```
+
+The resulting `/llms.txt` file will be available at the root of your documentation. With the previous example, it will be accessible at https://myproject.com/llms.txt and will contain the following:
+
+```markdown
+# My project
+
+> Description of my project.
+
+Long description of my project.
+
+## Usage documentation
+
+- [File1 title](https://myproject.com/file1.md)
+- [File2 title](https://myproject.com/file2.md)
 ```
 
-You can generate several files, each from its own set of input files.
+Each source file included in `sections` will have its own Markdown file, available at the URL listed for it in `/llms.txt`. See [Markdown generation](#markdown-generation) for more details.
 
 File globbing is supported:
 
 ```yaml title="mkdocs.yml"
 plugins:
 - llmstxt:
-    files:
-    - output: llms.txt
-      inputs:
-      - file1.md
-      - reference/*/*.md
+    sections:
+      Usage documentation:
+      - index.md
+      - usage/*.md
 ```
 
-The plugin will concatenate the rendered HTML of these input pages, clean it up a bit (with [BeautifulSoup](https://pypi.org/project/beautifulsoup4/)), convert it back to Markdown (with [Markdownify](https://pypi.org/project/markdownify)), and format it (with [Mdformat](https://pypi.org/project/mdformat)). By concatenating HTML instead of Markdown, we ensure that dynamically generated contents (API documentation, executed code blocks, snippets from other files, Jinja macros, etc.) are part of the generated text files. Credits to [Petyo Ivanov](https://github.com/petyosi) for the original idea ✨
+## Full output
+
+Although not explicitly specified in the https://llmstxt.org/ guidelines, it is common to output an `llms-full.txt` file with the content of every page expanded. This file can be generated by setting the `full_output` configuration value:
+
+```yaml title="mkdocs.yml"
+plugins:
+- llmstxt:
+    full_output: llms-full.txt
+    sections:
+      Usage documentation:
+      - index.md
+      - usage/*.md
+```
+
+## Markdown generation
+
+To generate a Markdown page from a source file, the plugin will:
+
+- Clean up the HTML output (with [BeautifulSoup](https://pypi.org/project/beautifulsoup4/))
+- Convert it back to Markdown (with [Markdownify](https://pypi.org/project/markdownify))
+
+Doing so is necessary to ensure that dynamically generated contents (API documentation, executed code blocks, snippets from other files, Jinja macros, etc.) are part of the generated text files.
+
+Credits to [Petyo Ivanov](https://github.com/petyosi) for the original idea ✨.
 
 You can disable auto-cleaning of the HTML:
 

diff --git a/mkdocs.yml b/mkdocs.yml
@@ -133,11 +133,13 @@ plugins:
           signature_crossrefs: true
           summary: true
 - llmstxt:
-    files:
-    - output: llms-full.txt
-      inputs:
+    full_output: llms-full.txt
+    markdown_description: This plugin automatically generates llms.txt files.
+    sections:
+      Usage documentation:
       - index.md
-      - reference/**.md
+      API reference:
+      - reference/*.md
 - git-revision-date-localized:
     enabled: !ENV [DEPLOY, false]
     enable_creation_date: true

diff --git a/src/mkdocs_llmstxt/_internal/config.py b/src/mkdocs_llmstxt/_internal/config.py
@@ -6,16 +6,11 @@
 from mkdocs.config.base import Config as BaseConfig
 
 
-class _FileConfig(BaseConfig):
-    """Sub-config for each Markdown file."""
-
-    output = mkconf.Type(str)
-    inputs = mkconf.ListOfItems(mkconf.Type(str))
-
-
 class _PluginConfig(BaseConfig):
     """Configuration options for the plugin."""
 
     autoclean = mkconf.Type(bool, default=True)
     preprocess = mkconf.Optional(mkconf.File(exists=True))
-    files = mkconf.ListOfItems(mkconf.SubConfig(_FileConfig))
+    markdown_description = mkconf.Optional(mkconf.Type(str))
+    full_output = mkconf.Optional(mkconf.Type(str))
+    sections = mkconf.DictOfItems(mkconf.ListOfItems(mkconf.Type(str)))
diff --git a/src/mkdocs_llmstxt/_internal/plugin.py b/src/mkdocs_llmstxt/_internal/plugin.py
@@ -3,18 +3,18 @@
 from __future__ import annotations
 
 import fnmatch
-from collections import defaultdict
 from itertools import chain
 from pathlib import Path
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, NamedTuple, cast
+from urllib.parse import urljoin
 
 import mdformat
 from bs4 import BeautifulSoup as Soup
 from bs4 import Tag
 from markdownify import ATX, MarkdownConverter
 from mkdocs.config.defaults import MkDocsConfig
-from mkdocs.exceptions import PluginError
 from mkdocs.plugins import BasePlugin
+from mkdocs.structure.pages import Page
 
 from mkdocs_llmstxt._internal.config import _PluginConfig
 from mkdocs_llmstxt._internal.logger import _get_logger
@@ -31,6 +31,13 @@
 _logger = _get_logger(__name__)
 
 
+class _MDPageInfo(NamedTuple):
+    title: str
+    path_md: Path
+    md_url: str
+    content: str
+
+
 class MkdocsLLMsTxtPlugin(BasePlugin[_PluginConfig]):
     """The MkDocs plugin to generate an `llms.txt` file.
 
@@ -46,9 +53,8 @@ class MkdocsLLMsTxtPlugin(BasePlugin[_PluginConfig]):
     mkdocs_config: MkDocsConfig
     """The global MkDocs configuration."""
 
-    def __init__(self) -> None:
-        self.html_pages: dict[str, dict[str, str]] = defaultdict(dict)
-        """Dictionary to store the HTML contents of pages."""
+    md_pages: dict[str, list[_MDPageInfo]]
+    """Dictionary mapping section names to a list of page infos."""
 
     def _expand_inputs(self, inputs: list[str], page_uris: list[str]) -> list[str]:
         expanded: list[str] = []
@@ -72,7 +78,12 @@ def on_config(self, config: MkDocsConfig) -> MkDocsConfig | None:
         Returns:
             The same, untouched config.
         """
+        if config.site_url is None:
+            raise ValueError("'site_url' must be set in the MkDocs configuration to be used with the 'llmstxt' plugin")
         self.mkdocs_config = config
+        # A `defaultdict` could be used, but we need to retain the same order between `config.sections` and `md_pages`
+        # (which wouldn't be guaranteed when filling `md_pages` in `on_page_content()`).
+        self.md_pages = {section: [] for section in self.config.sections}
         return config
 
     def on_files(self, files: Files, *, config: MkDocsConfig) -> Files | None:  # noqa: ARG002
@@ -88,64 +99,130 @@ def on_files(self, files: Files, *, config: MkDocsConfig) -> Files | None:  # no
         Returns:
             Modified collection or none.
         """
-        for file in self.config.files:
-            file["inputs"] = self._expand_inputs(file["inputs"], page_uris=list(files.src_uris.keys()))
+        page_uris = list(files.src_uris)
+
+        for section_name, file_list in list(self.config.sections.items()):
+            self.config.sections[section_name] = self._expand_inputs(file_list, page_uris=page_uris)
+
         return files
 
     def on_page_content(self, html: str, *, page: Page, **kwargs: Any) -> str | None:  # noqa: ARG002
-        """Record pages contents.
+        """Convert page content into a Markdown file and save the result to be processed in the `on_post_build` hook.
 
         Hook for the [`on_page_content` event](https://www.mkdocs.org/user-guide/plugins/#on_page_content).
-        In this hook we simply record the HTML of the pages into a dictionary whose keys are the pages' URIs.
 
         Parameters:
             html: The rendered HTML.
             page: The page object.
         """
-        for file in self.config.files:
-            if page.file.src_uri in file["inputs"]:
-                _logger.debug(f"Adding page {page.file.src_uri} to page {file['output']}")
-                self.html_pages[file["output"]][page.file.src_uri] = html
+        for section_name, file_list in self.config.sections.items():
+            if page.file.src_uri in file_list:
+                path_md = Path(page.file.abs_dest_path).with_suffix(".md")
+                page_md = _generate_page_markdown(
+                    html,
+                    should_autoclean=self.config.autoclean,
+                    preprocess=self.config.preprocess,
+                    path=str(path_md),
+                )
+
+                md_url = Path(page.file.dest_uri).with_suffix(".md").as_posix()
+                # Apply the same logic as in the `Page.url` property.
+                if md_url in (".", "./"):
+                    md_url = ""
+
+                # Guaranteed to exist as we require `site_url` to be configured.
+                base = cast("str", self.mkdocs_config.site_url)
+                if not base.endswith("/"):
+                    base += "/"
+                md_url = urljoin(base, md_url)
+
+                self.md_pages[section_name].append(
+                    _MDPageInfo(
+                        title=page.title if page.title is not None else page.file.src_uri,
+                        path_md=path_md,
+                        md_url=md_url,
+                        content=page_md,
+                    ),
+                )
+
         return html
 
-    def on_post_build(self, config: MkDocsConfig, **kwargs: Any) -> None:  # noqa: ARG002
-        """Combine all recorded pages contents and convert it to a Markdown file with BeautifulSoup and Markdownify.
+    def on_post_build(self, *, config: MkDocsConfig, **kwargs: Any) -> None:  # noqa: ARG002
+        """Create the final `llms.txt` file and the MD files for all selected pages.
 
         Hook for the [`on_post_build` event](https://www.mkdocs.org/user-guide/plugins/#on_post_build).
-        In this hook we concatenate all previously recorded HTML, and convert it to Markdown using Markdownify.
 
         Parameters:
             config: MkDocs configuration.
         """
-
-        def language_callback(tag: Tag) -> str:
-            for css_class in chain(tag.get("class") or (), (tag.parent.get("class") or ()) if tag.parent else ()):
-                if css_class.startswith("language-"):
-                    return css_class[9:]
-            return ""
-
-        converter = MarkdownConverter(
-            bullets="-",
-            code_language_callback=language_callback,
-            escape_underscores=False,
-            heading_style=ATX,
-        )
-
-        for file in self.config.files:
-            try:
-                html = "\n\n".join(self.html_pages[file["output"]][input_page] for input_page in file["inputs"])
-            except KeyError as error:
-                raise PluginError(str(error)) from error
-
-            soup = Soup(html, "html.parser")
-            if self.config.autoclean:
-                autoclean(soup)
-            if self.config.preprocess:
-                _preprocess(soup, self.config.preprocess, file["output"])
-
-            output_file = Path(config.site_dir).joinpath(file["output"])
-            output_file.parent.mkdir(parents=True, exist_ok=True)
-            markdown = mdformat.text(converter.convert_soup(soup), options={"wrap": "no"})
-            output_file.write_text(markdown, encoding="utf8")
-
-            _logger.info(f"Generated file /{file['output']}")
+        output_file = Path(config.site_dir).joinpath("llms.txt")
+        output_file.parent.mkdir(parents=True, exist_ok=True)
+        markdown = f"# {config.site_name}\n\n"
+
+        if config.site_description is not None:
+            markdown += f"> {config.site_description}\n\n"
+
+        if self.config.markdown_description is not None:
+            markdown += f"{self.config.markdown_description}\n\n"
+
+        full_markdown = markdown
+
+        for section_name, file_list in self.md_pages.items():
+            markdown += f"## {section_name}\n\n"
+            for page_title, path_md, md_url, content in file_list:
+                path_md.write_text(content, encoding="utf8")
+                _logger.debug(f"Generated MD file to {path_md}")
+                markdown += f"- [{page_title}]({md_url})\n"
+            markdown += "\n"
+
+        output_file.write_text(markdown, encoding="utf8")
+        _logger.debug("Generated file /llms.txt")
+
+        if self.config.full_output is not None:
+            full_output_file = Path(config.site_dir).joinpath(self.config.full_output)
+            for section_name, file_list in self.md_pages.items():
+                list_content = "\n".join(info.content for info in file_list)
+                full_markdown += f"# {section_name}\n\n{list_content}"
+            full_output_file.write_text(full_markdown, encoding="utf8")
+            _logger.debug(f"Generated file /{self.config.full_output}")
+
+
+def _language_callback(tag: Tag) -> str:
+    for css_class in chain(tag.get("class") or (), (tag.parent.get("class") or ()) if tag.parent else ()):
+        if css_class.startswith("language-"):
+            return css_class[9:]
+    return ""
+
+
+_converter = MarkdownConverter(
+    bullets="-",
+    code_language_callback=_language_callback,
+    escape_underscores=False,
+    heading_style=ATX,
+)
+
+
+def _generate_page_markdown(
+    html: str,
+    *,
+    should_autoclean: bool,
+    preprocess: str | None,
+    path: str,
+) -> str:
+    """Convert HTML to Markdown.
+
+    Parameters:
+        html: The HTML content.
+        should_autoclean: Whether to autoclean the HTML.
+        preprocess: An optional path of a Python module containing a `preprocess` function.
+        path: The output path of the relevant Markdown file.
+
+    Returns:
+        The Markdown content.
+    """
+    soup = Soup(html, "html.parser")
+    if should_autoclean:
+        autoclean(soup)
+    if preprocess:
+        _preprocess(soup, preprocess, path)
+    return mdformat.text(_converter.convert_soup(soup), options={"wrap": "no"})
diff --git a/src/mkdocs_llmstxt/_internal/preprocess.py b/src/mkdocs_llmstxt/_internal/preprocess.py
@@ -2,6 +2,7 @@
 
 from __future__ import annotations
 
+import html
 import sys
 from importlib.util import module_from_spec, spec_from_file_location
 from typing import TYPE_CHECKING
@@ -98,4 +99,4 @@ def autoclean(soup: Soup) -> None:
 
     # Remove line numbers from code blocks.
     for element in soup.find_all("table", attrs={"class": "highlighttable"}):
-        element.replace_with(Soup(f"<pre>{element.find('code').get_text()}</pre>", "html.parser"))  # type: ignore[union-attr]
+        element.replace_with(Soup(f"<pre>{html.escape(element.find('code').get_text())}</pre>", "html.parser"))  # type: ignore[union-attr]