Skip to content

Commit c2977b3

Browse files
Add local llms.txt file reading (#14)
Add ability to read llms.txt from local files. --------- Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
1 parent 1bc11f5 commit c2977b3

File tree

4 files changed

+226
-46
lines changed

4 files changed

+226
-46
lines changed

README.md

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,20 @@ curl -LsSf https://astral.sh/uv/install.sh | sh
2020
#### Choose an `llms.txt` file to use.
2121
* For example, [here's](https://langchain-ai.github.io/langgraph/llms.txt) the LangGraph `llms.txt` file.
2222

23+
> **Note: Security and Domain Access Control**
24+
>
25+
> For security reasons, mcpdoc implements strict domain access controls:
26+
>
27+
> 1. **Remote llms.txt files**: When you specify a remote llms.txt URL (e.g., `https://langchain-ai.github.io/langgraph/llms.txt`), mcpdoc automatically adds only that specific domain (`langchain-ai.github.io`) to the allowed domains list. This means the tool can only fetch documentation from URLs on that domain.
28+
>
29+
> 2. **Local llms.txt files**: When using a local file, NO domains are automatically added to the allowed list. You MUST explicitly specify which domains to allow using the `--allowed-domains` parameter.
30+
>
31+
> 3. **Adding additional domains**: To allow fetching from domains beyond those automatically included:
32+
> - Use `--allowed-domains domain1.com domain2.com` to add specific domains
33+
> - Use `--allowed-domains '*'` to allow all domains (use with caution)
34+
>
35+
> This security measure prevents unauthorized access to domains not explicitly approved by the user, ensuring that documentation can only be retrieved from trusted sources.
36+
2337
#### (Optional) Test the MCP server locally with your `llms.txt` file of choice:
2438
```bash
2539
uvx --from mcpdoc mcpdoc \

mcpdoc/cli.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,9 @@ class CustomFormatter(
2525
# Directly specifying llms.txt URLs with optional names
2626
mcpdoc --urls LangGraph:https://langchain-ai.github.io/langgraph/llms.txt
2727
28+
# Using a local file (absolute or relative path)
29+
mcpdoc --urls LocalDocs:/path/to/llms.txt --allowed-domains '*'
30+
2831
# Using a YAML config file
2932
mcpdoc --yaml sample_config.yaml
3033
@@ -72,7 +75,7 @@ def parse_args() -> argparse.Namespace:
7275
"-u",
7376
type=str,
7477
nargs="+",
75-
help="List of llms.txt URLs with optional names (format: 'url' or 'name:url')",
78+
help="List of llms.txt URLs or file paths with optional names (format: 'url_or_path' or 'name:url_or_path')",
7679
)
7780

7881
parser.add_argument(
@@ -84,7 +87,7 @@ def parse_args() -> argparse.Namespace:
8487
"--allowed-domains",
8588
type=str,
8689
nargs="*",
87-
help="Additional allowed domains to fetch documentation from. Use '*' to allow all domains",
90+
help="Additional allowed domains to fetch documentation from. Use '*' to allow all domains.",
8891
)
8992
parser.add_argument(
9093
"--timeout", type=float, default=10.0, help="HTTP request timeout in seconds"
@@ -163,10 +166,11 @@ def load_config_file(file_path: str, file_format: str) -> List[Dict[str, str]]:
163166

164167

165168
def create_doc_sources_from_urls(urls: List[str]) -> List[DocSource]:
166-
"""Create doc sources from a list of URLs with optional names.
169+
"""Create doc sources from a list of URLs or file paths with optional names.
167170
168171
Args:
169-
urls: List of llms.txt URLs with optional names (format: 'url' or 'name:url')
172+
urls: List of llms.txt URLs or file paths with optional names
173+
(format: 'url_or_path' or 'name:url_or_path')
170174
171175
Returns:
172176
List of DocSource objects

mcpdoc/main.py

Lines changed: 133 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
"""MCP Llms-txt server for docs."""
22

3+
import os
34
from urllib.parse import urlparse
45

56
import httpx
@@ -34,8 +35,64 @@ def extract_domain(url: str) -> str:
3435
return f"{parsed.scheme}://{parsed.netloc}/"
3536

3637

38+
def _is_http_or_https(url: str) -> bool:
39+
"""Check if the URL is an HTTP or HTTPS URL."""
40+
return url.startswith(("http:", "https:"))
41+
42+
43+
def _get_fetch_description(has_local_sources: bool) -> str:
44+
"""Get fetch docs tool description."""
45+
description = [
46+
"Fetch and parse documentation from a given URL or local file.",
47+
"",
48+
"Use this tool after list_doc_sources to:",
49+
"1. First fetch the llms.txt file from a documentation source",
50+
"2. Analyze the URLs listed in the llms.txt file",
51+
"3. Then fetch specific documentation pages relevant to the user's question",
52+
"",
53+
]
54+
55+
if has_local_sources:
56+
description.extend(
57+
[
58+
"Args:",
59+
" url: The URL or file path to fetch documentation from. Can be:",
60+
" - URL from an allowed domain",
61+
" - A local file path (absolute or relative)",
62+
" - A file:// URL (e.g., file:///path/to/llms.txt)",
63+
]
64+
)
65+
else:
66+
description.extend(
67+
[
68+
"Args:",
69+
" url: The URL to fetch documentation from.",
70+
]
71+
)
72+
73+
description.extend(
74+
[
75+
"",
76+
"Returns:",
77+
" The fetched documentation content converted to markdown, or an error message", # noqa: E501
78+
" if the request fails or the URL is not from an allowed domain.",
79+
]
80+
)
81+
82+
return "\n".join(description)
83+
84+
85+
def _normalize_path(path: str) -> str:
86+
"""Accept paths in file:/// or relative format and map to absolute paths."""
87+
return (
88+
os.path.abspath(path[7:])
89+
if path.startswith("file://")
90+
else os.path.abspath(path)
91+
)
92+
93+
3794
def create_server(
38-
doc_source: list[DocSource],
95+
doc_sources: list[DocSource],
3996
*,
4097
follow_redirects: bool = False,
4198
timeout: float = 10,
@@ -45,7 +102,7 @@ def create_server(
45102
"""Create the server and generate documentation retrieval tools.
46103
47104
Args:
48-
doc_source: List of documentation sources to make available
105+
doc_sources: List of documentation sources to make available
49106
follow_redirects: Whether to follow HTTP redirects when fetching docs
50107
timeout: HTTP request timeout in seconds
51108
settings: Additional settings to pass to FastMCP
@@ -68,61 +125,95 @@ def create_server(
68125
)
69126
httpx_client = httpx.AsyncClient(follow_redirects=follow_redirects, timeout=timeout)
70127

71-
@server.tool()
72-
def list_doc_sources() -> str:
73-
"""List all available documentation sources.
128+
local_sources = []
129+
remote_sources = []
74130

75-
This is the first tool you should call in the documentation workflow.
76-
It provides URLs to llms.txt files that the user has made available.
131+
for entry in doc_sources:
132+
url = entry["llms_txt"]
133+
if _is_http_or_https(url):
134+
remote_sources.append(entry)
135+
else:
136+
local_sources.append(entry)
77137

78-
Returns:
79-
A string containing a formatted list of documentation sources with their URLs
80-
"""
81-
content = ""
82-
for entry in doc_source:
83-
name = entry.get("name", "") or extract_domain(entry["llms_txt"])
84-
content += f"{name}\n"
85-
content += "URL: " + entry["llms_txt"] + "\n\n"
86-
return content
138+
# Let's verify that all local sources exist
139+
for entry in local_sources:
140+
path = entry["llms_txt"]
141+
abs_path = _normalize_path(path)
142+
if not os.path.exists(abs_path):
143+
raise FileNotFoundError(f"Local file not found: {abs_path}")
87144

88-
# Parse the domain names in the llms.txt URLs
89-
domains = set(extract_domain(entry["llms_txt"]) for entry in doc_source)
145+
# Parse the domain names in the llms.txt URLs and identify local file paths
146+
domains = set(extract_domain(entry["llms_txt"]) for entry in remote_sources)
90147

91-
# Add additional allowed domains if specified
148+
# Add additional allowed domains if specified, or set to '*' if we have local files
92149
if allowed_domains:
93150
if "*" in allowed_domains:
94151
domains = {"*"} # Special marker for allowing all domains
95152
else:
96153
domains.update(allowed_domains)
97154

98-
@server.tool()
99-
async def fetch_docs(url: str) -> str:
100-
"""Fetch and parse documentation from a given URL.
155+
allowed_local_files = set(
156+
_normalize_path(entry["llms_txt"]) for entry in local_sources
157+
)
101158

102-
Use this tool after list_doc_sources to:
103-
1. First fetch the llms.txt file from a documentation source
104-
2. Analyze the URLs listed in the llms.txt file
105-
3. Then fetch specific documentation pages relevant to the user's question
159+
@server.tool()
160+
def list_doc_sources() -> str:
161+
"""List all available documentation sources.
106162
107-
Args:
108-
url: The URL to fetch documentation from. Must be from an allowed domain.
163+
This is the first tool you should call in the documentation workflow.
164+
It provides URLs to llms.txt files or local file paths that the user has made available.
109165
110166
Returns:
111-
The fetched documentation content converted to markdown, or an error message
112-
if the request fails or the URL is not from an allowed domain.
167+
A string containing a formatted list of documentation sources with their URLs or file paths
113168
"""
169+
content = ""
170+
for entry_ in doc_sources:
171+
url_or_path = entry_["llms_txt"]
172+
173+
if _is_http_or_https(url_or_path):
174+
name = entry_.get("name", extract_domain(url_or_path))
175+
content += f"{name}\nURL: {url_or_path}\n\n"
176+
else:
177+
path = _normalize_path(url_or_path)
178+
name = entry_.get("name", path)
179+
content += f"{name}\nPath: {path}\n\n"
180+
return content
181+
182+
fetch_docs_description = _get_fetch_description(
183+
has_local_sources=bool(local_sources)
184+
)
185+
186+
@server.tool(description=fetch_docs_description)
187+
async def fetch_docs(url: str) -> str:
114188
nonlocal domains
115-
if "*" not in domains and not any(url.startswith(domain) for domain in domains):
116-
return (
117-
"Error: URL not allowed. Must start with one of the following domains: "
118-
+ ", ".join(domains)
119-
)
120-
121-
try:
122-
response = await httpx_client.get(url, timeout=timeout)
123-
response.raise_for_status()
124-
return markdownify(response.text)
125-
except (httpx.HTTPStatusError, httpx.RequestError) as e:
126-
return f"Encountered an HTTP error with code {e.response.status_code}"
189+
# Handle local file paths (either as file:// URLs or direct filesystem paths)
190+
if not _is_http_or_https(url):
191+
abs_path = _normalize_path(url)
192+
if abs_path not in allowed_local_files:
193+
raise ValueError(
194+
f"Local file not allowed: {abs_path}. Allowed files: {allowed_local_files}"
195+
)
196+
try:
197+
with open(abs_path, "r", encoding="utf-8") as f:
198+
content = f.read()
199+
return markdownify(content)
200+
except Exception as e:
201+
return f"Error reading local file: {str(e)}"
202+
else:
203+
# Otherwise treat as URL
204+
if "*" not in domains and not any(
205+
url.startswith(domain) for domain in domains
206+
):
207+
return (
208+
"Error: URL not allowed. Must start with one of the following domains: "
209+
+ ", ".join(domains)
210+
)
211+
212+
try:
213+
response = await httpx_client.get(url, timeout=timeout)
214+
response.raise_for_status()
215+
return markdownify(response.text)
216+
except (httpx.HTTPStatusError, httpx.RequestError) as e:
217+
return f"Encountered an HTTP error: {str(e)}"
127218

128219
return server

tests/unit_tests/test_main.py

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
"""Tests for mcpdoc.main module."""
2+
3+
import pytest
4+
5+
from mcpdoc.main import (
6+
_get_fetch_description,
7+
_is_http_or_https,
8+
extract_domain,
9+
)
10+
11+
12+
def test_extract_domain() -> None:
    """Test extract_domain function."""
    # Map of input URL -> expected "scheme://netloc/" result.
    cases = {
        "https://example.com/page": "https://example.com/",  # https URL
        "http://test.org/docs/index.html": "http://test.org/",  # http URL
        "https://localhost:8080/api": "https://localhost:8080/",  # port kept
        "https://localhost:8080": "https://localhost:8080/",  # slash appended
        "https://docs.python.org/3/": "https://docs.python.org/",  # subdomain
    }
    for url, expected in cases.items():
        assert extract_domain(url) == expected
28+
29+
30+
@pytest.mark.parametrize(
    "url,expected",
    [
        ("http://example.com", True),
        ("https://example.com", True),
        ("/path/to/file.txt", False),
        ("file:///path/to/file.txt", False),
        # ftp is neither HTTP nor HTTPS, even though it is not a local file
        ("ftp://example.com", False),
    ],
)
def test_is_http_or_https(url, expected):
    """Test _is_http_or_https function."""
    result = _is_http_or_https(url)
    assert result is expected
46+
47+
48+
@pytest.mark.parametrize(
    "has_local_sources,expected_substrings",
    [
        (True, ["local file path", "file://"]),
        (False, ["URL to fetch"]),
    ],
)
def test_get_fetch_description(has_local_sources, expected_substrings):
    """Test _get_fetch_description function.

    The description must always contain the common header/footer text,
    must contain every expected substring for the given configuration,
    and must not advertise local-file support when no local sources
    exist.
    """
    description = _get_fetch_description(has_local_sources)

    # Common assertions for both cases
    assert "Fetch and parse documentation" in description
    assert "Returns:" in description

    # Every expected substring must actually be present. (The previous
    # version only checked this in the has_local_sources=True branch;
    # the False branch compared against an unrelated list and asserted
    # nothing about its own expected substrings.)
    for substring in expected_substrings:
        assert substring in description

    # Local-file wording must not leak into the remote-only description.
    if not has_local_sources:
        for forbidden in ("local file path", "file://"):
            assert forbidden not in description

0 commit comments

Comments
 (0)