Skip to content

Commit c2977b3

Browse files
Add local llms.txt file reading (#14)
Add ability to read llms.txt from local files. --------- Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
1 parent 1bc11f5 commit c2977b3

File tree

4 files changed

+226
-46
lines changed

4 files changed

+226
-46
lines changed

README.md

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,20 @@ curl -LsSf https://astral.sh/uv/install.sh | sh
2020
#### Choose an `llms.txt` file to use.
2121
* For example, [here's](https://langchain-ai.github.io/langgraph/llms.txt) the LangGraph `llms.txt` file.
2222

23+
> **Note: Security and Domain Access Control**
24+
>
25+
> For security reasons, mcpdoc implements strict domain access controls:
26+
>
27+
> 1. **Remote llms.txt files**: When you specify a remote llms.txt URL (e.g., `https://langchain-ai.github.io/langgraph/llms.txt`), mcpdoc automatically adds only that specific domain (`langchain-ai.github.io`) to the allowed domains list. This means the tool can only fetch documentation from URLs on that domain.
28+
>
29+
> 2. **Local llms.txt files**: When using a local file, NO domains are automatically added to the allowed list. You MUST explicitly specify which domains to allow using the `--allowed-domains` parameter.
30+
>
31+
> 3. **Adding additional domains**: To allow fetching from domains beyond those automatically included:
32+
> - Use `--allowed-domains domain1.com domain2.com` to add specific domains
33+
> - Use `--allowed-domains '*'` to allow all domains (use with caution)
34+
>
35+
> This security measure prevents unauthorized access to domains not explicitly approved by the user, ensuring that documentation can only be retrieved from trusted sources.
36+
2337
#### (Optional) Test the MCP server locally with your `llms.txt` file of choice:
2438
```bash
2539
uvx --from mcpdoc mcpdoc \

mcpdoc/cli.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,9 @@ class CustomFormatter(
2525
# Directly specifying llms.txt URLs with optional names
2626
mcpdoc --urls LangGraph:https://langchain-ai.github.io/langgraph/llms.txt
2727
28+
# Using a local file (absolute or relative path)
29+
mcpdoc --urls LocalDocs:/path/to/llms.txt --allowed-domains '*'
30+
2831
# Using a YAML config file
2932
mcpdoc --yaml sample_config.yaml
3033
@@ -72,7 +75,7 @@ def parse_args() -> argparse.Namespace:
7275
"-u",
7376
type=str,
7477
nargs="+",
75-
help="List of llms.txt URLs with optional names (format: 'url' or 'name:url')",
78+
help="List of llms.txt URLs or file paths with optional names (format: 'url_or_path' or 'name:url_or_path')",
7679
)
7780

7881
parser.add_argument(
@@ -84,7 +87,7 @@ def parse_args() -> argparse.Namespace:
8487
"--allowed-domains",
8588
type=str,
8689
nargs="*",
87-
help="Additional allowed domains to fetch documentation from. Use '*' to allow all domains",
90+
help="Additional allowed domains to fetch documentation from. Use '*' to allow all domains.",
8891
)
8992
parser.add_argument(
9093
"--timeout", type=float, default=10.0, help="HTTP request timeout in seconds"
@@ -163,10 +166,11 @@ def load_config_file(file_path: str, file_format: str) -> List[Dict[str, str]]:
163166

164167

165168
def create_doc_sources_from_urls(urls: List[str]) -> List[DocSource]:
166-
"""Create doc sources from a list of URLs with optional names.
169+
"""Create doc sources from a list of URLs or file paths with optional names.
167170
168171
Args:
169-
urls: List of llms.txt URLs with optional names (format: 'url' or 'name:url')
172+
urls: List of llms.txt URLs or file paths with optional names
173+
(format: 'url_or_path' or 'name:url_or_path')
170174
171175
Returns:
172176
List of DocSource objects

mcpdoc/main.py

Lines changed: 133 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
"""MCP Llms-txt server for docs."""
22

3+
import os
34
from urllib.parse import urlparse
45

56
import httpx
@@ -34,8 +35,64 @@ def extract_domain(url: str) -> str:
3435
return f"{parsed.scheme}://{parsed.netloc}/"
3536

3637

38+
def _is_http_or_https(url: str) -> bool:
39+
"""Check if the URL is an HTTP or HTTPS URL."""
40+
return url.startswith(("http:", "https:"))
41+
42+
43+
def _get_fetch_description(has_local_sources: bool) -> str:
44+
"""Get fetch docs tool description."""
45+
description = [
46+
"Fetch and parse documentation from a given URL or local file.",
47+
"",
48+
"Use this tool after list_doc_sources to:",
49+
"1. First fetch the llms.txt file from a documentation source",
50+
"2. Analyze the URLs listed in the llms.txt file",
51+
"3. Then fetch specific documentation pages relevant to the user's question",
52+
"",
53+
]
54+
55+
if has_local_sources:
56+
description.extend(
57+
[
58+
"Args:",
59+
" url: The URL or file path to fetch documentation from. Can be:",
60+
" - URL from an allowed domain",
61+
" - A local file path (absolute or relative)",
62+
" - A file:// URL (e.g., file:///path/to/llms.txt)",
63+
]
64+
)
65+
else:
66+
description.extend(
67+
[
68+
"Args:",
69+
" url: The URL to fetch documentation from.",
70+
]
71+
)
72+
73+
description.extend(
74+
[
75+
"",
76+
"Returns:",
77+
" The fetched documentation content converted to markdown, or an error message", # noqa: E501
78+
" if the request fails or the URL is not from an allowed domain.",
79+
]
80+
)
81+
82+
return "\n".join(description)
83+
84+
85+
def _normalize_path(path: str) -> str:
86+
"""Accept paths in file:/// or relative format and map to absolute paths."""
87+
return (
88+
os.path.abspath(path[7:])
89+
if path.startswith("file://")
90+
else os.path.abspath(path)
91+
)
92+
93+
3794
def create_server(
38-
doc_source: list[DocSource],
95+
doc_sources: list[DocSource],
3996
*,
4097
follow_redirects: bool = False,
4198
timeout: float = 10,
@@ -45,7 +102,7 @@ def create_server(
45102
"""Create the server and generate documentation retrieval tools.
46103
47104
Args:
48-
doc_source: List of documentation sources to make available
105+
doc_sources: List of documentation sources to make available
49106
follow_redirects: Whether to follow HTTP redirects when fetching docs
50107
timeout: HTTP request timeout in seconds
51108
settings: Additional settings to pass to FastMCP
@@ -68,61 +125,95 @@ def create_server(
68125
)
69126
httpx_client = httpx.AsyncClient(follow_redirects=follow_redirects, timeout=timeout)
70127

71-
@server.tool()
72-
def list_doc_sources() -> str:
73-
"""List all available documentation sources.
128+
local_sources = []
129+
remote_sources = []
74130

75-
This is the first tool you should call in the documentation workflow.
76-
It provides URLs to llms.txt files that the user has made available.
131+
for entry in doc_sources:
132+
url = entry["llms_txt"]
133+
if _is_http_or_https(url):
134+
remote_sources.append(entry)
135+
else:
136+
local_sources.append(entry)
77137

78-
Returns:
79-
A string containing a formatted list of documentation sources with their URLs
80-
"""
81-
content = ""
82-
for entry in doc_source:
83-
name = entry.get("name", "") or extract_domain(entry["llms_txt"])
84-
content += f"{name}\n"
85-
content += "URL: " + entry["llms_txt"] + "\n\n"
86-
return content
138+
# Let's verify that all local sources exist
139+
for entry in local_sources:
140+
path = entry["llms_txt"]
141+
abs_path = _normalize_path(path)
142+
if not os.path.exists(abs_path):
143+
raise FileNotFoundError(f"Local file not found: {abs_path}")
87144

88-
# Parse the domain names in the llms.txt URLs
89-
domains = set(extract_domain(entry["llms_txt"]) for entry in doc_source)
145+
# Parse the domain names in the llms.txt URLs and identify local file paths
146+
domains = set(extract_domain(entry["llms_txt"]) for entry in remote_sources)
90147

91-
# Add additional allowed domains if specified
148+
# Add additional allowed domains if specified, or set to '*' if we have local files
92149
if allowed_domains:
93150
if "*" in allowed_domains:
94151
domains = {"*"} # Special marker for allowing all domains
95152
else:
96153
domains.update(allowed_domains)
97154

98-
@server.tool()
99-
async def fetch_docs(url: str) -> str:
100-
"""Fetch and parse documentation from a given URL.
155+
allowed_local_files = set(
156+
_normalize_path(entry["llms_txt"]) for entry in local_sources
157+
)
101158

102-
Use this tool after list_doc_sources to:
103-
1. First fetch the llms.txt file from a documentation source
104-
2. Analyze the URLs listed in the llms.txt file
105-
3. Then fetch specific documentation pages relevant to the user's question
159+
@server.tool()
160+
def list_doc_sources() -> str:
161+
"""List all available documentation sources.
106162
107-
Args:
108-
url: The URL to fetch documentation from. Must be from an allowed domain.
163+
This is the first tool you should call in the documentation workflow.
164+
It provides URLs to llms.txt files or local file paths that the user has made available.
109165
110166
Returns:
111-
The fetched documentation content converted to markdown, or an error message
112-
if the request fails or the URL is not from an allowed domain.
167+
A string containing a formatted list of documentation sources with their URLs or file paths
113168
"""
169+
content = ""
170+
for entry_ in doc_sources:
171+
url_or_path = entry_["llms_txt"]
172+
173+
if _is_http_or_https(url_or_path):
174+
name = entry_.get("name", extract_domain(url_or_path))
175+
content += f"{name}\nURL: {url_or_path}\n\n"
176+
else:
177+
path = _normalize_path(url_or_path)
178+
name = entry_.get("name", path)
179+
content += f"{name}\nPath: {path}\n\n"
180+
return content
181+
182+
fetch_docs_description = _get_fetch_description(
183+
has_local_sources=bool(local_sources)
184+
)
185+
186+
@server.tool(description=fetch_docs_description)
187+
async def fetch_docs(url: str) -> str:
114188
nonlocal domains
115-
if "*" not in domains and not any(url.startswith(domain) for domain in domains):
116-
return (
117-
"Error: URL not allowed. Must start with one of the following domains: "
118-
+ ", ".join(domains)
119-
)
120-
121-
try:
122-
response = await httpx_client.get(url, timeout=timeout)
123-
response.raise_for_status()
124-
return markdownify(response.text)
125-
except (httpx.HTTPStatusError, httpx.RequestError) as e:
126-
return f"Encountered an HTTP error with code {e.response.status_code}"
189+
# Handle local file paths (either as file:// URLs or direct filesystem paths)
190+
if not _is_http_or_https(url):
191+
abs_path = _normalize_path(url)
192+
if abs_path not in allowed_local_files:
193+
raise ValueError(
194+
f"Local file not allowed: {abs_path}. Allowed files: {allowed_local_files}"
195+
)
196+
try:
197+
with open(abs_path, "r", encoding="utf-8") as f:
198+
content = f.read()
199+
return markdownify(content)
200+
except Exception as e:
201+
return f"Error reading local file: {str(e)}"
202+
else:
203+
# Otherwise treat as URL
204+
if "*" not in domains and not any(
205+
url.startswith(domain) for domain in domains
206+
):
207+
return (
208+
"Error: URL not allowed. Must start with one of the following domains: "
209+
+ ", ".join(domains)
210+
)
211+
212+
try:
213+
response = await httpx_client.get(url, timeout=timeout)
214+
response.raise_for_status()
215+
return markdownify(response.text)
216+
except (httpx.HTTPStatusError, httpx.RequestError) as e:
217+
return f"Encountered an HTTP error: {str(e)}"
127218

128219
return server

tests/unit_tests/test_main.py

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
"""Tests for mcpdoc.main module."""
2+
3+
import pytest
4+
5+
from mcpdoc.main import (
6+
_get_fetch_description,
7+
_is_http_or_https,
8+
extract_domain,
9+
)
10+
11+
12+
def test_extract_domain() -> None:
    """Test extract_domain function."""
    # Map of input URL -> expected "scheme://netloc/" result.
    cases = {
        "https://example.com/page": "https://example.com/",  # https URL
        "http://test.org/docs/index.html": "http://test.org/",  # http URL
        "https://localhost:8080/api": "https://localhost:8080/",  # port kept
        "https://localhost:8080": "https://localhost:8080/",  # slash appended
        "https://docs.python.org/3/": "https://docs.python.org/",  # subdomain
    }
    for url, expected in cases.items():
        assert extract_domain(url) == expected
28+
29+
30+
@pytest.mark.parametrize(
    "url,expected",
    [
        ("http://example.com", True),
        ("https://example.com", True),
        ("/path/to/file.txt", False),
        ("file:///path/to/file.txt", False),
        # ftp is neither HTTP nor HTTPS, even though it is not a local file
        ("ftp://example.com", False),
    ],
)
def test_is_http_or_https(url, expected):
    """Test _is_http_or_https function."""
    result = _is_http_or_https(url)
    assert result is expected
46+
47+
48+
@pytest.mark.parametrize(
    "has_local_sources,expected_substrings",
    [
        (True, ["local file path", "file://"]),
        (False, ["URL to fetch"]),
    ],
)
def test_get_fetch_description(has_local_sources, expected_substrings):
    """Test _get_fetch_description function.

    The description must always contain the common header/footer text,
    must contain every expected substring for the given configuration,
    and must not advertise local-file support when no local sources
    exist.
    """
    description = _get_fetch_description(has_local_sources)

    # Common assertions for both cases
    assert "Fetch and parse documentation" in description
    assert "Returns:" in description

    # Every expected substring must actually be present. (The previous
    # version only checked this in the has_local_sources=True branch;
    # the False branch compared against an unrelated list and asserted
    # nothing about its own expected substrings.)
    for substring in expected_substrings:
        assert substring in description

    # Local-file wording must not leak into the remote-only description.
    if not has_local_sources:
        for forbidden in ("local file path", "file://"):
            assert forbidden not in description

0 commit comments

Comments
 (0)