From 63115145ff7374cf77c9f638d93e24d4779db030 Mon Sep 17 00:00:00 2001 From: elseml <60779710+elseml@users.noreply.github.com> Date: Tue, 9 Sep 2025 15:37:57 +0200 Subject: [PATCH 01/12] Add initial llm context drafts --- .github/workflows/build-llm-context.yaml | 40 +++ llm_context/README.md | 24 ++ llm_context/build_llm_context.py | 349 +++++++++++++++++++++++ llm_context/requirements.txt | 3 + 4 files changed, 416 insertions(+) create mode 100644 .github/workflows/build-llm-context.yaml create mode 100644 llm_context/README.md create mode 100644 llm_context/build_llm_context.py create mode 100644 llm_context/requirements.txt diff --git a/.github/workflows/build-llm-context.yaml b/.github/workflows/build-llm-context.yaml new file mode 100644 index 000000000..76a3663c0 --- /dev/null +++ b/.github/workflows/build-llm-context.yaml @@ -0,0 +1,40 @@ +name: Build BayesFlow LLM Context (full + compact) + +on: + workflow_dispatch: + release: + types: [published] + +permissions: + contents: write + +jobs: + build: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 1 + + - name: Setup Python + uses: actions/setup-python@v4 + with: + python-version: '3.10' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r llm_context/requirements.txt + + - name: Build artifacts + run: | + python llm_context/build_llm_context.py --tag "${{ github.event.release.tag_name }}" + + - name: Upload artifacts to release + uses: softprops/action-gh-release@v2 + with: + files: | + llm_context/bayesflow-context-full-${{ github.event.release.tag_name }}.md + llm_context/bayesflow-context-compact-${{ github.event.release.tag_name }}.md + llm_context/bayesflow-context-${{ github.event.release.tag_name }}.manifest.json diff --git a/llm_context/README.md b/llm_context/README.md new file mode 100644 index 000000000..e5993d565 --- /dev/null +++ b/llm_context/README.md @@ -0,0 +1,24 @@ +# BayesFlow LLM Context + +This folder contains single-file context artifacts to improve LLM assistance for BayesFlow. + +## Files +- `bayesflow-context-full-.md` + Full Markdown snapshot: README, examples (converted to Markdown), and all `bayesflow/` code. + +- `bayesflow-context-compact-.md` + Smaller snapshot: README + examples fully, `bayesflow/` code partially (truncated previews). + +- `bayesflow-context-.manifest.json` + Metadata (tag, commit, dependencies, file sizes). + +## Usage +1. Download either the full or compact file for the release tag of interest: The compact file is cheaper and faster; the full file is most accurate. +2. Paste it into your LLM context before asking questions about BayesFlow. + +## Prompt Tip +You are answering questions about BayesFlow using only the provided context .md file. If using code, cite the file or notebook name shown in the context. +QUESTION: + +## Disclaimer +The context files are generated automatically and may be outdated or incomplete. While they aim at improving LLM accuracy, hallucinations may still occur frequently during LLM assistance. Please always refer to the official BayesFlow documentation and codebase for the most accurate information. diff --git a/llm_context/build_llm_context.py b/llm_context/build_llm_context.py new file mode 100644 index 000000000..ef366337e --- /dev/null +++ b/llm_context/build_llm_context.py @@ -0,0 +1,349 @@ +""" +Build BayesFlow LLM context files (full + compact). 
+ +Artifacts written to llm_context/: +- bayesflow-context-full-.md +- bayesflow-context-compact-.md +- bayesflow-context-.manifest.json + +Strategy: +- Convert notebooks in examples/ to Markdown (temporary, not committed). +- Run repomix on bayesflow/, examples/, README.md. +- Compact file: README + examples fully, bayesflow/ truncated unless short. +- Both files include a short dependency summary from pyproject.toml. +""" + +from __future__ import annotations +import argparse +import datetime +import json +import logging +import os +import subprocess +import tempfile +from pathlib import Path +import re +import nbformat +from typing import List, Optional, Tuple + +# Configuration +ROOT = Path(".").resolve() +OUT_DIR = Path("llm_context") +INCLUDE_FOLDERS = ("bayesflow/",) +INCLUDE_FILES = ("README.md",) +PYPROJECT = Path("pyproject.toml") +HEADING_RE = re.compile(r"^\s#{2,}\s*(?:FILE:\s*)?(?P.+?)\s*$", flags=re.MULTILINE) +TOKEN_CHAR_RATIO = 4 + +logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") + + +# Utilities +def run(cmd: List[str], input_text: Optional[str] = None) -> str: + """ + Run a shell command and capture stdout. + + Parameters + ---------- + cmd : list of str + Command and arguments. + input_text : str, optional + Text passed to stdin. + + Returns + ------- + str + Captured stdout. + + Raises + ------ + RuntimeError + If the command exits with a non-zero status. + """ + res = subprocess.run(cmd, check=False, text=True, input=input_text, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + if res.returncode != 0: + raise RuntimeError(f"Command failed: {' '.join(cmd)}\n{res.stderr.strip()}") + return res.stdout + + +def token_estimate(text: str) -> int: + """ + Roughly estimate token count for text. + + Parameters + ---------- + text : str + Input text. + + Returns + ------- + int + Estimated token count. + """ + return max(1, len(text) // TOKEN_CHAR_RATIO) + + +def load_dependency_summary(pyproject: Path = PYPROJECT) -> List[str]: + """ + Extract dependencies from pyproject.toml. + + Parameters + ---------- + pyproject : Path + Path to pyproject.toml. + + Returns + ------- + list of str + Dependency strings, or empty list if not available. + """ + if not pyproject.exists(): + return [] + try: + import tomllib as _toml # Python 3.11+ + except Exception: + import tomli as _toml # Fallback + try: + data = _toml.loads(pyproject.read_text(encoding="utf8")) + except Exception: + return [] + proj = data.get("project", {}) or {} + raw = proj.get("dependencies", []) or [] + return [d.split(";")[0].strip() for d in raw if isinstance(d, str)] + + +# Notebook conversion +def notebook_to_md(nb_path: Path) -> str: + """ + Convert Jupyter notebook to Markdown. + + Parameters + ---------- + nb_path : Path + Path to .ipynb file. + + Returns + ------- + str + Markdown text with markdown and code cells. + """ + nb = nbformat.read(str(nb_path), as_version=4) + out: List[str] = [f"# Notebook: {nb_path.name}", ""] + for cell in nb.cells: + src = "".join(cell.get("source", "")) if isinstance(cell.get("source", ""), list) else cell.get("source", "") + if cell.get("cell_type") == "markdown": + out.append(src.rstrip()) + out.append("") + elif cell.get("cell_type") == "code" and src.strip(): + out.extend(["```python", src.strip("\n"), "```", ""]) + return "\n".join(out).rstrip() + "\n" + + +def convert_examples_to_md(src: Path, out: Path) -> List[Path]: + """ + Convert all .ipynb notebooks in a directory tree to Markdown. 
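For instance, a tiny notebook converted with the helpers above round-trips roughly as follows (an illustrative sketch; the notebook name is hypothetical):

```python
# Sketch: build a two-cell notebook with nbformat, then convert it with notebook_to_md() above.
from pathlib import Path
import nbformat

nb = nbformat.v4.new_notebook(cells=[
    nbformat.v4.new_markdown_cell("## Intro"),
    nbformat.v4.new_code_cell("print('hello')"),
])
nbformat.write(nb, "tiny.ipynb")
md = notebook_to_md(Path("tiny.ipynb"))
# md starts with "# Notebook: tiny.ipynb", keeps the markdown cell text,
# and wraps the code cell in a fenced python block.
```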
+ + Parameters + ---------- + src : Path + Source examples/ directory. + out : Path + Destination directory for converted .md files. + + Returns + ------- + list of Path + List of generated Markdown file paths. + """ + created: List[Path] = [] + if not src.exists(): + return created + out.mkdir(parents=True, exist_ok=True) + for nb in sorted(src.rglob("*.ipynb")): + try: + dst = out / (nb.stem + ".md") + dst.write_text(notebook_to_md(nb), encoding="utf8") + created.append(dst) + logging.info("Converted %s -> %s", nb, dst) + except Exception as e: + logging.warning("Failed to convert %s: %s", nb, e) + return created + + +# Context generation +def run_repomix_on_paths(paths: List[str], style: str = "markdown") -> str: + """ + Run repomix on given paths. + + Parameters + ---------- + paths : list of str + Relative paths to include. + style : str + Output style, default 'markdown'. + + Returns + ------- + str + Repomix output. + """ + cmd = ["repomix", "--style", style, "--stdin", "--stdout"] + return run(cmd, input_text="\n".join(paths) + "\n") + + +def generate_compact(full_text: str, tag: str, repo_root: Path, conv_examples_dir: Path) -> str: + """ + Create compact context file from full repomix output. + + Parameters + ---------- + full_text : str + Full repomix output. + tag : str + Release tag. + repo_root : Path + Repository root. + conv_examples_dir : Path + Path to temporary converted examples. + + Returns + ------- + str + Compact context content. + """ + lines = full_text.splitlines(keepends=True) + sections: List[Tuple[str, int, int]] = [] + cur_path: Optional[str] = None + cur_start = 0 + for i, line in enumerate(lines): + m = HEADING_RE.match(line) + if m: + if cur_path is not None: + sections.append((cur_path, cur_start, i)) + cur_path = m.group("path").strip() + cur_start = i + 1 + if cur_path is not None: + sections.append((cur_path, cur_start, len(lines))) + + if not sections: + return f"\n\n{''.join(lines[:40])}" + + out_lines: List[str] = [f"\n\n"] + preview_lines = 40 + max_keep_tokens = 1200 + for idx, (path, s, e) in enumerate(sections, start=1): + seg = "".join(lines[s:e]) + path_lower = path.lower() + keep_full = ( + path_lower.endswith("readme.md") + or path_lower.startswith("examples") + or (conv_examples_dir.exists() and (conv_examples_dir / Path(path).name).exists()) + or (path.startswith("bayesflow") and token_estimate(seg) <= max_keep_tokens) + ) + out_lines.append(f"## {path} \n\n") + if keep_full: + out_lines.append(seg + "\n") + else: + out_lines.append("".join(seg.splitlines(keepends=True)[:preview_lines])) + out_lines.append(f"\n> [TRUNCATED] See full file for `{path}` lines {s + 1}-{e}.\n\n") + return "".join(out_lines) + + +# Build pipeline +def build(tag: Optional[str], out_dir: Path): + """ + Generate full + compact context files and manifest. + + Parameters + ---------- + tag : str or None + Release tag. If None, inferred from environment or commit hash. + out_dir : Path + Destination directory. 
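An illustration of the compaction threshold applied by generate_compact() above (a sketch; the numbers mirror that function's defaults):

```python
# A large bayesflow/ section exceeds max_keep_tokens=1200, so only a 40-line preview survives.
seg = "def f():\n    return 1\n" * 300      # roughly 6600 characters
print(token_estimate(seg))                  # 1650 estimated tokens -> keep_full is False
# generate_compact() then emits the first 40 lines of the section plus a [TRUNCATED] note.
```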
+ + Returns + ------- + tuple of Path + (full_file, compact_file, manifest_file) + """ + out_dir.mkdir(parents=True, exist_ok=True) + deps = load_dependency_summary(PYPROJECT) + dep_md = "**Dependency summary:**\n" + "\n".join(f"- {d}" for d in deps) + "\n\n" if deps else "" + + with tempfile.TemporaryDirectory(prefix="bf-conv-") as tmp: + tmp_path = Path(tmp) + convert_examples_to_md(ROOT / "examples", tmp_path) + repomix_inputs = [str(p) for p in INCLUDE_FOLDERS if (ROOT / p).exists()] + if tmp_path.exists(): + repomix_inputs.append(str(tmp_path)) + for f in INCLUDE_FILES: + if (ROOT / f).exists(): + repomix_inputs.append(f) + repomix_out = run_repomix_on_paths(repomix_inputs, style="markdown") + + try: + commit = run(["git", "rev-parse", "HEAD"]).strip() + except Exception: + commit = None + + tag = ( + tag + or os.environ.get("RELEASE_TAG") + or (commit[:7] if commit else datetime.datetime.utcnow().strftime("%Y%m%d")) + ) + header = { + "artifact": f"bayesflow-context-full-{tag}.md", + "tag": tag, + "commit": commit, + "generated_at": datetime.datetime.utcnow().isoformat() + "Z", + } + header_block = ["---"] + [f"{k}: {v}" for k, v in header.items() if v] + ["---", ""] + + full_text = "\n".join(header_block) + dep_md + repomix_out + full_path = out_dir / f"bayesflow-context-full-{tag}.md" + full_path.write_text(full_text, encoding="utf8") + + compact_text = generate_compact(repomix_out, tag, ROOT, tmp_path) + compact_path = out_dir / f"bayesflow-context-compact-{tag}.md" + compact_path.write_text("\n".join(header_block) + dep_md + compact_text, encoding="utf8") + + manifest = { + "tag": tag, + "commit": commit, + "generated_at": header["generated_at"], + "dependency_summary": deps, + "files": { + full_path.name: {"size_bytes": full_path.stat().st_size}, + compact_path.name: {"size_bytes": compact_path.stat().st_size}, + }, + } + manifest_path = out_dir / f"bayesflow-context-{tag}.manifest.json" + manifest_path.write_text(json.dumps(manifest, indent=2), encoding="utf8") + + logging.info("Built artifacts: %s, %s, %s", full_path, compact_path, manifest_path) + return full_path, compact_path, manifest_path + + +def main(argv=None): + """ + CLI entrypoint. + + Parameters + ---------- + argv : list of str, optional + Command-line arguments. + + Returns + ------- + int + Exit status code. + """ + parser = argparse.ArgumentParser(description="Build BayesFlow LLM context (full + compact).") + parser.add_argument("--tag", type=str, default=None) + args = parser.parse_args(argv) + build(args.tag, OUT_DIR) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/llm_context/requirements.txt b/llm_context/requirements.txt new file mode 100644 index 000000000..91799b7c0 --- /dev/null +++ b/llm_context/requirements.txt @@ -0,0 +1,3 @@ +repomix +nbformat +tomli From 0694609b630d1d2cddc77f47ff6d4bda0f02d762 Mon Sep 17 00:00:00 2001 From: elseml Date: Fri, 12 Sep 2025 10:59:20 +0200 Subject: [PATCH 02/12] Add llm context minimal working version --- llm_context/build_llm_context.py | 492 ++++++++++--------------------- 1 file changed, 157 insertions(+), 335 deletions(-) diff --git a/llm_context/build_llm_context.py b/llm_context/build_llm_context.py index ef366337e..2d0a6f62f 100644 --- a/llm_context/build_llm_context.py +++ b/llm_context/build_llm_context.py @@ -1,349 +1,171 @@ +#!/usr/bin/env python3 """ -Build BayesFlow LLM context files (full + compact). 
+Build two repomix LLM-context files, but write converted .md files into a temporary directory +so the real examples/ folder is never modified. -Artifacts written to llm_context/: -- bayesflow-context-full-.md -- bayesflow-context-compact-.md -- bayesflow-context-.manifest.json - -Strategy: -- Convert notebooks in examples/ to Markdown (temporary, not committed). -- Run repomix on bayesflow/, examples/, README.md. -- Compact file: README + examples fully, bayesflow/ truncated unless short. -- Both files include a short dependency summary from pyproject.toml. + - llm_context/llm_context_compact.md -> examples only (from temp dir) + - llm_context/llm_context_full.md -> examples (temp dir) + bayesflow source code """ - -from __future__ import annotations -import argparse -import datetime import json -import logging -import os import subprocess -import tempfile from pathlib import Path -import re -import nbformat -from typing import List, Optional, Tuple - -# Configuration -ROOT = Path(".").resolve() -OUT_DIR = Path("llm_context") -INCLUDE_FOLDERS = ("bayesflow/",) -INCLUDE_FILES = ("README.md",) -PYPROJECT = Path("pyproject.toml") -HEADING_RE = re.compile(r"^\s#{2,}\s*(?:FILE:\s*)?(?P.+?)\s*$", flags=re.MULTILINE) -TOKEN_CHAR_RATIO = 4 - -logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") - - -# Utilities -def run(cmd: List[str], input_text: Optional[str] = None) -> str: - """ - Run a shell command and capture stdout. - - Parameters - ---------- - cmd : list of str - Command and arguments. - input_text : str, optional - Text passed to stdin. - - Returns - ------- - str - Captured stdout. - - Raises - ------ - RuntimeError - If the command exits with a non-zero status. - """ - res = subprocess.run(cmd, check=False, text=True, input=input_text, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - if res.returncode != 0: - raise RuntimeError(f"Command failed: {' '.join(cmd)}\n{res.stderr.strip()}") - return res.stdout - - -def token_estimate(text: str) -> int: - """ - Roughly estimate token count for text. - - Parameters - ---------- - text : str - Input text. - - Returns - ------- - int - Estimated token count. - """ - return max(1, len(text) // TOKEN_CHAR_RATIO) - - -def load_dependency_summary(pyproject: Path = PYPROJECT) -> List[str]: - """ - Extract dependencies from pyproject.toml. - - Parameters - ---------- - pyproject : Path - Path to pyproject.toml. - - Returns - ------- - list of str - Dependency strings, or empty list if not available. 
- """ - if not pyproject.exists(): - return [] - try: - import tomllib as _toml # Python 3.11+ - except Exception: - import tomli as _toml # Fallback +import tempfile +import sys +import shutil + +base_dir = Path(__file__).parent.parent.resolve() +print("base_dir:", base_dir) + +examples_dir = base_dir / "examples" +src_dir = base_dir / "bayesflow" +readme_file = base_dir / "README.md" +output_dir = base_dir / "llm_context" +compact_output_file = output_dir / "llm_context_compact.md" +full_output_file = output_dir / "llm_context_full.md" + +# Ensure output directory exists +output_dir.mkdir(parents=True, exist_ok=True) + +# Safety checks +if not examples_dir.exists(): + print(f"ERROR: examples directory not found: {examples_dir}", file=sys.stderr) + raise SystemExit(1) +if not src_dir.exists(): + print(f"WARNING: bayesflow source directory not found: {src_dir} -- full context will be skipped.", file=sys.stderr) + +def convert_notebooks_to_md_in_temp(src_examples_dir: Path, temp_examples_dir: Path): + """ + Convert .ipynb files to .md and write them into temp_examples_dir. + Returns: + - list of Path objects (absolute) to the markdown files created (for repomix input) + - list of actual file paths created (for cleanup) + """ + created_paths = [] + md_paths = [] + + for ipynb_file in sorted(src_examples_dir.glob("*.ipynb")): + with open(ipynb_file, "r", encoding="utf-8") as f: + notebook = json.load(f) + + parts = [] + for cell in notebook.get("cells", []): + if cell.get("cell_type") == "markdown": + parts.append("".join(cell.get("source", []))) + elif cell.get("cell_type") == "code": + parts.append("```python\n" + "".join(cell.get("source", [])) + "\n```") + + # write into the temporary examples directory (never into the real examples/) + md_file = temp_examples_dir / f"{ipynb_file.stem}.md" + + # ensure unique name just in case (temp dir typically empty, but keep behaviour consistent) + if md_file.exists(): + i = 1 + while True: + candidate = temp_examples_dir / f"{ipynb_file.stem}.repomix.{i}.md" + if not candidate.exists(): + md_file = candidate + break + i += 1 + + with open(md_file, "w", encoding="utf-8") as f: + f.write("\n\n".join(parts)) + + created_paths.append(md_file) + md_paths.append(md_file.resolve()) + + print("Created temporary md:", md_file) + + return md_paths, created_paths + +def collect_bayesflow_py_abs_paths(src_bayesflow_dir: Path): + """Return a sorted list of absolute Paths for all .py files in src_bayesflow_dir.""" + return sorted(p.resolve() for p in sorted(src_bayesflow_dir.rglob("*.py"))) + +def run_repomix_with_file_list(file_paths, output_path, repo_cwd, include_patterns="**/*.py,**/*.md"): + """Run repomix (cwd=repo_cwd) with --stdin reading newline-separated paths (absolute or relative).""" + if not file_paths: + print(f"No files provided for repomix output {output_path}. 
Skipping.", file=sys.stderr) + return + + cmd = [ + "repomix", + "--style", "markdown", + "--stdin", + "--include", include_patterns, + "--ignore", "bayesflow/experimental/", + "-o", str(output_path), + ] + print(f"Running repomix in cwd={repo_cwd}: {' '.join(cmd)}") + print(f" -> {len(file_paths)} files (showing up to 20):") + for p in file_paths[:20]: + print(" ", str(p)) + + stdin_input = "\n".join(str(p) for p in file_paths) + "\n" + subprocess.run(cmd, input=stdin_input, text=True, check=True, cwd=str(repo_cwd)) + print(f"✅ Repomix packaged output saved to {output_path}") + +# --- Main flow --- +# Create a temporary examples directory *under the repo root* so repomix can use relative paths if it wants. +temp_dir_path = None +created_files = [] + +try: + temp_dir = tempfile.mkdtemp(prefix=".examples_temporary_", dir=str(base_dir)) + temp_examples_dir = Path(temp_dir) + temp_dir_path = temp_examples_dir + print("Using temporary examples dir:", temp_examples_dir) + + # Convert notebooks into the temp folder (no changes in the real examples/ directory) + md_abs_paths, created_files = convert_notebooks_to_md_in_temp(examples_dir, temp_examples_dir) + if not md_abs_paths: + print("ERROR: No example notebooks (*.ipynb) found or conversion produced no markdown files.", file=sys.stderr) + raise SystemExit(1) + + # For repomix we can pass relative paths (relative to repo root) — convert if possible try: - data = _toml.loads(pyproject.read_text(encoding="utf8")) + md_rel_for_repomix = [p.relative_to(base_dir) for p in md_abs_paths] except Exception: - return [] - proj = data.get("project", {}) or {} - raw = proj.get("dependencies", []) or [] - return [d.split(";")[0].strip() for d in raw if isinstance(d, str)] - + # fallback to absolute paths if relative conversion fails + md_rel_for_repomix = md_abs_paths -# Notebook conversion -def notebook_to_md(nb_path: Path) -> str: - """ - Convert Jupyter notebook to Markdown. - - Parameters - ---------- - nb_path : Path - Path to .ipynb file. + # Include README if present (use relative path so repomix sees it correctly) + if readme_file.exists(): + print("Including top-level README.md in repomix inputs") + md_rel_for_repomix.append(Path("README.md")) - Returns - ------- - str - Markdown text with markdown and code cells. - """ - nb = nbformat.read(str(nb_path), as_version=4) - out: List[str] = [f"# Notebook: {nb_path.name}", ""] - for cell in nb.cells: - src = "".join(cell.get("source", "")) if isinstance(cell.get("source", ""), list) else cell.get("source", "") - if cell.get("cell_type") == "markdown": - out.append(src.rstrip()) - out.append("") - elif cell.get("cell_type") == "code" and src.strip(): - out.extend(["```python", src.strip("\n"), "```", ""]) - return "\n".join(out).rstrip() + "\n" + # ---- Compact: examples only ---- + run_repomix_with_file_list(md_rel_for_repomix, compact_output_file, repo_cwd=base_dir, include_patterns="**/*.md") - -def convert_examples_to_md(src: Path, out: Path) -> List[Path]: - """ - Convert all .ipynb notebooks in a directory tree to Markdown. - - Parameters - ---------- - src : Path - Source examples/ directory. - out : Path - Destination directory for converted .md files. - - Returns - ------- - list of Path - List of generated Markdown file paths. 
- """ - created: List[Path] = [] - if not src.exists(): - return created - out.mkdir(parents=True, exist_ok=True) - for nb in sorted(src.rglob("*.ipynb")): + # ---- Full: examples + bayesflow .py files ---- + if src_dir.exists(): + py_abs_paths = collect_bayesflow_py_abs_paths(src_dir) + # convert py paths to relative if possible + try: + py_rel_for_repomix = [p.relative_to(base_dir) for p in py_abs_paths] + except Exception: + py_rel_for_repomix = py_abs_paths + + full_list = md_rel_for_repomix + py_rel_for_repomix + run_repomix_with_file_list(full_list, full_output_file, repo_cwd=base_dir, include_patterns="**/*.py,**/*.md") + else: + print("Skipping creation of full context because bayesflow directory was not found.", file=sys.stderr) + +finally: + # Clean up only the temporary files / dir we created + if created_files: + for p in created_files: + try: + if p.exists(): + p.unlink() + print("Removed temporary md:", p) + except Exception as e: + print(f"Warning: failed to remove {p}: {e}", file=sys.stderr) + + if temp_dir_path and temp_dir_path.exists(): try: - dst = out / (nb.stem + ".md") - dst.write_text(notebook_to_md(nb), encoding="utf8") - created.append(dst) - logging.info("Converted %s -> %s", nb, dst) + shutil.rmtree(temp_dir_path) + print("Removed temporary directory:", temp_dir_path) except Exception as e: - logging.warning("Failed to convert %s: %s", nb, e) - return created - - -# Context generation -def run_repomix_on_paths(paths: List[str], style: str = "markdown") -> str: - """ - Run repomix on given paths. - - Parameters - ---------- - paths : list of str - Relative paths to include. - style : str - Output style, default 'markdown'. - - Returns - ------- - str - Repomix output. - """ - cmd = ["repomix", "--style", style, "--stdin", "--stdout"] - return run(cmd, input_text="\n".join(paths) + "\n") - - -def generate_compact(full_text: str, tag: str, repo_root: Path, conv_examples_dir: Path) -> str: - """ - Create compact context file from full repomix output. - - Parameters - ---------- - full_text : str - Full repomix output. - tag : str - Release tag. - repo_root : Path - Repository root. - conv_examples_dir : Path - Path to temporary converted examples. - - Returns - ------- - str - Compact context content. 
- """ - lines = full_text.splitlines(keepends=True) - sections: List[Tuple[str, int, int]] = [] - cur_path: Optional[str] = None - cur_start = 0 - for i, line in enumerate(lines): - m = HEADING_RE.match(line) - if m: - if cur_path is not None: - sections.append((cur_path, cur_start, i)) - cur_path = m.group("path").strip() - cur_start = i + 1 - if cur_path is not None: - sections.append((cur_path, cur_start, len(lines))) - - if not sections: - return f"\n\n{''.join(lines[:40])}" - - out_lines: List[str] = [f"\n\n"] - preview_lines = 40 - max_keep_tokens = 1200 - for idx, (path, s, e) in enumerate(sections, start=1): - seg = "".join(lines[s:e]) - path_lower = path.lower() - keep_full = ( - path_lower.endswith("readme.md") - or path_lower.startswith("examples") - or (conv_examples_dir.exists() and (conv_examples_dir / Path(path).name).exists()) - or (path.startswith("bayesflow") and token_estimate(seg) <= max_keep_tokens) - ) - out_lines.append(f"## {path} \n\n") - if keep_full: - out_lines.append(seg + "\n") - else: - out_lines.append("".join(seg.splitlines(keepends=True)[:preview_lines])) - out_lines.append(f"\n> [TRUNCATED] See full file for `{path}` lines {s + 1}-{e}.\n\n") - return "".join(out_lines) - - -# Build pipeline -def build(tag: Optional[str], out_dir: Path): - """ - Generate full + compact context files and manifest. - - Parameters - ---------- - tag : str or None - Release tag. If None, inferred from environment or commit hash. - out_dir : Path - Destination directory. - - Returns - ------- - tuple of Path - (full_file, compact_file, manifest_file) - """ - out_dir.mkdir(parents=True, exist_ok=True) - deps = load_dependency_summary(PYPROJECT) - dep_md = "**Dependency summary:**\n" + "\n".join(f"- {d}" for d in deps) + "\n\n" if deps else "" - - with tempfile.TemporaryDirectory(prefix="bf-conv-") as tmp: - tmp_path = Path(tmp) - convert_examples_to_md(ROOT / "examples", tmp_path) - repomix_inputs = [str(p) for p in INCLUDE_FOLDERS if (ROOT / p).exists()] - if tmp_path.exists(): - repomix_inputs.append(str(tmp_path)) - for f in INCLUDE_FILES: - if (ROOT / f).exists(): - repomix_inputs.append(f) - repomix_out = run_repomix_on_paths(repomix_inputs, style="markdown") - - try: - commit = run(["git", "rev-parse", "HEAD"]).strip() - except Exception: - commit = None - - tag = ( - tag - or os.environ.get("RELEASE_TAG") - or (commit[:7] if commit else datetime.datetime.utcnow().strftime("%Y%m%d")) - ) - header = { - "artifact": f"bayesflow-context-full-{tag}.md", - "tag": tag, - "commit": commit, - "generated_at": datetime.datetime.utcnow().isoformat() + "Z", - } - header_block = ["---"] + [f"{k}: {v}" for k, v in header.items() if v] + ["---", ""] - - full_text = "\n".join(header_block) + dep_md + repomix_out - full_path = out_dir / f"bayesflow-context-full-{tag}.md" - full_path.write_text(full_text, encoding="utf8") - - compact_text = generate_compact(repomix_out, tag, ROOT, tmp_path) - compact_path = out_dir / f"bayesflow-context-compact-{tag}.md" - compact_path.write_text("\n".join(header_block) + dep_md + compact_text, encoding="utf8") - - manifest = { - "tag": tag, - "commit": commit, - "generated_at": header["generated_at"], - "dependency_summary": deps, - "files": { - full_path.name: {"size_bytes": full_path.stat().st_size}, - compact_path.name: {"size_bytes": compact_path.stat().st_size}, - }, - } - manifest_path = out_dir / f"bayesflow-context-{tag}.manifest.json" - manifest_path.write_text(json.dumps(manifest, indent=2), encoding="utf8") - - logging.info("Built artifacts: %s, 
%s, %s", full_path, compact_path, manifest_path) - return full_path, compact_path, manifest_path - - -def main(argv=None): - """ - CLI entrypoint. - - Parameters - ---------- - argv : list of str, optional - Command-line arguments. - - Returns - ------- - int - Exit status code. - """ - parser = argparse.ArgumentParser(description="Build BayesFlow LLM context (full + compact).") - parser.add_argument("--tag", type=str, default=None) - args = parser.parse_args(argv) - build(args.tag, OUT_DIR) - return 0 - + print(f"Warning: failed to remove temporary directory {temp_dir_path}: {e}", file=sys.stderr) -if __name__ == "__main__": - raise SystemExit(main()) +print("Done.") From e88f858aebd59ce6e098375f265e0582c08affda Mon Sep 17 00:00:00 2001 From: elseml Date: Fri, 12 Sep 2025 11:20:31 +0200 Subject: [PATCH 03/12] Exclude experimental folders from llm context --- llm_context/build_llm_context.py | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/llm_context/build_llm_context.py b/llm_context/build_llm_context.py index 2d0a6f62f..3b054134f 100644 --- a/llm_context/build_llm_context.py +++ b/llm_context/build_llm_context.py @@ -1,10 +1,11 @@ #!/usr/bin/env python3 """ -Build two repomix LLM-context files, but write converted .md files into a temporary directory -so the real examples/ folder is never modified. +Builds two repomix LLM-context files: - - llm_context/llm_context_compact.md -> examples only (from temp dir) - - llm_context/llm_context_full.md -> examples (temp dir) + bayesflow source code + - llm_context/llm_context_compact.md -> README + examples only + - llm_context/llm_context_full.md -> README + examples + bayesflow source code + + .ipynb files from examples/ are temporarily converted to .md for clean repomix conversion. 
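A local run might look like the following sketch (it assumes repomix is installed and on PATH and that the command is issued from the repository root; at this stage the script executes its whole pipeline at module level):

```python
# Hypothetical local invocation of this builder.
import subprocess

subprocess.run(["python", "llm_context/build_llm_context.py"], check=True)
# Expected outputs: llm_context/llm_context_compact.md and llm_context/llm_context_full.md
```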
""" import json import subprocess @@ -23,6 +24,8 @@ compact_output_file = output_dir / "llm_context_compact.md" full_output_file = output_dir / "llm_context_full.md" +EXCLUDED_DIR_NAMES = ["experimental"] + # Ensure output directory exists output_dir.mkdir(parents=True, exist_ok=True) @@ -77,9 +80,15 @@ def convert_notebooks_to_md_in_temp(src_examples_dir: Path, temp_examples_dir: P return md_paths, created_paths -def collect_bayesflow_py_abs_paths(src_bayesflow_dir: Path): - """Return a sorted list of absolute Paths for all .py files in src_bayesflow_dir.""" - return sorted(p.resolve() for p in sorted(src_bayesflow_dir.rglob("*.py"))) +def collect_bayesflow_py_abs_paths(src_bayesflow_dir: Path, excluded_dir_names=EXCLUDED_DIR_NAMES): + """Return a sorted list of absolute Paths for all .py files in src_bayesflow_dir, + excluding any found under directories whose name is in excluded_dir_names (at any depth).""" + excluded = set(excluded_dir_names) + return sorted( + p.resolve() + for p in src_bayesflow_dir.rglob("*.py") + if not any(parent.name in excluded for parent in p.parents) + ) def run_repomix_with_file_list(file_paths, output_path, repo_cwd, include_patterns="**/*.py,**/*.md"): """Run repomix (cwd=repo_cwd) with --stdin reading newline-separated paths (absolute or relative).""" @@ -92,7 +101,6 @@ def run_repomix_with_file_list(file_paths, output_path, repo_cwd, include_patter "--style", "markdown", "--stdin", "--include", include_patterns, - "--ignore", "bayesflow/experimental/", "-o", str(output_path), ] print(f"Running repomix in cwd={repo_cwd}: {' '.join(cmd)}") From defdb94df001eadbe21836ae9003e25599063149 Mon Sep 17 00:00:00 2001 From: elseml Date: Fri, 12 Sep 2025 11:58:46 +0200 Subject: [PATCH 04/12] Cleanup llm context generation --- .gitignore | 2 + llm_context/README.md | 11 +- llm_context/build_llm_context.py | 303 +++++++++++++++++++------------ 3 files changed, 191 insertions(+), 125 deletions(-) diff --git a/.gitignore b/.gitignore index 1ca9eaef6..b9fceff4e 100644 --- a/.gitignore +++ b/.gitignore @@ -13,6 +13,8 @@ docsrc/source/contributing.md examples/checkpoints/ build docs/ +llm_context/llm_context_compact.md +llm_context/llm_context_full.md # mypy diff --git a/llm_context/README.md b/llm_context/README.md index e5993d565..ca4d795b5 100644 --- a/llm_context/README.md +++ b/llm_context/README.md @@ -3,17 +3,14 @@ This folder contains single-file context artifacts to improve LLM assistance for BayesFlow. ## Files -- `bayesflow-context-full-.md` - Full Markdown snapshot: README, examples (converted to Markdown), and all `bayesflow/` code. - - `bayesflow-context-compact-.md` - Smaller snapshot: README + examples fully, `bayesflow/` code partially (truncated previews). + Smaller snapshot: README + examples -- `bayesflow-context-.manifest.json` - Metadata (tag, commit, dependencies, file sizes). +- `bayesflow-context-full-.md` + Full Markdown snapshot: README + examples + source code (`bayesflow/`). ## Usage -1. Download either the full or compact file for the release tag of interest: The compact file is cheaper and faster; the full file is most accurate. +1. Download either the compact or full file for the release tag of interest: The compact file is cheaper and faster; the full file is most accurate. 2. Paste it into your LLM context before asking questions about BayesFlow. 
## Prompt Tip diff --git a/llm_context/build_llm_context.py b/llm_context/build_llm_context.py index 3b054134f..e1bfb4d47 100644 --- a/llm_context/build_llm_context.py +++ b/llm_context/build_llm_context.py @@ -1,25 +1,26 @@ #!/usr/bin/env python3 """ -Builds two repomix LLM-context files: +Build two Repomix LLM-context files: - - llm_context/llm_context_compact.md -> README + examples only - - llm_context/llm_context_full.md -> README + examples + bayesflow source code +- llm_context/llm_context_compact.md -> README + examples only +- llm_context/llm_context_full.md -> README + examples + bayesflow source code - .ipynb files from examples/ are temporarily converted to .md for clean repomix conversion. +Example notebooks (.ipynb) are converted to temporary Markdown files for clean Repomix conversion. """ +from __future__ import annotations + import json import subprocess -from pathlib import Path -import tempfile import sys -import shutil +import tempfile +from pathlib import Path +from typing import Iterable, List, Sequence +# --- Paths and config --- base_dir = Path(__file__).parent.parent.resolve() -print("base_dir:", base_dir) - +readme_file = base_dir / "README.md" examples_dir = base_dir / "examples" src_dir = base_dir / "bayesflow" -readme_file = base_dir / "README.md" output_dir = base_dir / "llm_context" compact_output_file = output_dir / "llm_context_compact.md" full_output_file = output_dir / "llm_context_full.md" @@ -36,144 +37,210 @@ if not src_dir.exists(): print(f"WARNING: bayesflow source directory not found: {src_dir} -- full context will be skipped.", file=sys.stderr) -def convert_notebooks_to_md_in_temp(src_examples_dir: Path, temp_examples_dir: Path): + +def convert_notebooks_to_md_in_temp(src_examples_dir: Path, temp_examples_dir: Path) -> List[Path]: """ - Convert .ipynb files to .md and write them into temp_examples_dir. - Returns: - - list of Path objects (absolute) to the markdown files created (for repomix input) - - list of actual file paths created (for cleanup) + Convert Jupyter notebooks (*.ipynb) in a source directory to Markdown files. + + Notes are saved into a temporary examples directory, leaving the original examples/ + untouched. Markdown files are created with code cells fenced as Python blocks. + + Parameters + ---------- + src_examples_dir : Path + Directory containing the source *.ipynb notebooks (non-recursive). + temp_examples_dir : Path + Temporary directory where the generated *.md files will be written. + + Returns + ------- + List[Path] + Absolute paths to the created Markdown files. + + Raises + ------ + SystemExit + If no notebooks are found or conversion yields no Markdown content. 
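For illustration only, a typical call (mirroring how main() below uses this helper) might be:

```python
# Sketch: convert the notebooks into a throwaway directory created next to the repo root.
import tempfile
from pathlib import Path

with tempfile.TemporaryDirectory(prefix=".examples_temporary_", dir=".") as tmp:
    md_files = convert_notebooks_to_md_in_temp(Path("examples"), Path(tmp))
    print(f"converted {len(md_files)} notebooks")
```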
""" - created_paths = [] - md_paths = [] + created_md_paths: List[Path] = [] for ipynb_file in sorted(src_examples_dir.glob("*.ipynb")): - with open(ipynb_file, "r", encoding="utf-8") as f: + with ipynb_file.open("r", encoding="utf-8") as f: notebook = json.load(f) - parts = [] + parts: List[str] = [] for cell in notebook.get("cells", []): - if cell.get("cell_type") == "markdown": - parts.append("".join(cell.get("source", []))) - elif cell.get("cell_type") == "code": - parts.append("```python\n" + "".join(cell.get("source", [])) + "\n```") + ctype = cell.get("cell_type") + src = "".join(cell.get("source", [])) + if ctype == "markdown": + parts.append(src) + elif ctype == "code": + parts.append(f"```python\n{src}\n```") + + # Skip empty conversions (e.g., empty notebook) + if not parts: + continue - # write into the temporary examples directory (never into the real examples/) md_file = temp_examples_dir / f"{ipynb_file.stem}.md" - # ensure unique name just in case (temp dir typically empty, but keep behaviour consistent) - if md_file.exists(): - i = 1 - while True: - candidate = temp_examples_dir / f"{ipynb_file.stem}.repomix.{i}.md" - if not candidate.exists(): - md_file = candidate - break - i += 1 - - with open(md_file, "w", encoding="utf-8") as f: + with md_file.open("w", encoding="utf-8") as f: f.write("\n\n".join(parts)) - created_paths.append(md_file) - md_paths.append(md_file.resolve()) + created_md_paths.append(md_file.resolve()) + + if not created_md_paths: + raise FileNotFoundError("No example notebooks (*.ipynb) found or conversion produced no markdown files.") - print("Created temporary md:", md_file) + return created_md_paths - return md_paths, created_paths -def collect_bayesflow_py_abs_paths(src_bayesflow_dir: Path, excluded_dir_names=EXCLUDED_DIR_NAMES): - """Return a sorted list of absolute Paths for all .py files in src_bayesflow_dir, - excluding any found under directories whose name is in excluded_dir_names (at any depth).""" +def collect_py_abs_paths(dir: Path, excluded_dir_names: Sequence[str] = EXCLUDED_DIR_NAMES) -> List[Path]: + """ + Collect absolute paths to Python files under a directory, excluding certain folder names. + + Parameters + ---------- + dir : Path + Root directory to scan for *.py files (recursive). + excluded_dir_names : Sequence[str], optional + Directory names to exclude at any depth, e.g., experimental folders. + + Returns + ------- + List[Path] + Sorted list of absolute paths to included Python files. + """ excluded = set(excluded_dir_names) return sorted( p.resolve() - for p in src_bayesflow_dir.rglob("*.py") + for p in dir.rglob("*.py") if not any(parent.name in excluded for parent in p.parents) ) -def run_repomix_with_file_list(file_paths, output_path, repo_cwd, include_patterns="**/*.py,**/*.md"): - """Run repomix (cwd=repo_cwd) with --stdin reading newline-separated paths (absolute or relative).""" + +def run_repomix_with_file_list( + file_paths: Sequence[Path], + output_path: Path, + repo_cwd: Path, +) -> None: + """ + Run Repomix to bundle a list of files into a single Markdown output. + + Parameters + ---------- + file_paths : Sequence[Path] + Files to include in the Repomix run. Paths may be absolute or relative to repo_cwd. + output_path : Path + Destination for the generated Markdown output. + repo_cwd : Path + Repository root to use as the working directory for Repomix. + + Raises + ------ + ValueError + If file_paths is empty. + FileNotFoundError + If the 'repomix' executable is not found on PATH. 
+ RuntimeError + If the Repomix command fails. + """ if not file_paths: - print(f"No files provided for repomix output {output_path}. Skipping.", file=sys.stderr) - return + raise ValueError(f"No files provided for repomix output: {output_path}") cmd = [ "repomix", - "--style", "markdown", + "--style", + "markdown", "--stdin", - "--include", include_patterns, - "-o", str(output_path), + "-o", + str(output_path), ] - print(f"Running repomix in cwd={repo_cwd}: {' '.join(cmd)}") - print(f" -> {len(file_paths)} files (showing up to 20):") - for p in file_paths[:20]: - print(" ", str(p)) + # Prepare file path list stdin_input = "\n".join(str(p) for p in file_paths) + "\n" - subprocess.run(cmd, input=stdin_input, text=True, check=True, cwd=str(repo_cwd)) - print(f"✅ Repomix packaged output saved to {output_path}") - -# --- Main flow --- -# Create a temporary examples directory *under the repo root* so repomix can use relative paths if it wants. -temp_dir_path = None -created_files = [] - -try: - temp_dir = tempfile.mkdtemp(prefix=".examples_temporary_", dir=str(base_dir)) - temp_examples_dir = Path(temp_dir) - temp_dir_path = temp_examples_dir - print("Using temporary examples dir:", temp_examples_dir) - - # Convert notebooks into the temp folder (no changes in the real examples/ directory) - md_abs_paths, created_files = convert_notebooks_to_md_in_temp(examples_dir, temp_examples_dir) - if not md_abs_paths: - print("ERROR: No example notebooks (*.ipynb) found or conversion produced no markdown files.", file=sys.stderr) - raise SystemExit(1) - - # For repomix we can pass relative paths (relative to repo root) — convert if possible + try: - md_rel_for_repomix = [p.relative_to(base_dir) for p in md_abs_paths] - except Exception: - # fallback to absolute paths if relative conversion fails - md_rel_for_repomix = md_abs_paths - - # Include README if present (use relative path so repomix sees it correctly) - if readme_file.exists(): - print("Including top-level README.md in repomix inputs") - md_rel_for_repomix.append(Path("README.md")) - - # ---- Compact: examples only ---- - run_repomix_with_file_list(md_rel_for_repomix, compact_output_file, repo_cwd=base_dir, include_patterns="**/*.md") - - # ---- Full: examples + bayesflow .py files ---- - if src_dir.exists(): - py_abs_paths = collect_bayesflow_py_abs_paths(src_dir) - # convert py paths to relative if possible + subprocess.run(cmd, input=stdin_input, text=True, check=True, cwd=str(repo_cwd)) + except FileNotFoundError as e: + raise FileNotFoundError("'repomix' not found on PATH. Please install it and retry.") from e + except subprocess.CalledProcessError as e: + raise RuntimeError(f"Repomix failed with exit code {e.returncode}.") from e + + print(f"Repomix packaged output saved to {output_path}") + + +def to_relative_paths(paths: Iterable[Path], base: Path) -> List[Path]: + """ + Convert a list of paths to paths relative to a base directory when possible. + + Parameters + ---------- + paths : Iterable[Path] + Paths to convert. + base : Path + Base directory. + + Returns + ------- + List[Path] + Relative paths if conversion succeeds; otherwise original paths. 
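For example (the paths are hypothetical):

```python
# Paths under the base directory become relative; anything outside it is passed through unchanged.
to_relative_paths([Path("/repo/examples/a.md"), Path("/tmp/x.md")], Path("/repo"))
# -> [Path("examples/a.md"), Path("/tmp/x.md")]
```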
+ """ + rels: List[Path] = [] + for p in paths: try: - py_rel_for_repomix = [p.relative_to(base_dir) for p in py_abs_paths] + rels.append(p.relative_to(base)) except Exception: - py_rel_for_repomix = py_abs_paths - - full_list = md_rel_for_repomix + py_rel_for_repomix - run_repomix_with_file_list(full_list, full_output_file, repo_cwd=base_dir, include_patterns="**/*.py,**/*.md") - else: - print("Skipping creation of full context because bayesflow directory was not found.", file=sys.stderr) - -finally: - # Clean up only the temporary files / dir we created - if created_files: - for p in created_files: - try: - if p.exists(): - p.unlink() - print("Removed temporary md:", p) - except Exception as e: - print(f"Warning: failed to remove {p}: {e}", file=sys.stderr) - - if temp_dir_path and temp_dir_path.exists(): - try: - shutil.rmtree(temp_dir_path) - print("Removed temporary directory:", temp_dir_path) - except Exception as e: - print(f"Warning: failed to remove temporary directory {temp_dir_path}: {e}", file=sys.stderr) + rels.append(p) + return rels + -print("Done.") +def main() -> None: + """ + Entry point to build compact and full LLM context bundles. + + - Compact: README + example notebooks (converted to Markdown) + - Full: Compact + all bayesflow/*.py files (excluding certain directories) + """ + # Validate required inputs + if not readme_file.exists(): + raise FileNotFoundError(f"README.md file not found: {readme_file}") + if not examples_dir.exists(): + raise FileNotFoundError(f"examples directory not found: {examples_dir}") + if not src_dir.exists(): + raise FileNotFoundError(f"bayesflow source directory not found: {src_dir}") + + # Prepare temporary examples directory under repo root so Repomix can use relative paths. + with tempfile.TemporaryDirectory(prefix=".examples_temporary_", dir=str(base_dir)) as tmpdir: + temp_examples_dir = Path(tmpdir) + + # Convert notebooks into the temp folder (no changes in the real examples/ directory) + md_abs_paths = convert_notebooks_to_md_in_temp(examples_dir, temp_examples_dir) + + # Prefer relative paths for Repomix + md_for_repomix = to_relative_paths(md_abs_paths, base_dir) + + # Include README if present (relative path so Repomix sees it correctly) + if readme_file.exists(): + md_for_repomix.append(Path("README.md")) + + # ---- Compact: examples only ---- + run_repomix_with_file_list( + md_for_repomix, + compact_output_file, + repo_cwd=base_dir + ) + + # ---- Full: examples + bayesflow .py files ---- + py_abs_paths = collect_py_abs_paths(src_dir) + if not py_abs_paths: + raise FileNotFoundError(f"No Python files found in bayesflow source directory: {src_dir}") + py_for_repomix = to_relative_paths(py_abs_paths, base_dir) + full_list = [*md_for_repomix, *py_for_repomix] + run_repomix_with_file_list( + full_list, + full_output_file, + repo_cwd=base_dir + ) + +if __name__ == "__main__": + main() \ No newline at end of file From 130492e111a7ad94ada064ab67fae168b7cdeb35 Mon Sep 17 00:00:00 2001 From: elseml Date: Tue, 16 Sep 2025 17:07:01 +0200 Subject: [PATCH 05/12] Switch from repomix to gitingest for llm context generation --- .github/workflows/build-llm-context.yaml | 31 +-- .gitignore | 3 +- llm_context/build_llm_context.py | 315 ++++++++--------------- llm_context/requirements.txt | 4 +- 4 files changed, 116 insertions(+), 237 deletions(-) diff --git a/.github/workflows/build-llm-context.yaml b/.github/workflows/build-llm-context.yaml index 76a3663c0..846b127a8 100644 --- a/.github/workflows/build-llm-context.yaml +++ 
b/.github/workflows/build-llm-context.yaml @@ -1,40 +1,33 @@ -name: Build BayesFlow LLM Context (full + compact) +name: Build LLM Context on: - workflow_dispatch: release: types: [published] -permissions: - contents: write - jobs: - build: + build-context: runs-on: ubuntu-latest + steps: - - name: Checkout + - name: Checkout repository uses: actions/checkout@v4 - with: - fetch-depth: 1 - - name: Setup Python - uses: actions/setup-python@v4 + - name: Set up Python + uses: actions/setup-python@v5 with: - python-version: '3.10' + python-version: "3.11" - name: Install dependencies run: | python -m pip install --upgrade pip pip install -r llm_context/requirements.txt - - name: Build artifacts + - name: Build LLM context files run: | - python llm_context/build_llm_context.py --tag "${{ github.event.release.tag_name }}" + TAG="${GITHUB_REF_NAME}" + python scripts/build_llm_context.py "$TAG" - - name: Upload artifacts to release + - name: Upload context files as release assets uses: softprops/action-gh-release@v2 with: - files: | - llm_context/bayesflow-context-full-${{ github.event.release.tag_name }}.md - llm_context/bayesflow-context-compact-${{ github.event.release.tag_name }}.md - llm_context/bayesflow-context-${{ github.event.release.tag_name }}.manifest.json + files: llm_context/llm_context_*.md diff --git a/.gitignore b/.gitignore index b9fceff4e..f317dac8d 100644 --- a/.gitignore +++ b/.gitignore @@ -13,8 +13,7 @@ docsrc/source/contributing.md examples/checkpoints/ build docs/ -llm_context/llm_context_compact.md -llm_context/llm_context_full.md +llm_context/llm_context* # mypy diff --git a/llm_context/build_llm_context.py b/llm_context/build_llm_context.py index e1bfb4d47..b2e44b82a 100644 --- a/llm_context/build_llm_context.py +++ b/llm_context/build_llm_context.py @@ -1,246 +1,135 @@ -#!/usr/bin/env python3 """ -Build two Repomix LLM-context files: +Build compact and full Gitingest LLM-context bundles. -- llm_context/llm_context_compact.md -> README + examples only -- llm_context/llm_context_full.md -> README + examples + bayesflow source code +On release, generates: -Example notebooks (.ipynb) are converted to temporary Markdown files for clean Repomix conversion. +- llm_context/llm_context_compact_.md +- llm_context/llm_context_full_.md + +Old context files in ``llm_context/`` are removed before writing new ones. 
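For a hypothetical release tagged ``v2.0.1``, the expected artifacts would be:

```python
# Sketch of the naming scheme implemented in main() below ("dev" is the fallback tag).
tag = "v2.0.1"
compact_name = f"llm_context_compact_{tag}.md"   # -> llm_context_compact_v2.0.1.md
full_name = f"llm_context_full_{tag}.md"         # -> llm_context_full_v2.0.1.md
```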
""" from __future__ import annotations import json +import shutil import subprocess import sys import tempfile from pathlib import Path -from typing import Iterable, List, Sequence +from typing import List, Sequence # --- Paths and config --- -base_dir = Path(__file__).parent.parent.resolve() -readme_file = base_dir / "README.md" -examples_dir = base_dir / "examples" -src_dir = base_dir / "bayesflow" -output_dir = base_dir / "llm_context" -compact_output_file = output_dir / "llm_context_compact.md" -full_output_file = output_dir / "llm_context_full.md" +BASE_DIR = Path(__file__).parent.parent.resolve() +README_FILE = BASE_DIR / "README.md" +EXAMPLES_DIR = BASE_DIR / "examples" +SRC_DIR = BASE_DIR / "bayesflow" +OUTPUT_DIR = BASE_DIR / "llm_context" EXCLUDED_DIR_NAMES = ["experimental"] -# Ensure output directory exists -output_dir.mkdir(parents=True, exist_ok=True) - -# Safety checks -if not examples_dir.exists(): - print(f"ERROR: examples directory not found: {examples_dir}", file=sys.stderr) - raise SystemExit(1) -if not src_dir.exists(): - print(f"WARNING: bayesflow source directory not found: {src_dir} -- full context will be skipped.", file=sys.stderr) - - -def convert_notebooks_to_md_in_temp(src_examples_dir: Path, temp_examples_dir: Path) -> List[Path]: - """ - Convert Jupyter notebooks (*.ipynb) in a source directory to Markdown files. - - Notes are saved into a temporary examples directory, leaving the original examples/ - untouched. Markdown files are created with code cells fenced as Python blocks. - - Parameters - ---------- - src_examples_dir : Path - Directory containing the source *.ipynb notebooks (non-recursive). - temp_examples_dir : Path - Temporary directory where the generated *.md files will be written. - - Returns - ------- - List[Path] - Absolute paths to the created Markdown files. - - Raises - ------ - SystemExit - If no notebooks are found or conversion yields no Markdown content. - """ - created_md_paths: List[Path] = [] - - for ipynb_file in sorted(src_examples_dir.glob("*.ipynb")): - with ipynb_file.open("r", encoding="utf-8") as f: - notebook = json.load(f) +def convert_notebooks_to_md(src_dir: Path, dst_dir: Path) -> List[Path]: + """Convert Jupyter notebooks (*.ipynb) to Markdown.""" + created: List[Path] = [] + for ipynb_file in sorted(src_dir.glob("*.ipynb")): + notebook = json.loads(ipynb_file.read_text(encoding="utf-8")) parts: List[str] = [] for cell in notebook.get("cells", []): - ctype = cell.get("cell_type") src = "".join(cell.get("source", [])) - if ctype == "markdown": + if cell.get("cell_type") == "markdown": parts.append(src) - elif ctype == "code": + elif cell.get("cell_type") == "code": parts.append(f"```python\n{src}\n```") - - # Skip empty conversions (e.g., empty notebook) - if not parts: - continue - - md_file = temp_examples_dir / f"{ipynb_file.stem}.md" - - with md_file.open("w", encoding="utf-8") as f: - f.write("\n\n".join(parts)) - - created_md_paths.append(md_file.resolve()) - - if not created_md_paths: - raise FileNotFoundError("No example notebooks (*.ipynb) found or conversion produced no markdown files.") - - return created_md_paths - - -def collect_py_abs_paths(dir: Path, excluded_dir_names: Sequence[str] = EXCLUDED_DIR_NAMES) -> List[Path]: - """ - Collect absolute paths to Python files under a directory, excluding certain folder names. - - Parameters - ---------- - dir : Path - Root directory to scan for *.py files (recursive). 
- excluded_dir_names : Sequence[str], optional - Directory names to exclude at any depth, e.g., experimental folders. - - Returns - ------- - List[Path] - Sorted list of absolute paths to included Python files. - """ - excluded = set(excluded_dir_names) + if parts: + md_file = dst_dir / f"{ipynb_file.stem}.md" + md_file.write_text("\n\n".join(parts), encoding="utf-8") + created.append(md_file.resolve()) + if not created: + raise FileNotFoundError("No example notebooks (*.ipynb) found.") + return created + + +def collect_py_files(root: Path, exclude: Sequence[str] = ()) -> List[Path]: + """Collect Python source files from a directory.""" + excluded = set(exclude) return sorted( - p.resolve() - for p in dir.rglob("*.py") - if not any(parent.name in excluded for parent in p.parents) + f.resolve() + for f in root.rglob("*.py") + if not any(p.name in excluded for p in f.parents) ) -def run_repomix_with_file_list( - file_paths: Sequence[Path], - output_path: Path, - repo_cwd: Path, -) -> None: - """ - Run Repomix to bundle a list of files into a single Markdown output. - - Parameters - ---------- - file_paths : Sequence[Path] - Files to include in the Repomix run. Paths may be absolute or relative to repo_cwd. - output_path : Path - Destination for the generated Markdown output. - repo_cwd : Path - Repository root to use as the working directory for Repomix. - - Raises - ------ - ValueError - If file_paths is empty. - FileNotFoundError - If the 'repomix' executable is not found on PATH. - RuntimeError - If the Repomix command fails. - """ - if not file_paths: - raise ValueError(f"No files provided for repomix output: {output_path}") - - cmd = [ - "repomix", - "--style", - "markdown", - "--stdin", - "-o", - str(output_path), - ] - - # Prepare file path list - stdin_input = "\n".join(str(p) for p in file_paths) + "\n" - +def run_gitingest(work_dir: Path, output: Path, exclude: Sequence[str] | None = None) -> None: + """Run gitingest on a directory.""" + cmd = ["gitingest", str(work_dir), "--output", str(output)] + if exclude: + for pat in exclude: + cmd.extend(["--exclude-pattern", pat]) try: - subprocess.run(cmd, input=stdin_input, text=True, check=True, cwd=str(repo_cwd)) - except FileNotFoundError as e: - raise FileNotFoundError("'repomix' not found on PATH. Please install it and retry.") from e + subprocess.run(cmd, check=True) + except FileNotFoundError: + sys.stderr.write("ERROR: 'gitingest' not found. Install and add to PATH.\n") + raise except subprocess.CalledProcessError as e: - raise RuntimeError(f"Repomix failed with exit code {e.returncode}.") from e - - print(f"Repomix packaged output saved to {output_path}") - - -def to_relative_paths(paths: Iterable[Path], base: Path) -> List[Path]: - """ - Convert a list of paths to paths relative to a base directory when possible. - - Parameters - ---------- - paths : Iterable[Path] - Paths to convert. - base : Path - Base directory. - - Returns - ------- - List[Path] - Relative paths if conversion succeeds; otherwise original paths. - """ - rels: List[Path] = [] - for p in paths: - try: - rels.append(p.relative_to(base)) - except Exception: - rels.append(p) - return rels + sys.stderr.write(f"ERROR: gitingest failed (exit code {e.returncode}).\n") + raise + print(f"Gitingest executed; output saved to {output}") def main() -> None: - """ - Entry point to build compact and full LLM context bundles. 
- - - Compact: README + example notebooks (converted to Markdown) - - Full: Compact + all bayesflow/*.py files (excluding certain directories) - """ - # Validate required inputs - if not readme_file.exists(): - raise FileNotFoundError(f"README.md file not found: {readme_file}") - if not examples_dir.exists(): - raise FileNotFoundError(f"examples directory not found: {examples_dir}") - if not src_dir.exists(): - raise FileNotFoundError(f"bayesflow source directory not found: {src_dir}") - - # Prepare temporary examples directory under repo root so Repomix can use relative paths. - with tempfile.TemporaryDirectory(prefix=".examples_temporary_", dir=str(base_dir)) as tmpdir: - temp_examples_dir = Path(tmpdir) - - # Convert notebooks into the temp folder (no changes in the real examples/ directory) - md_abs_paths = convert_notebooks_to_md_in_temp(examples_dir, temp_examples_dir) - - # Prefer relative paths for Repomix - md_for_repomix = to_relative_paths(md_abs_paths, base_dir) - - # Include README if present (relative path so Repomix sees it correctly) - if readme_file.exists(): - md_for_repomix.append(Path("README.md")) - - # ---- Compact: examples only ---- - run_repomix_with_file_list( - md_for_repomix, - compact_output_file, - repo_cwd=base_dir - ) - - # ---- Full: examples + bayesflow .py files ---- - py_abs_paths = collect_py_abs_paths(src_dir) - if not py_abs_paths: - raise FileNotFoundError(f"No Python files found in bayesflow source directory: {src_dir}") - py_for_repomix = to_relative_paths(py_abs_paths, base_dir) - full_list = [*md_for_repomix, *py_for_repomix] - run_repomix_with_file_list( - full_list, - full_output_file, - repo_cwd=base_dir - ) + """Build compact and full LLM context bundles with versioned filenames.""" + tag = (sys.argv[1] if len(sys.argv) > 1 else None) or "dev" + + if not README_FILE.exists(): + raise FileNotFoundError(f"Missing README.md: {README_FILE}") + if not EXAMPLES_DIR.exists(): + raise FileNotFoundError(f"Missing examples dir: {EXAMPLES_DIR}") + + # Clean old context files + OUTPUT_DIR.mkdir(parents=True, exist_ok=True) + for old in OUTPUT_DIR.glob("llm_context_*.md"): + old.unlink() + + compact_output = OUTPUT_DIR / f"llm_context_compact_{tag}.md" + full_output = OUTPUT_DIR / f"llm_context_full_{tag}.md" + + with ( + tempfile.TemporaryDirectory(prefix="examples_", dir=BASE_DIR) as tmp_examples, + tempfile.TemporaryDirectory(prefix="compact_", dir=BASE_DIR) as tmp_compact, + tempfile.TemporaryDirectory(prefix="full_", dir=BASE_DIR) as tmp_full, + ): + tmp_examples = Path(tmp_examples) + tmp_compact = Path(tmp_compact) + tmp_full = Path(tmp_full) + + # Convert notebooks + example_mds = convert_notebooks_to_md(EXAMPLES_DIR, tmp_examples) + + # ==== Compact bundle ==== + (tmp_compact / "examples").mkdir(parents=True, exist_ok=True) + shutil.copy(README_FILE, tmp_compact / "README.md") + for md in example_mds: + shutil.copy(md, tmp_compact / "examples" / md.name) + run_gitingest(tmp_compact, compact_output) + + # ==== Full bundle ==== + (tmp_full / "examples").mkdir(parents=True, exist_ok=True) + shutil.copy(README_FILE, tmp_full / "README.md") + for md in example_mds: + shutil.copy(md, tmp_full / "examples" / md.name) + + if SRC_DIR.exists(): + for pyfile in collect_py_files(SRC_DIR, EXCLUDED_DIR_NAMES): + rel = pyfile.relative_to(SRC_DIR) + dest = tmp_full / "bayesflow" / rel + dest.parent.mkdir(parents=True, exist_ok=True) + shutil.copy(pyfile, dest) + else: + sys.stderr.write(f"WARNING: source dir not found: {SRC_DIR}\n") + + exclude = [f"**/{d}/**" 
for d in EXCLUDED_DIR_NAMES] if EXCLUDED_DIR_NAMES else None + run_gitingest(tmp_full, full_output, exclude) + if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/llm_context/requirements.txt b/llm_context/requirements.txt index 91799b7c0..838872f40 100644 --- a/llm_context/requirements.txt +++ b/llm_context/requirements.txt @@ -1,3 +1 @@ -repomix -nbformat -tomli +gitingest \ No newline at end of file From 9d1649925f5157cd232fee0541cbb04553ccdb5f Mon Sep 17 00:00:00 2001 From: elseml Date: Tue, 16 Sep 2025 17:31:14 +0200 Subject: [PATCH 06/12] Update llm context readme --- llm_context/README.md | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/llm_context/README.md b/llm_context/README.md index ca4d795b5..8e4c49da5 100644 --- a/llm_context/README.md +++ b/llm_context/README.md @@ -4,17 +4,22 @@ This folder contains single-file context artifacts to improve LLM assistance for ## Files - `bayesflow-context-compact-.md` - Smaller snapshot: README + examples + Smaller snapshot: README + examples; ~ 50k tokens. - `bayesflow-context-full-.md` - Full Markdown snapshot: README + examples + source code (`bayesflow/`). + Full Markdown snapshot: README + examples + source code (`bayesflow/`); ~ 250k tokens. ## Usage 1. Download either the compact or full file for the release tag of interest: The compact file is cheaper and faster; the full file is most accurate. 2. Paste it into your LLM context before asking questions about BayesFlow. ## Prompt Tip -You are answering questions about BayesFlow using only the provided context .md file. If using code, cite the file or notebook name shown in the context. +### Compact File +You are answering questions about BayesFlow using the provided context .md file containing all BayesFlow tutorials. If needed, look up the latest source code from the BayesFlow documentation. +QUESTION: + +### Full File +You are answering questions about BayesFlow using only the provided context .md file containing all BayesFlow tutorials as well as the BayesFlow source code. QUESTION: ## Disclaimer From eca094d0e5ce51383d69d320b3796db4a880bd46 Mon Sep 17 00:00:00 2001 From: elseml Date: Tue, 16 Sep 2025 17:48:32 +0200 Subject: [PATCH 07/12] Improve llm context documentation --- llm_context/build_llm_context.py | 83 ++++++++++++++++++++++++++++++-- 1 file changed, 79 insertions(+), 4 deletions(-) diff --git a/llm_context/build_llm_context.py b/llm_context/build_llm_context.py index b2e44b82a..a2a091eef 100644 --- a/llm_context/build_llm_context.py +++ b/llm_context/build_llm_context.py @@ -29,28 +29,66 @@ def convert_notebooks_to_md(src_dir: Path, dst_dir: Path) -> List[Path]: - """Convert Jupyter notebooks (*.ipynb) to Markdown.""" + """ + Convert Jupyter notebooks (*.ipynb) to Markdown files. + + Parameters + ---------- + src_dir : Path + Source directory containing Jupyter notebooks. + dst_dir : Path + Destination directory where converted Markdown files will be written. + + Returns + ------- + List[Path] + List of paths to the generated Markdown files. + + Raises + ------ + FileNotFoundError + If no notebooks are found in `src_dir`. 
+ """ created: List[Path] = [] + for ipynb_file in sorted(src_dir.glob("*.ipynb")): notebook = json.loads(ipynb_file.read_text(encoding="utf-8")) parts: List[str] = [] + for cell in notebook.get("cells", []): src = "".join(cell.get("source", [])) if cell.get("cell_type") == "markdown": parts.append(src) elif cell.get("cell_type") == "code": parts.append(f"```python\n{src}\n```") + if parts: md_file = dst_dir / f"{ipynb_file.stem}.md" md_file.write_text("\n\n".join(parts), encoding="utf-8") created.append(md_file.resolve()) + if not created: raise FileNotFoundError("No example notebooks (*.ipynb) found.") + return created def collect_py_files(root: Path, exclude: Sequence[str] = ()) -> List[Path]: - """Collect Python source files from a directory.""" + """ + Collect Python source files from a directory, excluding specified folders. + + Parameters + ---------- + root : Path + Root directory to search for Python files. + exclude : Sequence[str], optional + Names of directories to exclude from the search (default is empty). + + Returns + ------- + List[Path] + Sorted list of resolved paths to Python files. + """ excluded = set(exclude) return sorted( f.resolve() @@ -60,11 +98,30 @@ def collect_py_files(root: Path, exclude: Sequence[str] = ()) -> List[Path]: def run_gitingest(work_dir: Path, output: Path, exclude: Sequence[str] | None = None) -> None: - """Run gitingest on a directory.""" + """ + Run `gitingest` on a directory to generate an LLM context bundle. + + Parameters + ---------- + work_dir : Path + Directory to run gitingest on. + output : Path + Output Markdown file path where results will be saved. + exclude : Sequence[str] or None, optional + List of exclusion patterns for gitingest (default is None). + + Raises + ------ + FileNotFoundError + If `gitingest` is not installed or not found in PATH. + subprocess.CalledProcessError + If `gitingest` execution fails. + """ cmd = ["gitingest", str(work_dir), "--output", str(output)] if exclude: for pat in exclude: cmd.extend(["--exclude-pattern", pat]) + try: subprocess.run(cmd, check=True) except FileNotFoundError: @@ -73,11 +130,29 @@ def run_gitingest(work_dir: Path, output: Path, exclude: Sequence[str] | None = except subprocess.CalledProcessError as e: sys.stderr.write(f"ERROR: gitingest failed (exit code {e.returncode}).\n") raise + print(f"Gitingest executed; output saved to {output}") def main() -> None: - """Build compact and full LLM context bundles with versioned filenames.""" + """ + Build compact and full LLM context bundles with versioned filenames. + + Workflow + -------- + 1. Validate presence of README and examples directory. + 2. Remove old context files from the output directory. + 3. Convert Jupyter notebooks in `examples/` to Markdown. + 4. Build two bundles: + - Compact: README + examples + - Full: README + examples + source files (excluding certain directories) + 5. Run `gitingest` to generate Markdown bundles. + + Raises + ------ + FileNotFoundError + If required files or directories are missing. 
+ """ tag = (sys.argv[1] if len(sys.argv) > 1 else None) or "dev" if not README_FILE.exists(): From e7800cbe997f83e3497fc72ea286e37096ef4eee Mon Sep 17 00:00:00 2001 From: elseml Date: Wed, 17 Sep 2025 12:09:44 +0200 Subject: [PATCH 08/12] Remove --exclude-pattern gitingest arg to fix Windows bug --- llm_context/build_llm_context.py | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/llm_context/build_llm_context.py b/llm_context/build_llm_context.py index a2a091eef..f2955e356 100644 --- a/llm_context/build_llm_context.py +++ b/llm_context/build_llm_context.py @@ -8,6 +8,7 @@ Old context files in ``llm_context/`` are removed before writing new ones. """ + from __future__ import annotations import json @@ -90,14 +91,10 @@ def collect_py_files(root: Path, exclude: Sequence[str] = ()) -> List[Path]: Sorted list of resolved paths to Python files. """ excluded = set(exclude) - return sorted( - f.resolve() - for f in root.rglob("*.py") - if not any(p.name in excluded for p in f.parents) - ) + return sorted(f.resolve() for f in root.rglob("*.py") if not any(p.name in excluded for p in f.parents)) -def run_gitingest(work_dir: Path, output: Path, exclude: Sequence[str] | None = None) -> None: +def run_gitingest(work_dir: Path, output: Path) -> None: """ Run `gitingest` on a directory to generate an LLM context bundle. @@ -107,8 +104,6 @@ def run_gitingest(work_dir: Path, output: Path, exclude: Sequence[str] | None = Directory to run gitingest on. output : Path Output Markdown file path where results will be saved. - exclude : Sequence[str] or None, optional - List of exclusion patterns for gitingest (default is None). Raises ------ @@ -118,9 +113,6 @@ def run_gitingest(work_dir: Path, output: Path, exclude: Sequence[str] | None = If `gitingest` execution fails. """ cmd = ["gitingest", str(work_dir), "--output", str(output)] - if exclude: - for pat in exclude: - cmd.extend(["--exclude-pattern", pat]) try: subprocess.run(cmd, check=True) @@ -202,8 +194,7 @@ def main() -> None: else: sys.stderr.write(f"WARNING: source dir not found: {SRC_DIR}\n") - exclude = [f"**/{d}/**" for d in EXCLUDED_DIR_NAMES] if EXCLUDED_DIR_NAMES else None - run_gitingest(tmp_full, full_output, exclude) + run_gitingest(tmp_full, full_output) if __name__ == "__main__": From 1d23637b8103b725b4b6e2a4d2f00d22255c762a Mon Sep 17 00:00:00 2001 From: elseml Date: Wed, 17 Sep 2025 12:10:11 +0200 Subject: [PATCH 09/12] Add info for developers --- llm_context/README.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/llm_context/README.md b/llm_context/README.md index 8e4c49da5..44d0d5e54 100644 --- a/llm_context/README.md +++ b/llm_context/README.md @@ -24,3 +24,10 @@ QUESTION: ## Disclaimer The context files are generated automatically and may be outdated or incomplete. While they aim at improving LLM accuracy, hallucinations may still occur frequently during LLM assistance. Please always refer to the official BayesFlow documentation and codebase for the most accurate information. + +## For Developers +The context files are automatically updated upon new BayesFlow releases by `.github/workflows/build-llm-context.yaml`. 
The script `llm_context/build_llm_context.py` can also be run manually with an optional `--tag ` argument (default: `dev`): +```bash +pip install -r llm_context/requirements.txt +python llm_context/build_llm_context.py --tag +``` From ed0741ec2deeb34f5a04cfc5373b13156be89273 Mon Sep 17 00:00:00 2001 From: elseml Date: Wed, 17 Sep 2025 16:23:06 +0200 Subject: [PATCH 10/12] Update README --- llm_context/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llm_context/README.md b/llm_context/README.md index 44d0d5e54..f313b0192 100644 --- a/llm_context/README.md +++ b/llm_context/README.md @@ -1,6 +1,6 @@ # BayesFlow LLM Context -This folder contains single-file context artifacts to improve LLM assistance for BayesFlow. +This folder contains context files to improve LLM assistance for BayesFlow. ## Files - `bayesflow-context-compact-.md` @@ -10,12 +10,12 @@ This folder contains single-file context artifacts to improve LLM assistance for Full Markdown snapshot: README + examples + source code (`bayesflow/`); ~ 250k tokens. ## Usage -1. Download either the compact or full file for the release tag of interest: The compact file is cheaper and faster; the full file is most accurate. +1. Download either the compact or full file for the current release tag: The compact file is cheaper and more focused; the full file contains the complete codebase. 2. Paste it into your LLM context before asking questions about BayesFlow. ## Prompt Tip ### Compact File -You are answering questions about BayesFlow using the provided context .md file containing all BayesFlow tutorials. If needed, look up the latest source code from the BayesFlow documentation. +You are answering questions about BayesFlow using the provided context .md file containing all BayesFlow tutorials. If needed, additionally look up the latest source code from the BayesFlow documentation. QUESTION: ### Full File From e630219a86ee2b95145f1ca5b4c80a2b1d442fda Mon Sep 17 00:00:00 2001 From: elseml Date: Thu, 18 Sep 2025 17:31:32 +0200 Subject: [PATCH 11/12] Add file exclusion (e.g., From_BayesFlow_1.1_to_2.0.ipynb to minimize BF1 context) --- llm_context/build_llm_context.py | 50 +++++++++++++++++++++++--------- 1 file changed, 36 insertions(+), 14 deletions(-) diff --git a/llm_context/build_llm_context.py b/llm_context/build_llm_context.py index f2955e356..3868dd7e6 100644 --- a/llm_context/build_llm_context.py +++ b/llm_context/build_llm_context.py @@ -27,9 +27,12 @@ OUTPUT_DIR = BASE_DIR / "llm_context" EXCLUDED_DIR_NAMES = ["experimental"] +EXCLUDED_FILE_NAMES = ["From_BayesFlow_1.1_to_2.0.ipynb"] -def convert_notebooks_to_md(src_dir: Path, dst_dir: Path) -> List[Path]: +def convert_notebooks_to_md( + src_dir: Path, dst_dir: Path, exclude_files: Sequence[str] = () +) -> List[Path]: """ Convert Jupyter notebooks (*.ipynb) to Markdown files. @@ -39,6 +42,8 @@ def convert_notebooks_to_md(src_dir: Path, dst_dir: Path) -> List[Path]: Source directory containing Jupyter notebooks. dst_dir : Path Destination directory where converted Markdown files will be written. + exclude_files : Sequence[str], optional + File names to exclude from conversion. Returns ------- @@ -51,8 +56,12 @@ def convert_notebooks_to_md(src_dir: Path, dst_dir: Path) -> List[Path]: If no notebooks are found in `src_dir`. 
""" created: List[Path] = [] + excluded = set(exclude_files) for ipynb_file in sorted(src_dir.glob("*.ipynb")): + if ipynb_file.name in excluded: + continue + notebook = json.loads(ipynb_file.read_text(encoding="utf-8")) parts: List[str] = [] @@ -74,24 +83,33 @@ def convert_notebooks_to_md(src_dir: Path, dst_dir: Path) -> List[Path]: return created -def collect_py_files(root: Path, exclude: Sequence[str] = ()) -> List[Path]: +def collect_py_files( + root: Path, exclude_dirs: Sequence[str] = (), exclude_files: Sequence[str] = () +) -> List[Path]: """ - Collect Python source files from a directory, excluding specified folders. + Collect Python source files from a directory, excluding specified folders and files. Parameters ---------- root : Path Root directory to search for Python files. - exclude : Sequence[str], optional - Names of directories to exclude from the search (default is empty). + exclude_dirs : Sequence[str], optional + Names of directories to exclude from the search. + exclude_files : Sequence[str], optional + Names of files to exclude from the search. Returns ------- List[Path] Sorted list of resolved paths to Python files. """ - excluded = set(exclude) - return sorted(f.resolve() for f in root.rglob("*.py") if not any(p.name in excluded for p in f.parents)) + excluded_d = set(exclude_dirs) + excluded_f = set(exclude_files) + return sorted( + f.resolve() + for f in root.rglob("*.py") + if f.name not in excluded_f and not any(p.name in excluded_d for p in f.parents) + ) def run_gitingest(work_dir: Path, output: Path) -> None: @@ -134,10 +152,10 @@ def main() -> None: -------- 1. Validate presence of README and examples directory. 2. Remove old context files from the output directory. - 3. Convert Jupyter notebooks in `examples/` to Markdown. + 3. Convert Jupyter notebooks in `examples/` to Markdown, excluding specified files. 4. Build two bundles: - - Compact: README + examples - - Full: README + examples + source files (excluding certain directories) + - Compact: README + examples + - Full: README + examples + source files (excluding specified directories and files) 5. Run `gitingest` to generate Markdown bundles. 
Raises @@ -169,8 +187,10 @@ def main() -> None: tmp_compact = Path(tmp_compact) tmp_full = Path(tmp_full) - # Convert notebooks - example_mds = convert_notebooks_to_md(EXAMPLES_DIR, tmp_examples) + # Convert notebooks, respecting file exclusions + example_mds = convert_notebooks_to_md( + EXAMPLES_DIR, tmp_examples, EXCLUDED_FILE_NAMES + ) # ==== Compact bundle ==== (tmp_compact / "examples").mkdir(parents=True, exist_ok=True) @@ -186,7 +206,9 @@ def main() -> None: shutil.copy(md, tmp_full / "examples" / md.name) if SRC_DIR.exists(): - for pyfile in collect_py_files(SRC_DIR, EXCLUDED_DIR_NAMES): + for pyfile in collect_py_files( + SRC_DIR, EXCLUDED_DIR_NAMES, EXCLUDED_FILE_NAMES + ): rel = pyfile.relative_to(SRC_DIR) dest = tmp_full / "bayesflow" / rel dest.parent.mkdir(parents=True, exist_ok=True) @@ -198,4 +220,4 @@ def main() -> None: if __name__ == "__main__": - main() + main() \ No newline at end of file From f47cbf33efc3986d377c687cd2a146331f6ea527 Mon Sep 17 00:00:00 2001 From: elseml Date: Fri, 19 Sep 2025 16:00:31 +0200 Subject: [PATCH 12/12] Fix code style adherence --- llm_context/build_llm_context.py | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/llm_context/build_llm_context.py b/llm_context/build_llm_context.py index 3868dd7e6..4d500db78 100644 --- a/llm_context/build_llm_context.py +++ b/llm_context/build_llm_context.py @@ -30,9 +30,7 @@ EXCLUDED_FILE_NAMES = ["From_BayesFlow_1.1_to_2.0.ipynb"] -def convert_notebooks_to_md( - src_dir: Path, dst_dir: Path, exclude_files: Sequence[str] = () -) -> List[Path]: +def convert_notebooks_to_md(src_dir: Path, dst_dir: Path, exclude_files: Sequence[str] = ()) -> List[Path]: """ Convert Jupyter notebooks (*.ipynb) to Markdown files. @@ -83,9 +81,7 @@ def convert_notebooks_to_md( return created -def collect_py_files( - root: Path, exclude_dirs: Sequence[str] = (), exclude_files: Sequence[str] = () -) -> List[Path]: +def collect_py_files(root: Path, exclude_dirs: Sequence[str] = (), exclude_files: Sequence[str] = ()) -> List[Path]: """ Collect Python source files from a directory, excluding specified folders and files. @@ -188,9 +184,7 @@ def main() -> None: tmp_full = Path(tmp_full) # Convert notebooks, respecting file exclusions - example_mds = convert_notebooks_to_md( - EXAMPLES_DIR, tmp_examples, EXCLUDED_FILE_NAMES - ) + example_mds = convert_notebooks_to_md(EXAMPLES_DIR, tmp_examples, EXCLUDED_FILE_NAMES) # ==== Compact bundle ==== (tmp_compact / "examples").mkdir(parents=True, exist_ok=True) @@ -206,9 +200,7 @@ def main() -> None: shutil.copy(md, tmp_full / "examples" / md.name) if SRC_DIR.exists(): - for pyfile in collect_py_files( - SRC_DIR, EXCLUDED_DIR_NAMES, EXCLUDED_FILE_NAMES - ): + for pyfile in collect_py_files(SRC_DIR, EXCLUDED_DIR_NAMES, EXCLUDED_FILE_NAMES): rel = pyfile.relative_to(SRC_DIR) dest = tmp_full / "bayesflow" / rel dest.parent.mkdir(parents=True, exist_ok=True) @@ -220,4 +212,4 @@ def main() -> None: if __name__ == "__main__": - main() \ No newline at end of file + main()
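
After the script finishes, a quick local check of the generated bundles can catch missing or empty outputs before they are attached to a release. The sketch below is hypothetical and not part of this patch series; it assumes the versioned filenames written by `build_llm_context.py` (`llm_context_compact_<tag>.md` and `llm_context_full_<tag>.md` under `llm_context/`) and uses a rough four-characters-per-token estimate rather than a real tokenizer.

```python
from pathlib import Path
import sys

# Hypothetical helper, not part of the repository: verify that both bundles
# produced by build_llm_context.py exist for a given tag and report their
# sizes, with a crude ~4 characters-per-token estimate (an assumption, not
# an exact token count).
def check_bundles(tag: str = "dev", out_dir: Path = Path("llm_context")) -> None:
    for kind in ("compact", "full"):
        bundle = out_dir / f"llm_context_{kind}_{tag}.md"
        if not bundle.exists():
            sys.exit(f"Missing bundle: {bundle}")
        text = bundle.read_text(encoding="utf-8")
        print(f"{bundle.name}: {len(text):,} chars (~{len(text) // 4:,} tokens)")


if __name__ == "__main__":
    check_bundles(sys.argv[1] if len(sys.argv) > 1 else "dev")
```

Such a check could also run as an additional workflow step before the release-upload action, though that is a design choice not made in these patches.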
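
For completeness, this is one way a downloaded bundle might be combined with a question, following the full-file prompt tip from `llm_context/README.md`. It is a minimal sketch: the helper function, the example path (which follows the script's local filename pattern; release assets may be named differently), and the sample question are placeholders, and no specific LLM client is assumed.

```python
from pathlib import Path

def build_prompt(context_file: Path, question: str) -> str:
    """Prepend the context bundle to a question, as suggested by the README prompt tip."""
    context = context_file.read_text(encoding="utf-8")
    instruction = (
        "You are answering questions about BayesFlow using only the provided "
        "context .md file containing all BayesFlow tutorials as well as the "
        "BayesFlow source code."
    )
    return f"{instruction}\n\n{context}\n\nQUESTION: {question}"


if __name__ == "__main__":
    # Placeholder bundle path and question; adjust to the file you downloaded.
    bundle = Path("llm_context/llm_context_full_dev.md")
    print(build_prompt(bundle, "How do I define a simulator in BayesFlow?")[:500])
```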