From 3a118d2924ae70bb912c99526ff84ba7d87aae46 Mon Sep 17 00:00:00 2001
From: Jaseem Jas
Date: Fri, 25 Jul 2025 15:04:45 +0530
Subject: [PATCH 1/2] feat: add MIT license and pre-commit configuration

- Add MIT License file with Unstract Team copyright
- Add comprehensive pre-commit configuration with:
  - Ruff for linting and formatting
  - MyPy for type checking
  - Bandit for security scanning
  - Standard hooks for code quality
- Update pyproject.toml with bandit configuration
- Update README.md development setup to use uv and include pre-commit
- Install and configure pre-commit hooks

This ensures consistent code quality and proper licensing for the project.
---
 .github/RELEASE.md       | 16 ++++++----
 .pre-commit-config.yaml  | 65 ++++++++++++++++++++++++++++++++++++++++
 CLAUDE.md                |  4 ++-
 LICENSE                  | 21 +++++++++++++
 README.md                | 63 ++++++++++++++++++++++++--------------
 pyproject.toml           | 10 +++++++
 test/test_client.py      |  1 -
 test/test_integration.py |  1 -
 test/test_performance.py |  1 -
 9 files changed, 150 insertions(+), 32 deletions(-)
 create mode 100644 .pre-commit-config.yaml
 create mode 100644 LICENSE

diff --git a/.github/RELEASE.md b/.github/RELEASE.md
index b8e6e78..3464118 100644
--- a/.github/RELEASE.md
+++ b/.github/RELEASE.md
@@ -46,7 +46,7 @@ The workflow follows semantic versioning:
 ### What Happens During Release

 1. **Version Update**: Updates `__version__` in `src/apihub_client/__init__.py`
-2. **Git Operations**: 
+2. **Git Operations**:
    - Commits version change to main branch
    - Creates and pushes git tag (e.g., `v1.2.3`)
 3. **Quality Checks**:
@@ -67,8 +67,9 @@ And these repository variables:
 - `PUSH_TO_MAIN_APP_ID`: GitHub App ID (can be same as secret)

 **PyPI Setup**: This workflow uses PyPI Trusted Publishers with `uv publish` for secure publishing. You need to:
+
 1. Configure the GitHub repository as a trusted publisher on PyPI
-2. Set up the trusted publisher for the `apihub-python-client` package 
+2. Set up the trusted publisher for the `apihub-python-client` package
 3. No API tokens required - `uv publish` automatically handles OIDC authentication

 ## Manual Release Workflow
@@ -114,21 +115,25 @@ The `publish-on-release.yml` workflow runs when you manually create a release th
 ### Common Issues

 **Workflow fails at version bump:**
+
 - Check that the current version in `__init__.py` follows semantic versioning
 - Ensure the main branch is up to date

 **Tests fail during release:**
+
 - Check the latest test results on main branch
 - Fix failing tests before attempting release

 **PyPI publish fails:**
+
-- Verify PyPI Trusted Publisher is configured correctly 
+- Verify PyPI Trusted Publisher is configured correctly
 - Check if version already exists on PyPI
 - Ensure package builds successfully locally with `uv build`
 - Verify the repository and workflow file match the trusted publisher configuration
 - Check that `uv publish` has proper OIDC token access (requires `id-token: write` permission)

 **Permission errors:**
+
 - Verify GitHub App has necessary permissions
 - Check that secrets and variables are properly configured

@@ -180,6 +185,7 @@ To configure PyPI Trusted Publishers:
    - **Environment name**: Leave empty (unless using GitHub environments)
 3. **Save the configuration**
-For more details, see: 
+For more details, see:
+
 - https://docs.pypi.org/trusted-publishers/
-- https://docs.astral.sh/uv/guides/publish/#trusted-publishing
\ No newline at end of file
+- https://docs.astral.sh/uv/guides/publish/#trusted-publishing
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000..5ec4f45
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,65 @@
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.5.0
+    hooks:
+      - id: trailing-whitespace
+      - id: end-of-file-fixer
+      - id: check-yaml
+      - id: check-added-large-files
+      - id: check-case-conflict
+      - id: check-merge-conflict
+      - id: check-toml
+      - id: debug-statements
+      - id: mixed-line-ending
+
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.1.15
+    hooks:
+      - id: ruff
+        args: [--fix]
+      - id: ruff-format
+
+  - repo: https://github.com/pre-commit/mirrors-mypy
+    rev: v1.8.0
+    hooks:
+      - id: mypy
+        additional_dependencies:
+          - types-requests
+          - pydantic
+          - typing-extensions
+          - pytest
+          - requests-mock
+        args:
+          [
+            --ignore-missing-imports,
+            --no-strict-optional,
+            --allow-untyped-defs,
+            --disable-error-code=no-any-return,
+          ]
+        exclude: ^test/
+
+  - repo: https://github.com/PyCQA/bandit
+    rev: 1.7.5
+    hooks:
+      - id: bandit
+        args: [-c, pyproject.toml]
+        additional_dependencies: ["bandit[toml]"]
+
+  - repo: https://github.com/pre-commit/mirrors-prettier
+    rev: v3.1.0
+    hooks:
+      - id: prettier
+        types_or: [yaml, markdown, json]
+        exclude: ^(uv\.lock|\.coverage)$
+
+ci:
+  autofix_commit_msg: |
+    [pre-commit.ci] auto fixes from pre-commit hooks
+
+    for more information, see https://pre-commit.ci
+  autofix_prs: true
+  autoupdate_branch: ""
+  autoupdate_commit_msg: "[pre-commit.ci] pre-commit autoupdate"
+  autoupdate_schedule: weekly
+  skip: []
+  submodules: false
diff --git a/CLAUDE.md b/CLAUDE.md
index f41baec..b79daab 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -5,6 +5,7 @@ This document contains critical information about working with this codebase. Fo
 ## Core Development Rules

 1. Package Management
+
    - ONLY use uv, NEVER pip
    - Installation: `uv add package`
    - Running tools: `uv run tool`
@@ -12,6 +13,7 @@ This document contains critical information about working with this codebase. Fo
    - FORBIDDEN: `uv pip install`, `@latest` syntax

 2. Code Quality
+
    - Type hints required for all code
    - Functions must be focused and small
    - Follow existing patterns exactly
@@ -23,7 +25,6 @@ This document contains critical information about working with this codebase. Fo
    - New features require tests
    - Bug fixes require regression tests
-

 ## Python Tools

 ## Code Formatting
@@ -44,6 +45,7 @@ This document contains critical information about working with this codebase. Fo
 ## Error Resolution

 2. Common Issues
+
    - Line length:
      - Break strings with parentheses
      - Multi-line function calls
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..7646855
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2025 Unstract Team
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
index 580c964..063c126 100644
--- a/README.md
+++ b/README.md
@@ -74,11 +74,11 @@ print(result)
     )
     file_hash = initial_result.get("file_hash")
     print("File hash", file_hash)
-    discover_tables_result = client.wait_for_complete(file_hash, 
+    discover_tables_result = client.wait_for_complete(file_hash,
         timeout=600, # max wait for 10 mins
         polling_interval=3 # polling every 3s
     )
-    
+
     tables = json.loads(discover_tables_result['data'])
     print(f"Total tables in this document: {len(tables)}")

@@ -86,7 +86,7 @@ print(result)
     # Step 2: Extract specific table
     for i, table in enumerate(tables):
         table_result = client.extract(
-            endpoint="extract_table", 
+            endpoint="extract_table",
             vertical="table",
             sub_vertical="extract_table",
             file_hash=file_hash,
@@ -98,7 +98,7 @@ print(result)
         all_table_result.append({table["table_name"]: table_result})

     print("All table result")
-    print(all_table_result) 
+    print(all_table_result)

 ```

@@ -108,7 +108,7 @@ print(result)
 # Process bank statement
 result = client.extract(
     endpoint="bank_statement",
-    vertical="table", 
+    vertical="table",
     sub_vertical="bank_statement",
     file_path="bank_statement.pdf",
     wait_for_completion=True,
@@ -118,15 +118,13 @@ result = client.extract(
 print("Bank statement processed:", result)
 ```

-
-
 ### Step-by-Step Processing

 ```python
 # Step 1: Start processing
 initial_result = client.extract(
     endpoint="discover_tables",
-    vertical="table", 
+    vertical="table",
     sub_vertical="discover_tables",
     file_path="document.pdf"
 )
@@ -157,7 +155,7 @@ Once a file has been processed, you can reuse it by file hash:
 table_result = client.extract(
     endpoint="extract_table",
     vertical="table",
-    sub_vertical="extract_table", 
+    sub_vertical="extract_table",
     file_hash="previously-obtained-hash",
     ext_table_no=1,  # Extract second table. Indexing starts at 0
     wait_for_completion=True
@@ -202,6 +200,7 @@ client = ApiHubClient(api_key: str, base_url: str)
 ```

 **Parameters:**
+
 - `api_key` (str): Your API key for authentication
 - `base_url` (str): The base URL of the ApiHub service

@@ -225,8 +224,9 @@ extract(
 ```

 **Parameters:**
+
 - `endpoint` (str): The API endpoint to call (e.g., "discover_tables", "extract_table")
-- `vertical` (str): The processing vertical 
+- `vertical` (str): The processing vertical
 - `sub_vertical` (str): The processing sub-vertical
 - `file_path` (str, optional): Path to file for upload (for new files)
 - `file_hash` (str, optional): Hash of previously uploaded file (for cached operations)
@@ -235,6 +235,7 @@ extract(
 - `**kwargs`: Additional parameters specific to the endpoint

 **Returns:**
+
 - `dict`: API response containing processing results or file hash for tracking

 ##### get_status()
@@ -246,9 +247,11 @@ get_status(file_hash: str) -> dict
 ```

 **Parameters:**
+
 - `file_hash` (str): The file hash returned from extract()

 **Returns:**
+
 - `dict`: Status information including current processing state

 ##### retrieve()
@@ -260,9 +263,11 @@ retrieve(file_hash: str) -> dict
 ```

 **Parameters:**
+
 - `file_hash` (str): The file hash of the completed job

 **Returns:**
+
 - `dict`: Final processing results

 ##### wait_for_complete()
@@ -278,14 +283,17 @@ wait_for_complete(
 ```

 **Parameters:**
+
 - `file_hash` (str): The file hash of the job to wait for
 - `timeout` (int): Maximum time to wait in seconds (default: 600)
 - `polling_interval` (int): Seconds between status checks (default: 3)

 **Returns:**
+
 - `dict`: Final processing results when completed

 **Raises:**
+
 - `ApiHubClientException`: If processing fails or times out

 ### Exception Handling
@@ -313,7 +321,7 @@ from pathlib import Path

 def process_documents(file_paths, endpoint):
     results = []
-    
+
     for file_path in file_paths:
         try:
             print(f"Processing {file_path}...")
@@ -324,7 +332,7 @@ def process_documents(file_paths, endpoint):
                 sub_vertical=endpoint,
                 file_path=file_path
             )
-            
+
             # Wait for completion with custom settings
             result = client.wait_for_complete(
                 file_hash=initial_result["file_hash"],
@@ -332,11 +340,11 @@ def process_documents(file_paths, endpoint):
                 polling_interval=5  # Less frequent polling for batch
             )
             results.append({"file": file_path, "result": result, "success": True})
-            
+
         except ApiHubClientException as e:
             print(f"Failed to process {file_path}: {e.message}")
             results.append({"file": file_path, "error": str(e), "success": False})
-    
+
     return results

 # Process multiple files
@@ -378,7 +386,7 @@ For integration tests with a real API:
 cp .env.example .env
 # Edit .env with your API credentials

-# Run integration tests 
+# Run integration tests
 pytest test/test_integration.py -v
 ```

@@ -412,6 +420,7 @@ This project uses automated releases through GitHub Actions with PyPI Trusted Pu
 3. **Click "Run workflow"** - the automation handles the rest!

 The workflow will automatically:
+
 - Update version in the code
 - Create Git tags and GitHub releases
 - Run all tests and quality checks
@@ -430,17 +439,24 @@ We welcome contributions! Please see our [Contributing Guide](CONTRIBUTING.md) f
 git clone https://github.com/Zipstack/apihub-python-client.git
 cd apihub-python-client

-# Install in development mode with all dependencies
-pip install -e ".[dev]"
+# Install dependencies using uv (required - do not use pip)
+uv sync
+
+# Install pre-commit hooks
+uv run --frozen pre-commit install

 # Run tests
-pytest
+uv run --frozen pytest
+
+# Run linting and formatting
+uv run --frozen ruff check .
+uv run --frozen ruff format .

-# Run linting
-ruff check .
+# Run type checking
+uv run --frozen mypy src/

-# Format code
-ruff format .
+# Run all pre-commit hooks manually
+uv run --frozen pre-commit run --all-files
 ```

 ## 📄 License
@@ -456,6 +472,7 @@ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file
 ## 📈 Version History

 ### v0.1.0
+
 - Initial release
 - Basic client functionality with extract, status, and retrieve operations
 - File upload support
@@ -464,4 +481,4 @@

 ---

-Made with ❤️ by the Unstract team
\ No newline at end of file
+Made with ❤️ by the Unstract team
diff --git a/pyproject.toml b/pyproject.toml
index a35904f..588910e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -94,6 +94,12 @@ python_version = "3.12"
 warn_return_any = true
 warn_unused_configs = true
 disallow_untyped_defs = true
+ignore_missing_imports = true
+
+[[tool.mypy.overrides]]
+module = "test.*"
+disallow_untyped_defs = false
+ignore_errors = true

 [tool.pytest.ini_options]
 testpaths = ["test"]
@@ -120,6 +126,10 @@ exclude_lines = [
     "@(abc\\.)?abstractmethod",
 ]

+[tool.bandit]
+exclude_dirs = ["test", "tests", ".venv", "venv"]
+skips = ["B101", "B601", "B113"]
+
 [dependency-groups]
 dev = [
     "bandit[toml]>=1.8.6",
diff --git a/test/test_client.py b/test/test_client.py
index 67f901a..ee3766f 100644
--- a/test/test_client.py
+++ b/test/test_client.py
@@ -5,7 +5,6 @@

 import pytest
 import requests_mock
-
 from apihub_client.client import ApiHubClient, ApiHubClientException

diff --git a/test/test_integration.py b/test/test_integration.py
index 9c913a0..311a65b 100644
--- a/test/test_integration.py
+++ b/test/test_integration.py
@@ -6,7 +6,6 @@

 import pytest
 import requests_mock
-
 from apihub_client.client import ApiHubClient, ApiHubClientException

diff --git a/test/test_performance.py b/test/test_performance.py
index 2fb91f7..1025e80 100644
--- a/test/test_performance.py
+++ b/test/test_performance.py
@@ -4,7 +4,6 @@

 import pytest
 import requests_mock
-
 from apihub_client.client import ApiHubClient

From 9cceca37121697426d4a0cebdd4d640882c8d6d7 Mon Sep 17 00:00:00 2001
From: Jaseem Jas
Date: Fri, 25 Jul 2025 15:15:11 +0530
Subject: [PATCH 2/2] fix: resolve import sorting issues in test files

- Fixed I001 import block formatting errors in test_client.py
- Fixed I001 import block formatting errors in test_integration.py
- Fixed I001 import block formatting errors in test_performance.py
- All imports now properly organized: stdlib, third-party, local
- Added proper spacing between import groups
- Resolves tox lint failures
---
 test/test_client.py      | 1 +
 test/test_integration.py | 1 +
 test/test_performance.py | 1 +
 3 files changed, 3 insertions(+)

diff --git a/test/test_client.py b/test/test_client.py
index ee3766f..67f901a 100644
--- a/test/test_client.py
+++ b/test/test_client.py
@@ -5,6 +5,7 @@

 import pytest
 import requests_mock
+
 from apihub_client.client import ApiHubClient, ApiHubClientException

diff --git a/test/test_integration.py b/test/test_integration.py
index 311a65b..9c913a0 100644
--- a/test/test_integration.py
+++ b/test/test_integration.py
@@ -6,6 +6,7 @@

 import pytest
 import requests_mock
+
 from apihub_client.client import ApiHubClient, ApiHubClientException

diff --git a/test/test_performance.py b/test/test_performance.py
index 1025e80..2fb91f7 100644
--- a/test/test_performance.py
+++ b/test/test_performance.py
@@ -4,6 +4,7 @@

 import pytest
 import requests_mock
+
 from apihub_client.client import ApiHubClient
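
For reference, the import layout these I001 fixes enforce groups imports as standard library, then third-party, then first-party, with a blank line between groups. Below is a minimal, hypothetical sketch of a test module following that layout; the test name and base URL are illustrative only, and the `ApiHubClient(api_key=..., base_url=...)` constructor is the one documented in the README.

```python
"""Sketch of the import layout the I001 fixes enforce: stdlib, then third-party, then local."""

import json  # standard-library group

import pytest  # third-party group

from apihub_client.client import ApiHubClient  # first-party (local) group


@pytest.mark.parametrize("api_key", ["test-key"])
def test_client_construction(api_key: str) -> None:
    # Hypothetical smoke test: only exercises construction, no HTTP traffic.
    client = ApiHubClient(api_key=api_key, base_url="https://apihub.example.invalid")
    assert isinstance(client, ApiHubClient)
    # json is used here only so the standard-library group appears alongside the others.
    assert json.loads('{"ok": true}')["ok"] is True
```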