diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 5dbd67d..362d4dd 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -9,6 +9,9 @@ on: pull_request: branches: [ master ] +permissions: + contents: read + jobs: build: runs-on: ubuntu-latest @@ -18,9 +21,9 @@ jobs: python-version: ["3.12", "3.13", "3.14"] steps: - - uses: actions/checkout@v6 + - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6 - name: Setup Python ${{ matrix.python-version }} - uses: actions/setup-python@v6 + uses: actions/setup-python@ece7cb06caefa5fff74198d8649806c4678c61a1 # v6 with: python-version: ${{ matrix.python-version }} - name: Installing Workflow Dependencies diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 4a003d1..431c63a 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -10,11 +10,13 @@ permissions: jobs: release-build: runs-on: ubuntu-latest + permissions: + contents: read steps: - - uses: actions/checkout@v6 + - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6 - - uses: actions/setup-python@v6 + - uses: actions/setup-python@ece7cb06caefa5fff74198d8649806c4678c61a1 # v6 with: python-version: "3.12" @@ -24,7 +26,7 @@ jobs: python -m build - name: Upload Distributions - uses: actions/upload-artifact@v7 + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7 with: name: release-dists path: dist/ @@ -42,12 +44,12 @@ jobs: steps: - name: Retrieve Release Distributions - uses: actions/download-artifact@v8 + uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8 with: name: release-dists path: dist/ - name: Publish Release Distributions to PyPI - uses: pypa/gh-action-pypi-publish@release/v1 + uses: pypa/gh-action-pypi-publish@cef221092ed1bacb1cc03d23a2d87d1d172e277b # release/v1 with: packages-dir: dist/ diff --git a/.github/workflows/security.yml b/.github/workflows/security.yml new file mode 100644 index 0000000..62631fa --- /dev/null +++ b/.github/workflows/security.yml @@ -0,0 +1,41 @@ +# This workflow runs static security analysis (bandit) and audits the +# installed dependency tree for known vulnerabilities (pip-audit). +# For more information see: +# https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions + +name: Security Scan (bandit, pip-audit) + +on: + push: + branches: [ master ] + pull_request: + branches: [ master ] + +permissions: + contents: read + +jobs: + scan: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6 + - name: Setup Python 3.12 + uses: actions/setup-python@ece7cb06caefa5fff74198d8649806c4678c61a1 # v6 + with: + python-version: "3.12" + - name: Install Package and Security Tooling + run: | + python -m pip install --upgrade pip + python -m pip install -e . + python -m pip install bandit pip-audit + - name: Static Security Analysis (bandit) + # ? scan the package source for common security anti-patterns; + # ? tests are excluded as fixtures (e.g. dummy tokens) trip + # ? false positives that are not production security concerns + run: | + bandit -r polyskills/ -x polyskills/tests + - name: Dependency Vulnerability Audit (pip-audit) + # ? fail the build on any known-vulnerable dependency + run: | + pip-audit diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 8f5239e..25508b1 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -23,9 +23,9 @@ jobs: python-version: ["3.12", "3.13", "3.14"] steps: - - uses: actions/checkout@v6 + - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6 - name: Setup Python ${{ matrix.python-version }} - uses: actions/setup-python@v6 + uses: actions/setup-python@ece7cb06caefa5fff74198d8649806c4678c61a1 # v6 with: python-version: ${{ matrix.python-version }} - name: Install Package and Test Dependencies diff --git a/CHANGELOG.md b/CHANGELOG.md index 6feeaf9..134d37c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -50,6 +50,119 @@ under `h3` tags, while the `micro` and "version identifiers" are listed under `h +### PolySkills v2.1.0 | 2026-06-27 + +The `v2.1.0` line is a security-hardening, robustness, and observability release. Born out of a +full audit of the remote download and extraction pipeline, it closes the supply-chain gaps that let +untrusted network traffic and crafted archives influence what landed on disk, hardens the CI/CD +pipeline against tag-mutation attacks, and adds an opt-out local tracking database so `polyskills` +can answer "what did I install, from where, into which directory, and at which commit?" across every +project and machine. The `v2.0.0` CLI surface is kept intact. + +> [!NOTE] +> **Behaviour changes (non-breaking API).** TLS verification is now **on by default** (it was +> effectively disabled before); input validation raises `polyskills.error` exceptions +> (`ValidationError` still subclasses `ValueError`); the CLI renders failures as a clean +> `[ERROR]` message with a non-zero exit instead of a traceback; and every `manager` fetch is +> recorded in `~/.polyskills/records.db` unless `--no-tracking` is passed. The sub-command surface +> and existing call sites are unchanged. + +#### 🎉 Major Features + + * **Local tracking database.** A new `polyskills.database` sub-package records every fetch + into `~/.polyskills/records.db` (resolved OS-independently via `Path.home()`). The schema + is third-normal-form across seven tables (`sources`, `libraries`, `remotes`, `extensions`, + `installations`, `environments`, `fetch_events`) mapped with the SQLAlchemy 2.0 ORM. + Derived facts - first-fetched, last-updated, current commit SHA - are computed from the + append-only event log rather than cached on any row. + * **`records` sub-command.** A read-only `polyskills records [--name ] [--db ]` + command lists every tracked installation or shows one extension's full fetch history - + install location, resolved commit SHA, and first-fetched / last-updated timestamps. It + never creates the database when it is absent. + +#### ✨ Feature Enhancements + + * **Configurable TLS verification.** Verification defaults to secure (`verify=True`, honouring + `REQUESTS_CA_BUNDLE` / `SSL_CERT_FILE`). Two new mutually exclusive flags tune it: `--no-verify` + disables it for a trusted network or behind a TLS-intercepting proxy, and `--request-cert` pins + the bundled `certifi` authorities for strict verification that fails closed when only an + interception certificate is available. + * **Domain exception hierarchy.** `polyskills.error` now defines `PolyskillsError` with + `ValidationError`, `RemoteError`, and `ExtractionError`, so callers can catch a specific failure + mode while a single `except PolyskillsError` still covers them all. + * **Identifiable client.** Every GitHub request now advertises a `polyskills/` + `User-Agent` header, and request timeouts are centralised as named constants. + * **`--no-tracking` flag.** `polyskills manager ... --no-tracking` skips the database write + for a single invocation; tracking is otherwise on by default. + * **`resolveCommit` on source managers.** `SourceManager.resolveCommit(remote, version)` + resolves a tag, branch, or SHA to its concrete commit SHA; `GithubManager` implements it via + the commits REST endpoint (honouring the configurable TLS policy) so the exact installed + commit is recorded. + * **Best-effort tracking.** Every tracking write is wrapped so a missing dependency, a + read-only home directory, or a locked database degrades to a single warning and never + alters the fetch outcome or the process exit status. + +#### 🛠️ Security Hardening & Fixes + + * **TLS no longer disabled.** The hard-coded `verify=False` on every remote request — a + man-in-the-middle exposure — is removed in favour of the configurable policy above; the + tracking database's `resolveCommit` request honours the same policy. + * **Archive path-traversal guard.** Each tarball member is resolved against the destination subtree + and rejected with `ExtractionError` if it escapes; non file/dir members (symlinks, devices) are + skipped on purpose. + * **Size caps.** The compressed download and the cumulative uncompressed size are bounded to defang + disk-exhaustion and decompression-bomb archives. + * **Validation hardened.** Argument checks raise exceptions instead of `assert` (which `python -O` + strips); URL path / ref segments are percent-encoded against query injection; and tag pagination + is capped to stop an endless `next`-link chain. + * **Clean CLI errors.** `main()` funnels expected failures into a concise `[ERROR]` message and a + non-zero exit; set `POLYSKILLS_DEBUG` to re-raise the full traceback. + * **Version & dependencies.** `__version__` is bumped to `v2.1.0`, the unused `packaging` runtime + dependency is dropped, and `sqlalchemy>=2.0` is added for the tracking database. + +#### 🧪 Testing & CI + + * **Hermetic-by-default suite.** The default `unittest discover` run is now fully offline; the + live-network tests are opt-in via `POLYSKILLS_RUN_LIVE=1` (with `POLYSKILLS_NO_VERIFY=1` for a + proxied network). + * **Security regression module (`test_security.py`).** Mocked-transport tests lock in TLS + forwarding, path-traversal rejection, the download / extraction size caps, the pagination cap, + and verify-flag resolution. + * **Offline database suite (`test_database.py`).** 18 hermetic cases cover path resolution, + schema and pragmas, cascade delete, event recording, strict normalisation, derived facts, + failure capture, graceful degradation, distinct-extension isolation, and the new CLI arguments. + * **Suite database redirect.** `tests/__init__.py` redirects `POLYSKILLS_DB_PATH` onto a + temporary file so no test ever reads or writes the real user database. + * **Supply-chain-hardened workflows.** Every GitHub Action is pinned to a commit SHA (the mutable + tag kept as a comment), a new `security.yml` runs `bandit` and `pip-audit`, and least-privilege + `permissions` are declared on the linting and release-build jobs. + +#### 💣 Code Refactoring & Internals + + * Superseded and removed the earlier stdlib-`sqlite3` tracker prototype in favour of the + SQLAlchemy `polyskills.database` package; its development history is retained as a reachable + git ancestor through the merge into this line. + +#### 📦 Installation + +```shell +$ pip install "polyskills==2.1.0" + +$ polyskills list https://github.com// skills --source extensions/skills + +$ polyskills manager https://github.com// \ + --name sql-code-format \ + --destination ~/.claude/skills/sql-code-format \ + --request-cert \ + skills # secure TLS, tracked + +$ polyskills manager https://github.com// \ + --name sql-code-format --no-tracking skills # skip tracking + +$ polyskills records # list tracked installs +$ polyskills records --name sql-code-format # full history of one +``` + ### PolySkills v2.0.0 | 2026-06-07 The `v2.0.0` line redesigns `polyskills` from a "skills-only fetcher" into a portable, multi-library diff --git a/README.md b/README.md index bf8e60e..de63430 100644 --- a/README.md +++ b/README.md @@ -79,7 +79,7 @@ default) of Python language and AI tools. ### 🧰 CLI Overview Once installed, the `polyskills` command is exposed on the system `PATH`. Use the `--help` flag at any level to discover -documentation for all sub-commands and flags. The CLI is organized around four sub-commands: +documentation for all sub-commands and flags. The CLI is organized around five sub-commands: | 🔖 Command | 🎯 Purpose | | :--- | :--- | @@ -87,6 +87,7 @@ documentation for all sub-commands and flags. The CLI is organized around four s | `sources` | List the supported remote source providers (e.g., GitHub) and exit. | | `list` | Enumerate available extensions under a remote `` directory, no download. | | `manager` | Fetch a single extension (`skills` / `agents`) from the remote into a local directory. | +| `records` | Inspect the local tracking database (install location, commit SHA, timestamps). | ```shell $ polyskills --help # top level documentation and sub-commands @@ -174,6 +175,44 @@ The `--pagination` flag (defaults to `100`, the GitHub maximum) tunes how many e during enumeration. The `--version` flag (defaults to `master`) pins the fetch to an exact tag or commit SHA so the extension content is reproducible across systems. +### 🗃️ Local Tracking Database + +Every `polyskills manager` fetch is recorded by default into a local SQLite database at `~/.polyskills/records.db` +(resolved OS-independently via `Path.home()`) so you can answer "what did I install, from where, into which directory, +and at which commit?" across projects and machines. The schema is strictly normalised and managed through the +*SQLAlchemy* ORM, and every write is **best-effort**: a missing dependency, a read-only home directory, or a locked +database degrades to a single warning and never breaks the fetch. Pass `--no-tracking` to skip the write for a single +invocation. + +```shell +$ polyskills manager https://github.com/PyUtility/polyskills \ + --name sql-code-format \ + --destination ~/.claude/skills/sql-code-format \ + skills # recorded in ~/.polyskills/records.db + +$ polyskills manager https://github.com/PyUtility/polyskills \ + --name sql-code-format --no-tracking skills # not recorded +``` + +Use the read-only `records` sub-command to inspect what has been tracked. Without `--name` it lists every tracked +installation; with `--name` it shows one extension's full fetch history (install location, resolved commit SHA, and +first-fetched / last-updated timestamps). The command never creates the database when it is absent. + +```shell +$ polyskills records +>> Tracked installations (1): +>> >> 01. sql-code-format [skills] +>> remote : https://github.com/PyUtility/polyskills (extensions/skills) +>> path : /home/user/.claude/skills/sql-code-format +>> commit : 5b23f55de9a1... (requested master) +>> history : first 2026-06-27 12:53:52 | last 2026-06-27 12:53:52 | 1 fetch(es), last success + +$ polyskills records --name sql-code-format # full per-fetch history of one extension +``` + +The same data is available through the Python API - `record_fetch`, `list_installations`, and `get_installation` +in `polyskills.database` - so tracking can be embedded in automation without the CLI. + ### 🐍 Programmatic Usage In addition to the CLI, every primitive is exposed as a Python API so the same workflow can be embedded inside automation diff --git a/polyskills/__init__.py b/polyskills/__init__.py index ee87dc7..e3900dd 100644 --- a/polyskills/__init__.py +++ b/polyskills/__init__.py @@ -14,6 +14,6 @@ providing one source of truth across different system and projects. """ -__version__ = "v2.0.0" +__version__ = "v2.1.0" # init-time options registrations, use api/ for public functions diff --git a/polyskills/cli.py b/polyskills/cli.py index 3bc38cb..6a4ee53 100644 --- a/polyskills/cli.py +++ b/polyskills/cli.py @@ -20,10 +20,21 @@ import os import sys +import time import argparse from pathlib import Path -from typing import Callable, Optional, Union +from typing import Any, Callable, Dict, Optional, Tuple, Union + +from polyskills.apps.tools import SupportedTools +from polyskills.error.exceptions import ValidationError +from polyskills.remote.sources import ( + GithubManager, SourceControl, SourceManager, ValidSources +) +from polyskills.database import ( + default_database_path, get_installation, list_installations, + record_fetch, set_invocation_context +) def _expandUserPath(value : Union[str, Path]) -> Path: @@ -55,10 +66,59 @@ def _expandUserPath(value : Union[str, Path]) -> Path: expanded = os.path.expandvars(os.fspath(value)) return Path(expanded).expanduser() -from polyskills.apps.tools import SupportedTools -from polyskills.remote.sources import ( - GithubManager, SourceControl, SourceManager, ValidSources -) + +def _resolveVerify( + no_verify : bool = False, request_cert : bool = False +) -> Union[bool, str]: + """ + Resolve the ``verify`` value forwarded to every remote request + from the mutually exclusive ``--no-verify`` / ``--request-cert`` + command line flags. + + The default (neither flag) returns ``True`` so :mod:`requests` + uses its standard trust store while still honouring the + ``REQUESTS_CA_BUNDLE`` / ``SSL_CERT_FILE`` environment overrides + that are common behind a corporate proxy. ``--no-verify`` returns + ``False`` to disable verification, while ``--request-cert`` pins + the bundled :mod:`certifi` authorities for strict verification + that deliberately fails when only an intercepting certificate is + available. + + :type no_verify: bool + :param no_verify: Disable verification entirely when ``True``. + + :type request_cert: bool + :param request_cert: Pin the bundled trust store when ``True``. + + :raises ValidationError: If both flags are requested together, or + the :mod:`certifi` bundle cannot be located for strict mode. + + :rtype: Union[bool, str] + :returns: ``True`` / ``False`` or a path to a CA bundle, suitable + for the ``verify`` keyword of :func:`requests.get`. + """ + + if no_verify and request_cert: + raise ValidationError( + "Cannot use --no-verify together with --request-cert." + ) + + if no_verify: + return False + + if request_cert: + try: + import certifi + except ImportError as error: + raise ValidationError( + "Strict --request-cert mode needs the 'certifi' " + "package, which could not be imported." + ) from error + + return certifi.where() + + return True + # ? registry mapping each supported remote enumeration to its concrete # ? :class:`SourceManager` factory; extending the framework with a new @@ -68,6 +128,42 @@ def _expandUserPath(value : Union[str, Path]) -> Path: } +def _summariseDestination(destination : Path) -> Tuple[int, int]: + """ + Walk ``destination`` and return ``(file_count, total_bytes)`` so a + successful fetch can be recorded with its on-disk footprint. + + The helper is best-effort: any :class:`OSError` raised while + iterating the tree is swallowed and the partial counts gathered so + far are returned, preserving the "tracking is never fatal" + contract of the command-line interface. + + :type destination: pathlib.Path + :param destination: Directory whose files are counted and sized. + + :rtype: Tuple[int, int] + :returns: A two-tuple ``(file_count, total_bytes)`` of the regular + files found beneath ``destination``. + """ + + file_count = 0 + total_bytes = 0 + + try: + for path in Path(destination).rglob("*"): + if not path.is_file(): + continue + file_count += 1 + try: + total_bytes += path.stat().st_size + except OSError: + continue + except OSError: + pass + + return file_count, total_bytes + + def buildParser() -> argparse.ArgumentParser: """ Constructs the :func:`argparse.parser` for the :mod:`polyskills` @@ -156,6 +252,25 @@ def buildRemoteControls() -> argparse.ArgumentParser: ) ) + # ? mutually exclusive TLS verification controls. The default + # ? (neither flag) keeps secure verification while honouring + # ? REQUESTS_CA_BUNDLE / SSL_CERT_FILE for proxied networks. + tlsControls = remoteCommon.add_mutually_exclusive_group() + tlsControls.add_argument( + "--no-verify", action = "store_true", help = ( + "Disable TLS certificate verification for every remote " + "request. Discouraged - use only on a trusted network " + "or behind a TLS-intercepting corporate proxy." + ) + ) + tlsControls.add_argument( + "--request-cert", action = "store_true", help = ( + "Enforce strict TLS verification against the bundled " + "certificate authorities, ignoring environment " + "overrides. Fails if a trusted certificate is absent." + ) + ) + return remoteCommon # ? Creating Subparsers:: SOURCES - List Available Sources @@ -194,6 +309,36 @@ def buildRemoteControls() -> argparse.ArgumentParser: ]) ) + # ? Creating Subparsers:: RECORDS - Inspect the Tracking Database + records = subparser.add_parser( + "records", help = ( + "Inspect the local tracking database " + "(~/.polyskills/records.db) that records every extension " + "fetched on this system - where it is installed, the " + "resolved commit SHA, and when it was first fetched and " + "last updated. Without '--name' every tracked installation " + "is listed; with '--name' the full fetch history of the " + "matching extension is shown. This command is read-only " + "and never creates the database." + ) + ) + + records.add_argument( + "-n", "--name", type = str, default = None, metavar = "", help = ( + "Show the detailed fetch history for the extension with " + "this leaf name (e.g., 'sql-code-format') instead of the " + "summary listing of every tracked installation." + ) + ) + + records.add_argument( + "--db", type = str, default = None, metavar = "", help = ( + "Path to the tracking database to read, defaults to " + "'~/.polyskills/records.db'. Primarily an advanced or " + "testing affordance." + ) + ) + # shared remote arguments across sub-parsers remoteControls = buildRemoteControls() @@ -208,7 +353,8 @@ def buildRemoteControls() -> argparse.ArgumentParser: "list", parents = [remoteControls], usage = ( "polyskills list [-h] [-s SOURCE] [--pagination PAGINATION] " - "[--token TOKEN] [--version VERSION] remote LIBRARY" + "[--token TOKEN] [--version VERSION] " + "[--no-verify | --request-cert] remote LIBRARY" ), help = ( "List the available extensions (skills, agents, etc.) " @@ -239,7 +385,8 @@ def buildRemoteControls() -> argparse.ArgumentParser: usage = ( "polyskills manager [-h] -n NAME [--exists {fail,overwrite,merge}] " "[-d DESTINATION] [-s SOURCE] [--pagination PAGINATION] " - "[--token TOKEN] [--version VERSION] remote LIBRARY ..." + "[--token TOKEN] [--version VERSION] " + "[--no-verify | --request-cert] remote LIBRARY ..." ), help = ( "Main method to manage skills, agents, etc. for a LLM " @@ -281,6 +428,18 @@ def buildRemoteControls() -> argparse.ArgumentParser: ) ) + manager.add_argument( + "--no-tracking", action = "store_true", default = False, + help = ( + "Disable recording this installation in the local tracking " + "database (~/.polyskills/records.db). By default every " + "fetch - successful or failed - is tracked so the install " + "location, resolved commit SHA, and first-fetched / " + "last-updated timestamps are retained; pass this flag to " + "skip the database write entirely for this invocation." + ) + ) + # ? Create a Sub-Parser for the ``manager`` Parser for Libraries # ? {``skills``, ``agents``, ...} for different actions libraries = manager.add_subparsers( @@ -325,7 +484,7 @@ def _resolveManager( new providers light up automatically once they are added to the :data:`_SOURCE_MANAGERS` registry above. - :raises ValueError: If ``remote`` does not match any of the + :raises ValidationError: If ``remote`` does not match any of the registered remote URL patterns. """ @@ -335,7 +494,7 @@ def _resolveManager( return candidate supported = ", ".join(member.value for member in _SOURCE_MANAGERS) - raise ValueError( + raise ValidationError( f"Remote `{remote}` is not supported. " f"Supported sources: {supported}" ) @@ -394,13 +553,231 @@ def listExtensions( ) -def main() -> None: +def _recordFetch( + args : argparse.Namespace, control : SourceControl, + status : str, error : Optional[str], duration_ms : int +) -> None: """ - Entry point for CLI tool for :mod:`polyskills` that parses the - command line arguments using :mod:`argparse` module. + Record one ``manager`` fetch attempt in the tracking database. + + The helper resolves the remote slug, the provider metadata, the + resolved commit SHA, and the on-disk footprint, then delegates to + :func:`polyskills.database.record_fetch`. It is strictly + best-effort: every step is wrapped so a tracking failure - a + network error while resolving the SHA, an unreadable destination, + or a database fault - never propagates into the CLI exit path. + + :type args: argparse.Namespace + :param args: Parsed ``manager`` namespace carrying the remote, + name, library, source, destination, version, and exists policy. + + :type control: SourceControl + :param control: Source control built from ``--pagination`` and + ``--token``, reused to resolve the provider and commit SHA. + + :type status: str + :param status: Either ``"success"`` or ``"failed"``. + + :type error: Optional[str] + :param error: Error description recorded on a failed attempt. + + :type duration_ms: int + :param duration_ms: Wall-clock duration of the attempt in + milliseconds. + + :rtype: None + :returns: Nothing - the row is written as a best-effort side + effect; failures are swallowed. """ - args = buildParser().parse_args() + try: + manager = _resolveManager(remote = args.remote, control = control) + owner, repository = manager.getSlug(remote = args.remote) + + commit_sha : Optional[str] = None + file_count : Optional[int] = None + total_bytes : Optional[int] = None + + if status == "success": + commit_sha = manager.resolveCommit(args.remote, args.version) + file_count, total_bytes = _summariseDestination( + args.destination + ) + + action = "fetch" if args.exists == "fail" else args.exists + + record_fetch( + source_name = manager.source.name, + source_base_url = manager.source.value, + owner = owner, repository = repository, + remote_url = args.remote, + library = args.library, name = args.name, + source_dir = Path(str(args.source)).as_posix(), + install_path = str(Path(args.destination).resolve()), + requested_version = args.version, + action = action, status = status, + resolved_commit_sha = commit_sha, + file_count = file_count, total_bytes = total_bytes, + duration_ms = duration_ms, error = error, + ) + except Exception: # nosec B110 - intentional best-effort swallow + # ! tracking is strictly best-effort; a failure here must never + # ! break the install path or alter the CLI exit status. + pass + + +def _formatTimestamp(value : Any) -> str: + """ + Render a stored timestamp for display, or ``n/a`` when absent. + + :type value: Any + :param value: A :class:`datetime.datetime` (or ``None``) read back + from the tracking database. + + :rtype: str + :returns: A ``YYYY-MM-DD HH:MM:SS`` string, ``n/a`` for ``None``, + or the raw string form for any unexpected value. + """ + + if value is None: + return "n/a" + try: + return value.strftime("%Y-%m-%d %H:%M:%S") + except (AttributeError, ValueError): + return str(value) + + +def _formatRecordSummary(index : int, row : Dict[str, Any]) -> str: + """ + Render one installation summary block for the ``records`` listing. + + :type index: int + :param index: Zero-based position used to build the display number. + + :type row: Dict[str, Any] + :param row: One summary mapping from + :func:`polyskills.database.list_installations`. + + :rtype: str + :returns: A multi-line, indented summary block for the + installation. + """ + + number = str(index + 1).zfill(2) + sha = row.get("resolved_commit_sha") or "n/a" + + return "\n".join([ + f" >> {number}. {row['name']} [{row['library']}]", + f" remote : {row['remote_url']} ({row['source_dir']})", + f" path : {row['install_path']}", + f" commit : {sha} (requested {row['requested_version']})", + f" history : first {_formatTimestamp(row['first_fetched_at'])}" + f" | last {_formatTimestamp(row['last_updated_at'])}" + f" | {row['fetch_count']} fetch(es), last {row['last_status']}", + ]) + + +def _formatRecordDetail(detail : Dict[str, Any]) -> str: + """ + Render a detailed installation block with its full event history. + + :type detail: Dict[str, Any] + :param detail: One detail mapping from + :func:`polyskills.database.get_installation`, including its + chronological ``events`` list. + + :rtype: str + :returns: A multi-line block describing the installation and every + recorded fetch attempt against it. + """ + + lines = [ + f">> {detail['name']} [{detail['library']}]", + f" remote : {detail['remote_url']} ({detail['source_dir']})", + f" path : {detail['install_path']}", + f" commit : {detail.get('resolved_commit_sha') or 'n/a'}", + f" first : {_formatTimestamp(detail['first_fetched_at'])}", + f" last : {_formatTimestamp(detail['last_updated_at'])}", + f" events ({len(detail['events'])}):", + ] + + for event in detail["events"]: + sha = event.get("resolved_commit_sha") or "n/a" + lines.append( + f" - {_formatTimestamp(event['occurred_at'])}" + f" {event['status']:7} {event['action']:9}" + f" {event['requested_version']} -> {sha}" + f" ({event['invoked_via']})" + ) + if event.get("error"): + lines.append(f" error: {event['error']}") + + return "\n".join(lines) + + +def _showRecords(name : Optional[str], db : Optional[str]) -> None: + """ + Render the contents of the tracking database to the terminal. + + Without ``name`` a per-installation summary is printed; with + ``name`` the full chronological fetch history of every matching + installation is shown. The command is read-only: when the database + file does not exist it reports the fact and returns without + creating it. + + :type name: Optional[str] + :param name: Extension leaf name to detail, or ``None`` to list + every tracked installation. + + :type db: Optional[str] + :param db: Explicit database path, or ``None`` for the canonical + ``~/.polyskills/records.db``. + + :rtype: None + :returns: Nothing - the records are printed as a side effect. + """ + + database_path = _expandUserPath(db) if db else default_database_path() + + if not database_path.exists(): + print(f"No tracking database found at `{database_path}`.") + print("Fetch an extension with 'polyskills manager ...' first.") + return None + + if name: + details = get_installation(name, database_path = database_path) + if not details: + print(f"No tracked installation found for `{name}`.") + return None + + for detail in details: + print(_formatRecordDetail(detail)) + return None + + rows = list_installations(database_path = database_path) + if not rows: + print(f"No tracked installations in `{database_path}`.") + return None + + print(f"Tracked installations ({len(rows)}):") + for idx, row in enumerate(rows): + print(_formatRecordSummary(idx, row)) + return None + + +def _dispatch(args : argparse.Namespace) -> None: + """ + Execute the parsed CLI command. + + Separated from :func:`main` so that every transport or filesystem + error raised while contacting a remote bubbles up to a single + top-level handler that renders a concise message instead of a raw + traceback. Set ``POLYSKILLS_DEBUG`` in the environment to re-raise + the original exception for debugging. + + :type args: argparse.Namespace + :param args: Parsed command line arguments from :func:`buildParser`. + """ # ? terminal commands: 'sources', 'tools' - print and exit early # ? these subparsers attach a ``func`` default; manager does not. @@ -410,13 +787,34 @@ def main() -> None: print(output) return None + # ? dispatch the 'records' subcommand: read-only inspection of the + # ? local tracking database. It contacts no remote and carries no + # ? TLS flags, so it is handled before TLS verification is resolved. + if args.command == "records": + _showRecords(name = args.name, db = args.db) + return None + + # ? resolve TLS verification shared by the 'list' and 'manager' + # ? subcommands before any remote request leaves the process + verify = _resolveVerify(args.no_verify, args.request_cert) + if args.no_verify: + import urllib3 + urllib3.disable_warnings( + urllib3.exceptions.InsecureRequestWarning + ) + print( + "[WARNING] TLS verification is disabled (--no-verify).", + file = sys.stderr + ) + # ? dispatch the 'list' subcommand: enumerate extensions and exit if args.command == "list": source = _expandUserPath( args.source or f"./{args.library}" ).as_posix() control = SourceControl( - pagination = args.pagination, token = args.token + pagination = args.pagination, token = args.token, + verify = verify ) names = listExtensions( @@ -453,17 +851,66 @@ def main() -> None: # ? dispatch to the concrete remote manager via the wrapper above control = SourceControl( - pagination = args.pagination, token = args.token + pagination = args.pagination, token = args.token, + verify = verify ) - get( - remote = args.remote, name = args.name, - source = Path(args.source), destination = args.destination, - version = args.version, exists = args.exists, - control = control, - ) + # ? record every fetch in the local tracking database unless the + # ? caller opted out with ``--no-tracking``; the recording is + # ? wrapped so a database fault never alters the fetch outcome or + # ? the process exit status. + tracking = not getattr(args, "no_tracking", False) + if tracking: + set_invocation_context("cli") + + started = time.monotonic() + try: + get( + remote = args.remote, name = args.name, + source = Path(args.source), destination = args.destination, + version = args.version, exists = args.exists, + control = control, + ) + except Exception as exc: + if tracking: + _recordFetch( + args, control, status = "failed", error = repr(exc), + duration_ms = int((time.monotonic() - started) * 1000), + ) + raise + else: + if tracking: + _recordFetch( + args, control, status = "success", error = None, + duration_ms = int((time.monotonic() - started) * 1000), + ) return None + +def main() -> None: + """ + Entry point for the :mod:`polyskills` CLI. Parses the command line + arguments and dispatches to :func:`_dispatch`, funnelling expected + runtime failures into a clean, non-zero exit rather than a raw + traceback. Set ``POLYSKILLS_DEBUG`` to surface the full traceback. + """ + + args = buildParser().parse_args() + + try: + return _dispatch(args) + except KeyboardInterrupt: + print( + "\n[ABORTED] Interrupted by the user.", file = sys.stderr + ) + raise SystemExit(130) + except Exception as error: + if os.environ.get("POLYSKILLS_DEBUG"): + raise + print(f"[ERROR] {error}", file = sys.stderr) + raise SystemExit(1) + + if __name__ == "__main__": sys.exit(main()) diff --git a/polyskills/database/__init__.py b/polyskills/database/__init__.py new file mode 100644 index 0000000..c5a123b --- /dev/null +++ b/polyskills/database/__init__.py @@ -0,0 +1,65 @@ +# -*- encoding: utf-8 -*- + +""" +Local Tracking Database For The Polyskills Framework +==================================================== + +The :mod:`polyskills.database` sub-package owns the local SQLite +tracking database that records every extension (a skill, an agent, +etc.) fetched and flushed into a destination directory by the CLI +tool or a direct Python API caller. + +The database lives at a fixed, operating-system independent location +under the user home directory: + + ``~/.polyskills/records.db`` + +Persistence is implemented with the SQLAlchemy Object Relational +Mapper against a strictly normalised relational schema (see +:mod:`polyskills.database.models`) so the recorded facts - which +remote an extension came from, where it was installed, the resolved +commit SHA, and the first-fetched / last-updated timestamps - can be +queried without duplicated, denormalised columns. + +Every write performed through :mod:`polyskills.database.tracker` is +best-effort: a missing dependency, a read-only home directory, a +locked database, or any other fault is swallowed and surfaced as a +single per-process warning so the install path of the framework is +never broken by a tracking failure. + +:NOTE: SQLAlchemy is a declared runtime dependency, but its presence + is still probed at import time and exposed through + :data:`SQLALCHEMY_AVAILABLE`. When the dependency is absent every + tracking entry point degrades to a silent no-op instead of + raising :class:`ImportError`. +""" + +try: + import sqlalchemy as _sqlalchemy # noqa: F401 +except ImportError: # pragma: no cover - exercised only without the dep + SQLALCHEMY_AVAILABLE = False +else: + SQLALCHEMY_AVAILABLE = True + +from polyskills.database.paths import ( + DATABASE_FILENAME, ENVIRON_DB_PATH, POLYSKILLS_HOME, + default_database_path +) +from polyskills.database.tracker import ( + get_installation, get_invocation_context, list_installations, + record_fetch, reset_cache_for_tests, set_invocation_context +) + +__all__ = [ + "SQLALCHEMY_AVAILABLE", + "DATABASE_FILENAME", + "ENVIRON_DB_PATH", + "POLYSKILLS_HOME", + "default_database_path", + "get_installation", + "get_invocation_context", + "list_installations", + "record_fetch", + "reset_cache_for_tests", + "set_invocation_context", +] diff --git a/polyskills/database/engine.py b/polyskills/database/engine.py new file mode 100644 index 0000000..b30abcc --- /dev/null +++ b/polyskills/database/engine.py @@ -0,0 +1,169 @@ +# -*- encoding: utf-8 -*- + +""" +Engine And Schema Bootstrap For The Polyskills Tracking Database +================================================================ + +Owns the construction of the SQLAlchemy :class:`~sqlalchemy.engine.Engine` +bound to the SQLite tracking database and the idempotent +materialisation of the schema defined in +:mod:`polyskills.database.models`. + +The module centralises three SQLite-specific concerns that the rest of +the package should not have to know about: + + * the operating-system independent ``sqlite:///`` URL is built from + a :class:`pathlib.Path` via its POSIX representation so a Windows + drive-letter path does not break the URL grammar; + * the parent directory of the database file is created on demand so + a first-ever write never fails on a missing ``~/.polyskills``; + * connection-level ``PRAGMA`` statements (foreign-key enforcement, + write-ahead logging, a busy timeout) are applied to every pooled + connection through a ``connect`` event listener. + +:NOTE: This module imports SQLAlchemy unconditionally and is therefore + only imported lazily by :mod:`polyskills.database.tracker` once the + :data:`polyskills.database.SQLALCHEMY_AVAILABLE` guard has passed. +""" + +from pathlib import Path + +from typing import Optional, Tuple + +from sqlalchemy import create_engine, event +from sqlalchemy.engine import Engine +from sqlalchemy.orm import sessionmaker + +from polyskills.database.models import Base +from polyskills.database.paths import default_database_path + +# ? connection-level pragmas applied to every SQLite connection handed +# ? out of the pool; foreign keys must be enabled explicitly on SQLite +# ? for the ``ON DELETE CASCADE`` on fetch_events to take effect. +_SQLITE_PRAGMAS : Tuple[str, ...] = ( + "PRAGMA foreign_keys = ON", + "PRAGMA journal_mode = WAL", + "PRAGMA synchronous = NORMAL", + "PRAGMA busy_timeout = 5000", +) + + +def _apply_sqlite_pragmas(dbapi_connection, connection_record) -> None: + """ + Apply the package :data:`_SQLITE_PRAGMAS` to a freshly opened + DB-API connection. + + The function is registered as a ``connect`` event listener by + :func:`create_database_engine` so the pragmas are re-applied to + every connection the engine pool produces, not merely the first. + + :type dbapi_connection: sqlite3.Connection + :param dbapi_connection: The raw DB-API connection SQLAlchemy has + just opened. + + :type connection_record: sqlalchemy.pool._ConnectionRecord + :param connection_record: The pool bookkeeping record for the + connection; unused but required by the listener signature. + + :rtype: None + :returns: Nothing - the pragmas are applied as a side effect on + ``dbapi_connection``. + """ + + cursor = dbapi_connection.cursor() + try: + for pragma in _SQLITE_PRAGMAS: + cursor.execute(pragma) + finally: + cursor.close() + + +def create_database_engine( + database_path : Optional[Path] = None +) -> Engine: + """ + Build a SQLAlchemy :class:`~sqlalchemy.engine.Engine` bound to the + SQLite tracking database. + + The parent directory of the database file is created on demand so + the very first write does not fail on a missing ``~/.polyskills`` + directory. A ``connect`` event listener is attached so every + connection produced by the engine pool has the package pragmas + (foreign keys, WAL, busy timeout) applied. + + :type database_path: Optional[pathlib.Path] + :param database_path: Explicit database location. When omitted the + canonical :func:`polyskills.database.paths.default_database_path` + location (``~/.polyskills/records.db``) is used. + + :rtype: sqlalchemy.engine.Engine + :returns: A configured engine ready for schema materialisation and + session binding. The engine is created lazily and opens no + connection until first use. + """ + + path = database_path if database_path is not None \ + else default_database_path() + path.parent.mkdir(parents = True, exist_ok = True) + + engine = create_engine( + f"sqlite:///{path.as_posix()}", future = True + ) + event.listen(engine, "connect", _apply_sqlite_pragmas) + + return engine + + +def init_schema(engine : Engine) -> None: + """ + Materialise every table of the tracking schema on ``engine`` if it + is not already present. + + The call delegates to :meth:`sqlalchemy.schema.MetaData.create_all` + which issues ``CREATE TABLE IF NOT EXISTS`` for every model, so the + function is safe to call on every process start. + + :type engine: sqlalchemy.engine.Engine + :param engine: The engine whose bound database should receive the + schema. + + :rtype: None + :returns: Nothing - the schema is created as a side effect on the + database bound to ``engine``. + """ + + Base.metadata.create_all(engine) + + +def build_session_factory( + database_path : Optional[Path] = None +) -> Tuple[Engine, sessionmaker]: + """ + Construct an engine, materialise the schema, and return both the + engine and a bound :class:`~sqlalchemy.orm.sessionmaker`. + + This is the single entry point callers need: it composes + :func:`create_database_engine` and :func:`init_schema` so a caller + can open a ready-to-use :class:`~sqlalchemy.orm.Session` without + repeating the bootstrap. The engine is returned alongside the + factory so the caller may dispose of it explicitly when finished. + + :type database_path: Optional[pathlib.Path] + :param database_path: Explicit database location, forwarded to + :func:`create_database_engine`. Defaults to the canonical + ``~/.polyskills/records.db`` location when omitted. + + :rtype: Tuple[sqlalchemy.engine.Engine, sqlalchemy.orm.sessionmaker] + :returns: A two-tuple ``(engine, session_factory)`` where the + factory produces :class:`~sqlalchemy.orm.Session` objects bound + to the engine. + """ + + engine = create_database_engine(database_path) + init_schema(engine) + + factory = sessionmaker( + bind = engine, future = True, expire_on_commit = False + ) + + return engine, factory diff --git a/polyskills/database/models.py b/polyskills/database/models.py new file mode 100644 index 0000000..21f0e16 --- /dev/null +++ b/polyskills/database/models.py @@ -0,0 +1,378 @@ +# -*- encoding: utf-8 -*- + +""" +SQLAlchemy ORM Models For The Polyskills Tracking Database +========================================================== + +Defines the strictly normalised relational schema that backs the +``~/.polyskills/records.db`` tracking database. The schema is modelled +with the SQLAlchemy 2.0 typed declarative mapping (``Mapped`` / +``mapped_column``) and is decomposed to third normal form so that no +recorded fact is duplicated across rows: + + ``sources`` -> ``remotes`` -> ``extensions`` -> ``installations`` + -> ``fetch_events`` <- ``environments`` + ``libraries`` -> ``extensions`` + +The seven tables are: + + * :class:`Source` - lookup of supported remote providers, one + row per :class:`polyskills.remote.sources.ValidSources` member. + * :class:`Library` - lookup of extension kinds (``skills``, + ``agents``, ``commands``, ``hooks``). + * :class:`Remote` - one row per ``(provider, owner, repo)``. + * :class:`Extension` - one row per logical extension on a remote, + keyed by ``(remote, library, name, source_dir)``. + * :class:`Installation`- one row per physical install location on + this machine, keyed by the absolute destination path. + * :class:`Environment` - lookup of the host environment a fetch ran + in, keyed by ``(hostname, platform, python, polyskills)``. + * :class:`FetchEvent` - append-only audit row, one per fetch + attempt (success or failure), carrying the resolved commit SHA, + timestamp, and on-disk size of that attempt. + +Derived facts are never stored: the "first fetched" and "last +updated" timestamps and the "current commit SHA" of an installation +are computed from :class:`FetchEvent` rows via aggregation, keeping +the schema in third normal form. + +:NOTE: This module imports SQLAlchemy unconditionally. It is only + imported lazily by :mod:`polyskills.database.tracker` after the + :data:`polyskills.database.SQLALCHEMY_AVAILABLE` guard has + confirmed the dependency is importable. +""" + +from datetime import datetime, timezone + +from typing import List, Optional + +from sqlalchemy import ( + DateTime, ForeignKey, Integer, String, Text, UniqueConstraint +) +from sqlalchemy.orm import ( + DeclarativeBase, Mapped, mapped_column, relationship +) + + +def _utc_now() -> datetime: + """ + Return the current UTC time as a naive :class:`datetime.datetime`. + + The value is the current instant in UTC with its ``tzinfo`` removed. + SQLite has no timezone-aware storage, so a stored timestamp always + reads back naive; producing a naive UTC value here keeps a freshly + recorded event and a reloaded event directly comparable and + sortable. Every audit timestamp in the database is therefore naive + UTC by convention. + + :rtype: datetime.datetime + :returns: The current UTC instant as a naive ``datetime``. + """ + + return datetime.now(timezone.utc).replace(tzinfo = None) + + +class Base(DeclarativeBase): + """ + Declarative base shared by every model in the tracking schema. + + A dedicated subclass of :class:`sqlalchemy.orm.DeclarativeBase` + is used (rather than the legacy ``declarative_base()`` factory) + so the schema participates in SQLAlchemy 2.0 typed mappings and + its :attr:`metadata` can be materialised in one call by + :func:`polyskills.database.engine.init_schema`. + """ + + +class Source(Base): + """ + Lookup table of supported remote providers. + + Each row corresponds to one + :class:`polyskills.remote.sources.ValidSources` member (for + example ``GITHUB``) and exists so that :class:`Remote` rows + reference a provider by foreign key rather than repeating the + provider name and base URL on every remote. + """ + + __tablename__ = "sources" + + id : Mapped[int] = mapped_column(primary_key = True) + name : Mapped[str] = mapped_column( + String(64), unique = True, nullable = False + ) + base_url : Mapped[str] = mapped_column(String(255), nullable = False) + + remotes : Mapped[List["Remote"]] = relationship( + back_populates = "source" + ) + + +class Library(Base): + """ + Lookup table of extension kinds managed by the framework. + + The supported values mirror the CLI library selector - + ``skills``, ``agents``, ``commands``, ``hooks`` - and are + referenced by :class:`Extension` rows through a foreign key so the + kind is never repeated as a free-text column. + """ + + __tablename__ = "libraries" + + id : Mapped[int] = mapped_column(primary_key = True) + name : Mapped[str] = mapped_column( + String(32), unique = True, nullable = False + ) + + extensions : Mapped[List["Extension"]] = relationship( + back_populates = "library" + ) + + +class Remote(Base): + """ + A single remote repository identified by its provider, owner, and + repository slug. + + The natural key ``(source_id, owner, repository)`` is enforced by + a unique constraint so re-fetching from the same repository reuses + the existing row instead of inserting a duplicate. + + :type source_id: int + :param source_id: Foreign key into :class:`Source`. + + :type owner: str + :param owner: Repository owner slug, for example ``PyUtility``. + + :type repository: str + :param repository: Repository name slug, for example + ``polyskills``. + + :type url: str + :param url: The canonical remote URL exactly as supplied on the + command line. + """ + + __tablename__ = "remotes" + __table_args__ = ( + UniqueConstraint( + "source_id", "owner", "repository", + name = "uq_remote_provider_slug" + ), + ) + + id : Mapped[int] = mapped_column(primary_key = True) + source_id : Mapped[int] = mapped_column( + ForeignKey("sources.id"), nullable = False, index = True + ) + owner : Mapped[str] = mapped_column(String(255), nullable = False) + repository : Mapped[str] = mapped_column(String(255), nullable = False) + url : Mapped[str] = mapped_column(Text, nullable = False) + + source : Mapped["Source"] = relationship(back_populates = "remotes") + extensions : Mapped[List["Extension"]] = relationship( + back_populates = "remote" + ) + + +class Extension(Base): + """ + A logical extension hosted on a remote repository. + + The natural key ``(remote_id, library_id, name, source_dir)`` + uniquely identifies an extension and is enforced by a unique + constraint. The same extension installed into several destinations + is represented by one :class:`Extension` row and many + :class:`Installation` rows. + + :type remote_id: int + :param remote_id: Foreign key into :class:`Remote`. + + :type library_id: int + :param library_id: Foreign key into :class:`Library`. + + :type name: str + :param name: Leaf directory name of the extension on the remote. + + :type source_dir: str + :param source_dir: POSIX-style source directory on the remote the + extension was fetched from. + """ + + __tablename__ = "extensions" + __table_args__ = ( + UniqueConstraint( + "remote_id", "library_id", "name", "source_dir", + name = "uq_extension_natural_key" + ), + ) + + id : Mapped[int] = mapped_column(primary_key = True) + remote_id : Mapped[int] = mapped_column( + ForeignKey("remotes.id"), nullable = False, index = True + ) + library_id : Mapped[int] = mapped_column( + ForeignKey("libraries.id"), nullable = False, index = True + ) + name : Mapped[str] = mapped_column( + String(255), nullable = False, index = True + ) + source_dir : Mapped[str] = mapped_column(String(512), nullable = False) + + remote : Mapped["Remote"] = relationship( + back_populates = "extensions" + ) + library : Mapped["Library"] = relationship( + back_populates = "extensions" + ) + installations : Mapped[List["Installation"]] = relationship( + back_populates = "extension" + ) + + +class Installation(Base): + """ + A physical installation of an extension at one absolute path on + the local filesystem. + + The natural key is the pair ``(extension_id, install_path)``: the + same extension re-fetched into the same directory reuses its + installation row (appending a new :class:`FetchEvent`), while a + different extension fetched into the same path gets its own + installation so the two histories are never conflated. + + :type extension_id: int + :param extension_id: Foreign key into :class:`Extension`. + + :type install_path: str + :param install_path: Absolute, resolved destination directory on + the local filesystem. + """ + + __tablename__ = "installations" + __table_args__ = ( + UniqueConstraint( + "extension_id", "install_path", + name = "uq_installation_extension_path" + ), + ) + + id : Mapped[int] = mapped_column(primary_key = True) + extension_id : Mapped[int] = mapped_column( + ForeignKey("extensions.id"), nullable = False, index = True + ) + install_path : Mapped[str] = mapped_column( + Text, nullable = False, index = True + ) + + extension : Mapped["Extension"] = relationship( + back_populates = "installations" + ) + events : Mapped[List["FetchEvent"]] = relationship( + back_populates = "installation", + cascade = "all, delete-orphan" + ) + + +class Environment(Base): + """ + Lookup table of host environments a fetch was executed in. + + Recording the environment in its own table - keyed by + ``(hostname, platform, python_version, polyskills_version)`` - + keeps these repeating strings out of the high-cardinality + :class:`FetchEvent` table while still allowing every event to be + attributed to the exact environment that produced it. + """ + + __tablename__ = "environments" + __table_args__ = ( + UniqueConstraint( + "hostname", "platform", "python_version", + "polyskills_version", name = "uq_environment_identity" + ), + ) + + id : Mapped[int] = mapped_column(primary_key = True) + hostname : Mapped[Optional[str]] = mapped_column(String(255)) + platform : Mapped[Optional[str]] = mapped_column(String(255)) + python_version : Mapped[Optional[str]] = mapped_column(String(64)) + polyskills_version : Mapped[Optional[str]] = mapped_column(String(64)) + + events : Mapped[List["FetchEvent"]] = relationship( + back_populates = "environment" + ) + + +class FetchEvent(Base): + """ + An append-only audit record of a single extension fetch attempt. + + One row is written per attempt - whether it succeeds or fails - so + the full history of an installation is preserved. The first-fetched + timestamp, the last-updated timestamp, and the current commit SHA + of an installation are all derived from these rows by aggregation + rather than being cached on :class:`Installation`. + + :type installation_id: int + :param installation_id: Foreign key into :class:`Installation`. + + :type environment_id: Optional[int] + :param environment_id: Foreign key into :class:`Environment`, or + ``None`` when the environment could not be captured. + + :type action: str + :param action: Concrete action taken - ``fetch``, ``overwrite``, + or ``merge`` - derived from the ``--exists`` policy. + + :type requested_version: str + :param requested_version: Tag, branch, or SHA requested by the + caller, for example ``master``. + + :type resolved_commit_sha: Optional[str] + :param resolved_commit_sha: The concrete commit SHA the requested + version resolved to, when it could be determined. + + :type status: str + :param status: Either ``success`` or ``failed``. + """ + + __tablename__ = "fetch_events" + + id : Mapped[int] = mapped_column(primary_key = True) + installation_id : Mapped[int] = mapped_column( + ForeignKey("installations.id", ondelete = "CASCADE"), + nullable = False, index = True + ) + environment_id : Mapped[Optional[int]] = mapped_column( + ForeignKey("environments.id"), nullable = True, index = True + ) + + action : Mapped[str] = mapped_column( + String(32), nullable = False + ) + requested_version : Mapped[str] = mapped_column( + String(255), nullable = False + ) + resolved_commit_sha : Mapped[Optional[str]] = mapped_column(String(64)) + status : Mapped[str] = mapped_column( + String(16), nullable = False, index = True + ) + error : Mapped[Optional[str]] = mapped_column(Text) + file_count : Mapped[Optional[int]] = mapped_column(Integer) + total_bytes : Mapped[Optional[int]] = mapped_column(Integer) + duration_ms : Mapped[Optional[int]] = mapped_column(Integer) + invoked_via : Mapped[str] = mapped_column( + String(16), nullable = False, default = "api" + ) + occurred_at : Mapped[datetime] = mapped_column( + DateTime, nullable = False, default = _utc_now, index = True + ) + + installation : Mapped["Installation"] = relationship( + back_populates = "events" + ) + environment : Mapped[Optional["Environment"]] = relationship( + back_populates = "events" + ) diff --git a/polyskills/database/paths.py b/polyskills/database/paths.py new file mode 100644 index 0000000..ac958b9 --- /dev/null +++ b/polyskills/database/paths.py @@ -0,0 +1,87 @@ +# -*- encoding: utf-8 -*- + +""" +Filesystem Path Resolver For The Polyskills Tracking Database +============================================================= + +Resolves the canonical on-disk location of the polyskills tracking +database. By design the database lives at a single, predictable path +under the user home directory so every installation of the framework - +regardless of the operating system - writes to the same source of +truth without the user having to learn an environment variable: + + ``~/.polyskills/records.db`` + +The location is derived from :meth:`pathlib.Path.home` which is fully +operating-system independent (it honours ``%USERPROFILE%`` on Windows +and ``$HOME`` on POSIX systems), so no platform branching is required. + +An optional ``POLYSKILLS_DB_PATH`` environment variable overrides the +default location. The override is primarily a testing and power-user +affordance; the value is expanded for both ``$VAR`` / ``%VAR%`` and +``~`` tokens to mirror the path-expansion contract used elsewhere in +the command-line interface. + +:NOTE: Path resolution is intentionally side-effect free. The parent + directory is created by the engine layer at connection time, not + here, so a read-only query never materialises an empty + ``~/.polyskills`` directory. +""" + +import os + +from pathlib import Path + +# ? canonical on-disk layout for the polyskills tracking database; the +# ? parent directory may also host future bookkeeping artefacts (for +# ? example lock files) so it is exposed as an importable constant. +POLYSKILLS_HOME : Path = Path.home() / ".polyskills" +DATABASE_FILENAME : str = "records.db" + +# ? optional override consulted before the default location is used; +# ? primarily exercised by the test-suite to redirect writes onto an +# ? isolated temporary file instead of the real user database. +ENVIRON_DB_PATH : str = "POLYSKILLS_DB_PATH" + + +def default_database_path() -> Path: + """ + Return the absolute path to the polyskills tracking database. + + The default location is ``~/.polyskills/records.db`` resolved + through :meth:`pathlib.Path.home` for operating-system + independence. When the ``POLYSKILLS_DB_PATH`` environment variable + is set its value takes precedence and is expanded for environment + variables and the ``~`` home token before being returned. + + The function performs no filesystem mutation: it neither creates + the parent directory nor the database file. Materialisation is the + responsibility of the engine layer so that read-only callers (for + example the ``polyskills records`` command) can resolve the path + without creating an empty ``~/.polyskills`` directory as a side + effect. + + **Example Usage** + + Resolve the default database path on any platform: + + .. code-block:: python + + from polyskills.database.paths import default_database_path + + print(default_database_path()) + >> /home/user/.polyskills/records.db # POSIX + >> C:\\Users\\user\\.polyskills\\records.db # Windows + + :rtype: pathlib.Path + :returns: Absolute path to the tracking database, either the + ``POLYSKILLS_DB_PATH`` override (when set and non-empty) or the + canonical ``~/.polyskills/records.db`` location. + """ + + override = os.environ.get(ENVIRON_DB_PATH) + if override: + expanded = os.path.expandvars(override) + return Path(expanded).expanduser().resolve() + + return (POLYSKILLS_HOME / DATABASE_FILENAME).resolve() diff --git a/polyskills/database/tracker.py b/polyskills/database/tracker.py new file mode 100644 index 0000000..5a11898 --- /dev/null +++ b/polyskills/database/tracker.py @@ -0,0 +1,594 @@ +# -*- encoding: utf-8 -*- + +""" +Best-Effort Tracking Façade For The Polyskills Database +======================================================= + +Provides the high-level, best-effort API the rest of the framework +uses to record extension fetches into and read them back from the +SQLite tracking database. The module is the only public entry point +for tracking; callers never touch :mod:`polyskills.database.models` +or :mod:`polyskills.database.engine` directly. + +Two responsibilities live here: + + * **writing** - :func:`record_fetch` upserts the normalised + ``source`` / ``remote`` / ``extension`` / ``installation`` / + ``environment`` rows for one fetch attempt and appends a single + :class:`~polyskills.database.models.FetchEvent`. + * **reading** - :func:`list_installations` and + :func:`get_installation` reconstruct the derived facts + (first-fetched, last-updated, current commit SHA) from the audit + log so the ``polyskills records`` command can present them. + +Every public function is engineered to be a no-op on failure: a +missing SQLAlchemy dependency, a read-only home directory, a locked +database, or any other fault is swallowed and surfaced as a single +per-process :class:`RuntimeWarning`. The install path of the framework +is therefore never broken by a tracking fault. + +:NOTE: SQLAlchemy and the sibling :mod:`~polyskills.database.engine` + and :mod:`~polyskills.database.models` modules are imported lazily + inside the functions below so that merely importing this module + (for example from :mod:`polyskills.cli`) never requires the + dependency to be installed. +""" + +import platform as _platform +import socket +import warnings + +from pathlib import Path + +from typing import Any, Dict, List, Optional, Tuple + +from polyskills.database import SQLALCHEMY_AVAILABLE +from polyskills.database.paths import default_database_path + +# ? process-wide cache of (engine, session-factory) keyed by the +# ? resolved database path so repeated calls in one process reuse a +# ? single connection pool instead of rebuilding the engine each time. +_FACTORY_CACHE : Dict[str, Tuple[Any, Any]] = {} + +# ? single per-process latch so a broken database warns exactly once +# ? rather than flooding stderr on every fetch in a batch run. +_WARNED : bool = False + +# ? invocation context persisted into ``fetch_events.invoked_via`` so +# ? downstream analytics can split CLI usage from direct API usage. +_INVOCATION_CONTEXT : str = "api" +_VALID_CONTEXTS : Tuple[str, ...] = ("cli", "api") + + +def set_invocation_context(context : str) -> None: + """ + Record the invocation context stamped onto subsequent fetch + events. + + :type context: str + :param context: One of ``"cli"`` or ``"api"``. Any other value is + silently coerced to ``"api"`` so a caller typo never breaks + the tracker. + + :rtype: None + :returns: Nothing - the context is stored in a module-level + variable consulted by :func:`record_fetch`. + """ + + global _INVOCATION_CONTEXT + _INVOCATION_CONTEXT = ( + context if context in _VALID_CONTEXTS else "api" + ) + + +def get_invocation_context() -> str: + """ + Return the invocation context recorded by + :func:`set_invocation_context`. + + :rtype: str + :returns: Either ``"cli"`` or ``"api"``. + """ + + return _INVOCATION_CONTEXT + + +def _warn_once(message : str) -> None: + """ + Emit a single :func:`warnings.warn` per process. + + :type message: str + :param message: The human-readable warning text. + + :rtype: None + :returns: Nothing - the warning is emitted as a side effect at + most once per process lifetime. + """ + + global _WARNED + if _WARNED: + return None + _WARNED = True + warnings.warn(message, RuntimeWarning, stacklevel = 3) + + +def _session_factory(database_path : Optional[Path]) -> Optional[Any]: + """ + Return a cached :class:`~sqlalchemy.orm.sessionmaker` for + ``database_path``, building and bootstrapping the engine on first + use. + + The function is the single guarded gateway to the persistence + layer: it short-circuits when SQLAlchemy is unavailable and + swallows any engine-construction error, returning ``None`` so + every caller degrades to a no-op. + + :type database_path: Optional[pathlib.Path] + :param database_path: Explicit database location, or ``None`` for + the canonical ``~/.polyskills/records.db``. + + :rtype: Optional[sqlalchemy.orm.sessionmaker] + :returns: A bound session factory, or ``None`` when tracking is + unavailable or the engine could not be built. + """ + + if not SQLALCHEMY_AVAILABLE: + _warn_once( + "polyskills tracking disabled: SQLAlchemy is not installed" + ) + return None + + resolved = database_path or default_database_path() + key = str(resolved) + + cached = _FACTORY_CACHE.get(key) + if cached is not None: + return cached[1] + + try: + from polyskills.database.engine import build_session_factory + engine, factory = build_session_factory(resolved) + except Exception as exc: # pragma: no cover - defensive guard + _warn_once(f"polyskills tracking disabled (init failed): {exc!r}") + return None + + _FACTORY_CACHE[key] = (engine, factory) + return factory + + +def _get_or_create( + session : Any, model : Any, + defaults : Optional[Dict[str, Any]] = None, **keys : Any +) -> Any: + """ + Fetch the row of ``model`` matching ``keys`` or create it. + + The lookup uses only the natural-key columns in ``keys``; any + ``defaults`` are applied solely on creation so a re-fetch never + rewrites immutable columns. The new row is flushed so its primary + key is available to the caller immediately. + + :type session: sqlalchemy.orm.Session + :param session: The active session the row is queried and added + through. + + :type model: type + :param model: The mapped model class to query or instantiate. + + :type defaults: Optional[Dict[str, Any]] + :param defaults: Extra column values applied only when a new row + is created. + + :rtype: object + :returns: The existing or newly created mapped instance. + """ + + from sqlalchemy.exc import IntegrityError + + instance = session.query(model).filter_by(**keys).one_or_none() + if instance is not None: + return instance + + params = dict(keys) + if defaults: + params.update(defaults) + + instance = model(**params) + try: + with session.begin_nested(): + session.add(instance) + session.flush() + except IntegrityError: + # ? a concurrent writer inserted the same natural key between the + # ? lookup and the flush; the SAVEPOINT rolls back only this + # ? insert, leaving the surrounding transaction intact, then the + # ? now-committed row is re-fetched. + instance = session.query(model).filter_by(**keys).one() + + return instance + + +def _resolve_environment(session : Any) -> Any: + """ + Return the :class:`~polyskills.database.models.Environment` row + describing the current host, creating it on first encounter. + + The environment identity is the tuple of hostname, platform + descriptor, Python version, and polyskills version, mirroring the + unique constraint on the table. + + :type session: sqlalchemy.orm.Session + :param session: The active session used for the get-or-create. + + :rtype: polyskills.database.models.Environment + :returns: The environment row matching the current host. + """ + + from polyskills.database.models import Environment + + try: + from polyskills import __version__ as polyskills_version + except Exception: # pragma: no cover - defensive guard + polyskills_version = "unknown" + + return _get_or_create( + session, Environment, + hostname = socket.gethostname(), + platform = _platform.platform(), + python_version = _platform.python_version(), + polyskills_version = polyskills_version, + ) + + +def record_fetch( + *, + source_name : str, + source_base_url : str, + owner : str, + repository : str, + remote_url : str, + library : str, + name : str, + source_dir : str, + install_path : str, + requested_version : str, + action : str, + status : str, + resolved_commit_sha : Optional[str] = None, + file_count : Optional[int] = None, + total_bytes : Optional[int] = None, + duration_ms : Optional[int] = None, + error : Optional[str] = None, + invoked_via : Optional[str] = None, + database_path : Optional[Path] = None, +) -> Optional[int]: + """ + Record one extension fetch attempt - success or failure - into the + tracking database. + + The call upserts the normalised provider, remote, extension, + installation, and environment rows and appends one + :class:`~polyskills.database.models.FetchEvent`. Every argument is + keyword-only so the long parameter list cannot be supplied + positionally by mistake. The whole operation is best-effort: any + failure is swallowed, warned once, and reported as a ``None`` + return so the caller's install path is never disrupted. + + :type source_name: str + :param source_name: Provider enumeration name, for example + ``"GITHUB"``. + + :type source_base_url: str + :param source_base_url: Provider base URL stored on first sight of + the provider. + + :type owner: str + :param owner: Repository owner slug. + + :type repository: str + :param repository: Repository name slug. + + :type remote_url: str + :param remote_url: Canonical remote URL as supplied by the caller. + + :type library: str + :param library: Extension kind - ``"skills"``, ``"agents"``, etc. + + :type name: str + :param name: Extension leaf name. + + :type source_dir: str + :param source_dir: POSIX source directory on the remote. + + :type install_path: str + :param install_path: Absolute, resolved destination directory. + + :type requested_version: str + :param requested_version: Requested tag, branch, or SHA. + + :type action: str + :param action: Concrete action - ``"fetch"``, ``"overwrite"``, or + ``"merge"``. + + :type status: str + :param status: ``"success"`` or ``"failed"``. + + **Keyword Arguments** + + The remaining keyword-only arguments enrich the event row and are + all optional, defaulting to ``None`` when the caller cannot + determine them. + + * **resolved_commit_sha** (*Optional[str]*): Concrete commit + SHA the requested version resolved to. + + * **file_count** (*Optional[int]*): Files written to the + destination on a successful fetch. + + * **total_bytes** (*Optional[int]*): Total on-disk size of the + written files. + + * **duration_ms** (*Optional[int]*): Wall-clock duration of + the attempt in milliseconds. + + * **error** (*Optional[str]*): Error description recorded on a + failed attempt. + + * **invoked_via** (*Optional[str]*): Override for the recorded + invocation context; defaults to the process context. + + * **database_path** (*Optional[pathlib.Path]*): Explicit + database location, primarily for tests. + + :rtype: Optional[int] + :returns: The primary key of the inserted ``fetch_events`` row, or + ``None`` when tracking was unavailable or the write failed. + """ + + factory = _session_factory(database_path) + if factory is None: + return None + + from polyskills.database.models import ( + Extension, FetchEvent, Installation, Library, Remote, Source + ) + + context = invoked_via if invoked_via in _VALID_CONTEXTS \ + else get_invocation_context() + + try: + with factory() as session: + source = _get_or_create( + session, Source, + defaults = {"base_url": source_base_url}, + name = source_name, + ) + lib = _get_or_create(session, Library, name = library) + remote = _get_or_create( + session, Remote, defaults = {"url": remote_url}, + source_id = source.id, owner = owner, + repository = repository, + ) + extension = _get_or_create( + session, Extension, + remote_id = remote.id, library_id = lib.id, + name = name, source_dir = source_dir, + ) + installation = _get_or_create( + session, Installation, + extension_id = extension.id, install_path = install_path, + ) + + environment = _resolve_environment(session) + + event = FetchEvent( + installation_id = installation.id, + environment_id = environment.id, + action = action, + requested_version = requested_version, + resolved_commit_sha = resolved_commit_sha, + status = status, + error = error, + file_count = file_count, + total_bytes = total_bytes, + duration_ms = duration_ms, + invoked_via = context, + ) + session.add(event) + session.commit() + return int(event.id) + except Exception as exc: + _warn_once(f"polyskills tracker record_fetch failed: {exc!r}") + return None + + +def _summarise_events(events : List[Any]) -> Dict[str, Any]: + """ + Reduce an installation's :class:`FetchEvent` rows to the derived + facts presented by the read API. + + :type events: List[polyskills.database.models.FetchEvent] + :param events: All events belonging to one installation, in any + order. + + :rtype: Dict[str, Any] + :returns: A mapping with the first-fetched and last-updated + timestamps (successful events only), the latest commit SHA and + requested version, the latest status, and the total attempt + count. + """ + + ordered = sorted(events, key = lambda event : event.occurred_at) + successes = [ + event for event in ordered if event.status == "success" + ] + latest = ordered[-1] if ordered else None + latest_success = successes[-1] if successes else None + + return { + "first_fetched_at": successes[0].occurred_at if successes else None, + "last_updated_at": latest_success.occurred_at + if latest_success else None, + "resolved_commit_sha": latest_success.resolved_commit_sha + if latest_success else None, + "requested_version": latest.requested_version if latest else None, + "last_status": latest.status if latest else None, + "fetch_count": len(ordered), + } + + +def list_installations( + database_path : Optional[Path] = None +) -> List[Dict[str, Any]]: + """ + Return one summary row per tracked installation. + + Each summary joins the installation to its extension, remote, and + library and folds the audit log into the derived first-fetched, + last-updated, and current-commit facts. The function is read-only + and best-effort: when tracking is unavailable, the database does + not yet exist, or a query fails, an empty list is returned. + + :type database_path: Optional[pathlib.Path] + :param database_path: Explicit database location, or ``None`` for + the canonical location. + + :rtype: List[Dict[str, Any]] + :returns: A list of summary mappings sorted by extension name then + install path, each describing one installation. + """ + + factory = _session_factory(database_path) + if factory is None: + return [] + + from polyskills.database.models import Installation + + try: + with factory() as session: + installations = session.query(Installation).all() + rows : List[Dict[str, Any]] = [] + for installation in installations: + extension = installation.extension + remote = extension.remote + summary = _summarise_events(installation.events) + summary.update({ + "name": extension.name, + "library": extension.library.name, + "owner": remote.owner, + "repository": remote.repository, + "remote_url": remote.url, + "source_dir": extension.source_dir, + "install_path": installation.install_path, + }) + rows.append(summary) + except Exception as exc: + _warn_once(f"polyskills tracker list_installations failed: {exc!r}") + return [] + + return sorted( + rows, key = lambda row : (row["name"], row["install_path"]) + ) + + +def get_installation( + name : str, database_path : Optional[Path] = None +) -> List[Dict[str, Any]]: + """ + Return the detailed records for every installation of the + extension called ``name``. + + The detail extends the :func:`list_installations` summary with the + full chronological event history of each installation so the + ``polyskills records --name`` view can show every fetch attempt. + + :type name: str + :param name: Extension leaf name to look up. + + :type database_path: Optional[pathlib.Path] + :param database_path: Explicit database location, or ``None`` for + the canonical location. + + :rtype: List[Dict[str, Any]] + :returns: A list of detail mappings (one per matching + installation), each carrying an ``events`` list of per-attempt + mappings. Empty when no match is found or tracking is + unavailable. + """ + + factory = _session_factory(database_path) + if factory is None: + return [] + + from polyskills.database.models import Extension, Installation + + try: + with factory() as session: + installations = ( + session.query(Installation) + .join(Extension) + .filter(Extension.name == name) + .all() + ) + details : List[Dict[str, Any]] = [] + for installation in installations: + extension = installation.extension + remote = extension.remote + summary = _summarise_events(installation.events) + summary.update({ + "name": extension.name, + "library": extension.library.name, + "owner": remote.owner, + "repository": remote.repository, + "remote_url": remote.url, + "source_dir": extension.source_dir, + "install_path": installation.install_path, + "events": [ + { + "occurred_at": event.occurred_at, + "action": event.action, + "status": event.status, + "requested_version": event.requested_version, + "resolved_commit_sha": + event.resolved_commit_sha, + "file_count": event.file_count, + "total_bytes": event.total_bytes, + "duration_ms": event.duration_ms, + "invoked_via": event.invoked_via, + "error": event.error, + } + for event in sorted( + installation.events, + key = lambda event : event.occurred_at + ) + ], + }) + details.append(summary) + except Exception as exc: + _warn_once(f"polyskills tracker get_installation failed: {exc!r}") + return [] + + return details + + +def reset_cache_for_tests() -> None: + """ + Dispose every cached engine and clear the factory cache. + + Used exclusively by the test-suite between cases that point the + tracker at different temporary databases so a stale engine is + never reused across an isolated fixture. + + :rtype: None + :returns: Nothing - the module-level cache is emptied as a side + effect. + """ + + global _WARNED + for engine, _factory in _FACTORY_CACHE.values(): + try: + engine.dispose() + except Exception: # nosec B110 # pragma: no cover - best-effort + pass + _FACTORY_CACHE.clear() + _WARNED = False diff --git a/polyskills/error/__init__.py b/polyskills/error/__init__.py index 3e0f8f8..ac05371 100644 --- a/polyskills/error/__init__.py +++ b/polyskills/error/__init__.py @@ -10,5 +10,15 @@ from polyskills.error import warnings from polyskills.error import exceptions +from polyskills.error.exceptions import ( + PolyskillsError, + ValidationError, + RemoteError, + ExtractionError, +) -__all__ = ["exceptions", "warnings"] +__all__ = [ + "exceptions", "warnings", + "PolyskillsError", "ValidationError", + "RemoteError", "ExtractionError", +] diff --git a/polyskills/error/exceptions.py b/polyskills/error/exceptions.py index e69de29..4205f44 100644 --- a/polyskills/error/exceptions.py +++ b/polyskills/error/exceptions.py @@ -0,0 +1,49 @@ +# -*- encoding: utf-8 -*- + +""" +Custom Exception Hierarchy for Polyskills +----------------------------------------- + +A small, explicit exception hierarchy rooted at :class:`PolyskillsError` +so callers can distinguish the failure modes of the remote pipeline - +input validation, remote transport, and archive extraction - from +unrelated runtime errors, while a single ``except PolyskillsError`` still +catches every domain failure raised by the package. + +:NOTE: :class:`ValidationError` also derives from the builtin + :class:`ValueError` to preserve backward compatibility with callers + and tests that catch ``ValueError`` for an invalid input. +""" + + +class PolyskillsError(Exception): + """ + Base class for every :mod:`polyskills` domain error. + + Catch this to handle any failure raised deliberately by the package + while letting genuinely unexpected exceptions propagate untouched. + """ + + +class ValidationError(PolyskillsError, ValueError): + """ + Raised when a user-supplied argument fails validation - for example + an unknown ``mode``, a missing required value, an unsupported remote + URL, or a conflicting TLS option. Subclasses :class:`ValueError` so + that existing ``except ValueError`` handlers keep working. + """ + + +class RemoteError(PolyskillsError): + """ + Raised when a remote source behaves unexpectedly, such as a tag + pagination chain that exceeds the configured page safety limit. + """ + + +class ExtractionError(PolyskillsError): + """ + Raised when a downloaded archive violates a safety constraint - a + member escaping the destination directory (path traversal), or the + download / extraction exceeding its configured size limit. + """ diff --git a/polyskills/remote/sources.py b/polyskills/remote/sources.py index b3493a2..8b2af6a 100644 --- a/polyskills/remote/sources.py +++ b/polyskills/remote/sources.py @@ -30,8 +30,32 @@ from enum import Enum from pathlib import Path +from urllib.parse import quote from dataclasses import dataclass -from typing import Any, Callable, Dict, List, Optional, Tuple +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + +from polyskills import __version__ +from polyskills.error.exceptions import ( + ValidationError, RemoteError, ExtractionError +) + +# ! security limits guarding the remote download / extraction pipeline +# ? the compressed download is capped to bound disk usage, and the +# ? cumulative uncompressed size is capped to defang gzip / tar bombs +_CHUNK_SIZE : int = 1 << 14 +_MAX_ARCHIVE_BYTES : int = 100 * (1 << 20) +_MAX_EXTRACT_BYTES : int = 500 * (1 << 20) + +# ? upper bound on REST pagination so a misbehaving / hostile server +# ? cannot keep the client looping over an endless `next` link chain +_MAX_PAGES : int = 100 + +# ? request timeouts (seconds) and the User-Agent advertised to the +# ? remote REST API so the client is identifiable in the server logs +_REQUEST_TIMEOUT : int = 30 +_DOWNLOAD_TIMEOUT : int = 60 +_USER_AGENT : str = f"polyskills/{__version__}" + class ValidSources(Enum): GITHUB = "https://www.github.com/" @@ -54,6 +78,14 @@ class SourceControl: :param token: A authorization token to access the remote URL, this value is discouraged to be used in a production system. + :type verify: Union[bool, str] + :param verify: TLS verification forwarded verbatim to every + :func:`requests.get` call. ``True`` (default) uses the standard + trust store and honours the ``REQUESTS_CA_BUNDLE`` / + ``SSL_CERT_FILE`` environment overrides, a string pins a CA + bundle path, and ``False`` disables verification entirely + (discouraged, exposed via the ``--no-verify`` CLI flag). + Each parameter should be available as a CLI command for dynamic control which sets the data class defaults during runtime. """ @@ -63,6 +95,10 @@ class SourceControl: # ? Token Attribute: Useful only in testing environment token : Optional[str] = None + # ? TLS verification policy forwarded to every remote request; + # ? defaults to secure verification, see the param docstring above + verify : Union[bool, str] = True + class SourceManager(abc.ABC): """ @@ -120,14 +156,14 @@ def getSlug(self, remote : str) -> Tuple[str, str]: )) >> ...api.github.com/repos/PyUtility/polyskills/tags?... - :raises ValueError: If the remote URL does not match the + :raises ValidationError: If the remote URL does not match the regular expression pattern supported by the module. """ matches = self.remotePattern.match(remote) if not matches: - raise ValueError( + raise ValidationError( f"Not a Valid URL: {remote}, or Not Supported." ) @@ -213,8 +249,11 @@ def get(self, remote : str, mode : str = "tags", **kwargs) -> Any: defaults to ``master`` that is the latest content from the repository. (TODO) - - **formatter** (*Callable*) - Formatter method based - on the final LLM tool. (TODO) + - **formatter** (*Callable*) - Reserved hook for the + planned skill-to-prompt conversion used by non + Agent-Skills tools. It is forwarded end-to-end but + is a no-op today (never invoked); kept so the public + contract is stable when the feature lands. """ prefix : Optional[str] = kwargs.get("prefix", None) @@ -242,19 +281,28 @@ def get(self, remote : str, mode : str = "tags", **kwargs) -> Any: # always use lower case naming for modes; in-built control mode = mode.lower() supported = ["tags", "extensions", "list"] - assert mode in supported, \ - f"Mode = `{mode}` is not in {supported}" + if mode not in supported: + raise ValidationError( + f"Mode = `{mode}` is not in {supported}." + ) # ! validate that positional required arguments are available # only enforced for the ``extensions`` mode where they are used + # ! raised explicitly (not ``assert``) so the checks survive a + # ! ``python -O`` optimised run where assertions are stripped if mode == "extensions": - assert name is not None, "Extension name cannot be null." - assert library is not None, \ - f"Library cannot be null, supported are {supported}" + if name is None: + raise ValidationError("Extension name cannot be null.") + if library is None: + raise ValidationError( + f"Library cannot be null, supported are {supported}." + ) if mode == "list": - assert library is not None, \ - f"Library cannot be null, supported are {supported}" + if library is None: + raise ValidationError( + f"Library cannot be null, supported are {supported}." + ) # lazy dispatch: only the requested branch is invoked methods : Dict[str, Callable[[], Any]] = dict( @@ -271,6 +319,34 @@ def get(self, remote : str, mode : str = "tags", **kwargs) -> Any: return methods[mode]() # return the underlying assets + def resolveCommit( + self, remote : str, version : str = "master", **kwargs + ) -> Optional[str]: + """ + Resolve ``version`` (a tag, branch, or SHA) to the concrete + commit SHA it points at on the remote. + + The base implementation returns ``None``; providers that can + cheaply resolve a reference to a commit SHA override this so + the tracking database can record the exact commit a fetch + landed on. Returning ``None`` is always acceptable and simply + leaves the recorded SHA empty - resolution is never required + for a fetch to succeed. + + :type remote: str + :param remote: Remote URL forwarded to the provider. + + :type version: str + :param version: The tag, branch, or SHA to resolve. + + :rtype: Optional[str] + :returns: The resolved commit SHA, or ``None`` when it cannot + be determined. + """ + + return None + + @abc.abstractmethod def _getTags( self, remote : str, prefix : Optional[str] = None @@ -403,6 +479,7 @@ def headers(self) -> Dict[str, str]: headers = { "accept" : "application/vnd.github+json", "X-GitHub-Api-Version" : "2022-11-28", + "user-agent" : _USER_AGENT, } if self._token: @@ -424,7 +501,7 @@ def _getTags( get tags hosted in ``skillName@vX.Y.Z`` format. """ - timeout = kwargs.get("timeout", 30) + timeout = kwargs.get("timeout", _REQUEST_TIMEOUT) owner, repository = self.getSlug(remote = remote) remote_url = self.remoteAPI.format( owner = owner, repository = repository, @@ -433,9 +510,17 @@ def _getTags( tags : List[str] = [] + pages = 0 while remote_url: + if pages >= _MAX_PAGES: + raise RemoteError( + f"Tag pagination exceeded the {_MAX_PAGES} page " + "safety limit." + ) + pages += 1 + response = requests.get( - remote_url, verify = False, + remote_url, verify = self.control.verify, headers = self.headers, timeout = timeout ) response.raise_for_status() @@ -446,10 +531,65 @@ def _getTags( tags.append(item["name"]) remote_url = response.links.get("next", {}).get("url") - + return tags + def resolveCommit( + self, remote : str, version : str = "master", **kwargs + ) -> Optional[str]: + """ + Resolve ``version`` to a commit SHA using the GitHub commits + REST endpoint ``GET /repos/{owner}/{repository}/commits/{ref}``. + + The request sets the ``Accept: application/vnd.github.sha`` + media type so the endpoint returns the 40-character SHA as a + plain-text body instead of the full commit payload, keeping + the lookup cheap. TLS verification follows the configured + :attr:`SourceControl.verify` policy, identical to every other + remote request, so it is never silently disabled. The call is + best-effort: any transport or HTTP error yields ``None`` so a + tracking lookup never raises into the fetch path. + + :type remote: str + :param remote: GitHub remote URL. + + :type version: str + :param version: Tag, branch, or SHA to resolve, defaults to + ``master``. + + :rtype: Optional[str] + :returns: The resolved commit SHA, or ``None`` on any failure. + """ + + timeout = kwargs.get("timeout", 30) + + try: + owner, repository = self.getSlug(remote = remote) + except ValueError: + return None + + uri = ( + "https://api.github.com/repos/" + "{owner}/{repository}/commits/{ref}" + ).format(owner = owner, repository = repository, ref = version) + + headers = dict(self.headers) + headers["accept"] = "application/vnd.github.sha" + + try: + response = requests.get( + uri, verify = self.control.verify, + headers = headers, timeout = timeout + ) + response.raise_for_status() + except requests.RequestException: + return None + + sha = response.text.strip() + return sha or None + + def _getExtensions( self, remote : str, name : str, source : Path, destination : Path, version : str = "master", @@ -469,14 +609,21 @@ def _getExtensions( to ``fail``. """ - assert exists in ("fail", "overwrite", "merge"), \ - f"exists = `{exists}` not in ['fail', 'overwrite', 'merge']" + if exists not in ("fail", "overwrite", "merge"): + raise ValidationError( + f"exists = `{exists}` not in " + "['fail', 'overwrite', 'merge']." + ) owner, repository = self.getSlug(remote = remote) uri = ( "https://api.github.com/repos/" "{owner}/{repository}/tarball/{tag}" - ).format(owner = owner, repository = repository, tag = version) + ).format( + owner = quote(owner, safe = ""), + repository = quote(repository, safe = ""), + tag = quote(str(version), safe = "/") + ) destination = Path(destination) if destination.exists() and destination.is_dir() \ @@ -503,12 +650,23 @@ def _getExtensions( try: with requests.get( uri, headers = self.headers, stream = True, - verify = False, timeout = 60, allow_redirects = True + verify = self.control.verify, + timeout = _DOWNLOAD_TIMEOUT, allow_redirects = True ) as response: response.raise_for_status() - for chunk in response.iter_content(chunk_size = 1 << 14): - if chunk: - handle.write(chunk) + downloaded = 0 + for chunk in response.iter_content( + chunk_size = _CHUNK_SIZE + ): + if not chunk: + continue + downloaded += len(chunk) + if downloaded > _MAX_ARCHIVE_BYTES: + raise ExtractionError( + "Download exceeds the safety limit of " + f"{_MAX_ARCHIVE_BYTES} bytes." + ) + handle.write(chunk) handle.flush() handle.close() @@ -522,7 +680,12 @@ def _getExtensions( root = members[0].name.split("/", 1)[0] prefix = f"{root}/{target}/" + # ! resolve the destination once so every member can + # ! be verified to stay inside this subtree below + safe_root = destination.resolve() + found = False + extracted_bytes = 0 for member in members: if not member.name.startswith(prefix): continue @@ -532,9 +695,26 @@ def _getExtensions( continue out_path = destination / relative + + # ! defend against path traversal - a crafted + # ! member may carry '..' segments that escape + resolved = (safe_root / relative).resolve() + if resolved != safe_root \ + and safe_root not in resolved.parents: + raise ExtractionError( + f"Unsafe archive member `{member.name}` " + "escapes the destination directory." + ) + if member.isdir(): out_path.mkdir(parents = True, exist_ok = True) elif member.isfile(): + extracted_bytes += max(member.size, 0) + if extracted_bytes > _MAX_EXTRACT_BYTES: + raise ExtractionError( + "Extraction exceeds the safety limit " + f"of {_MAX_EXTRACT_BYTES} bytes." + ) out_path.parent.mkdir( parents = True, exist_ok = True ) @@ -544,6 +724,8 @@ def _getExtensions( with extracted as src_fp, \ open(out_path, "wb") as dst_fp: shutil.copyfileobj(src_fp, dst_fp) + # ? symlink / hardlink / device / fifo members + # ? are neither dir nor file - skipped on purpose if not found: raise FileNotFoundError( @@ -571,7 +753,7 @@ def _listExtensions( is sorted alphabetically for deterministic CLI output. """ - timeout = kwargs.get("timeout", 30) + timeout = kwargs.get("timeout", _REQUEST_TIMEOUT) owner, repository = self.getSlug(remote = remote) sub = Path(str(source)).as_posix().lstrip("./").strip("/") @@ -579,12 +761,14 @@ def _listExtensions( "https://api.github.com/repos/" "{owner}/{repository}/contents/{path}?ref={ref}" ).format( - owner = owner, repository = repository, - path = sub, ref = version + owner = quote(owner, safe = ""), + repository = quote(repository, safe = ""), + path = quote(sub, safe = "/"), + ref = quote(str(version), safe = "") ) response = requests.get( - uri, verify = False, + uri, verify = self.control.verify, headers = self.headers, timeout = timeout ) response.raise_for_status() diff --git a/polyskills/tests/__init__.py b/polyskills/tests/__init__.py index eccc767..7216861 100644 --- a/polyskills/tests/__init__.py +++ b/polyskills/tests/__init__.py @@ -5,10 +5,10 @@ ===================================== A :mod:`unittest` based automated test-suite for the :mod:`polyskills` -package. The tests exercise the public dispatcher -:meth:`polyskills.remote.sources.GithubManager.get` end-to-end against -a live GitHub repository to validate the contracts surfaced by the -remote source manager. +package. The default run is hermetic: the CLI, path-resolution, and the +security regression tests (TLS forwarding, archive path-traversal, size +caps, pagination cap) mock the network and synthesise archive fixtures +in memory, so no connection is needed. Run from the repository root:: @@ -16,13 +16,30 @@ # or, equivalently python -m polyskills.tests -:NOTE: These tests perform live HTTP requests against -``https://api.github.com``. A network connection is required, and the -GitHub anonymous rate-limit may throttle very frequent runs - export -``POLYSKILLS_REMOTE_TOKEN`` (or set the workflow's ``GITHUB_TOKEN``) -to authenticate the requests. +:NOTE: The end-to-end tests that hit the live +``https://api.github.com`` REST API and the PyUtility/polyskills +repository are opt-in - set ``POLYSKILLS_RUN_LIVE=1`` to enable them. +Behind a TLS-intercepting proxy also set ``POLYSKILLS_NO_VERIFY=1`` so +the live fixture can reach GitHub, and export ``POLYSKILLS_REMOTE_TOKEN`` +(or the workflow's ``GITHUB_TOKEN``) to raise the anonymous rate-limit. """ +import os +import tempfile + __version__ = "v1.0.0" +# ? Redirect the tracking database onto an isolated temporary file for +# ? the whole test run so no test ever reads from or writes to the real +# ? user-level ``~/.polyskills/records.db``. ``default_database_path`` +# ? consults this variable at call time, so setting it here - before +# ? any test triggers a tracking write - is sufficient. The value is +# ? only a safety net; the dedicated database tests still pass an +# ? explicit path for strict per-test isolation. ``setdefault`` keeps +# ? an externally supplied override (for example in CI) intact. +os.environ.setdefault( + "POLYSKILLS_DB_PATH", + os.path.join(tempfile.gettempdir(), "polyskills-test-records.db") +) + # init-time options registrations, use base.py for shared fixtures diff --git a/polyskills/tests/base.py b/polyskills/tests/base.py index 49868b3..a6af975 100644 --- a/polyskills/tests/base.py +++ b/polyskills/tests/base.py @@ -10,6 +10,7 @@ the suite remains DRY. """ +import os import shutil import logging import tempfile @@ -23,7 +24,7 @@ from polyskills.remote.sources import GithubManager, SourceControl -# silence verify=False noise + GitHub redirect warnings during runs +# silence insecure-request + GitHub redirect noise for opt-in live runs warnings.filterwarnings("ignore") urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) @@ -41,11 +42,21 @@ EXPECTED : Set[str] = {"SKILL.md"} # minimum file expected after extract PAGINATION : int = 100 +# ? live-network tests are opt-in to keep the default test run hermetic +# ? and offline; set POLYSKILLS_RUN_LIVE=1 to exercise them. Behind a +# ? TLS-intercepting proxy also set POLYSKILLS_NO_VERIFY=1 so the live +# ? fixture can reach GitHub without certificate verification. +_RUN_LIVE : str = os.environ.get("POLYSKILLS_RUN_LIVE", "") +_VERIFY : bool = not os.environ.get("POLYSKILLS_NO_VERIFY") + # ? canonical user-level skill installation root (platform-agnostic via Path.home()) USER_SKILL_ROOT : Path = Path.home() / ".claude" / "skills" SKILL_FILENAME : str = "SKILL.md" +@unittest.skipUnless( + _RUN_LIVE, "live-network tests; set POLYSKILLS_RUN_LIVE=1 to run" +) class GithubManagerTestCase(unittest.TestCase): """ A reusable :class:`unittest.TestCase` base that provisions a @@ -70,7 +81,9 @@ class to avoid re-creating the HTTP session needlessly. f"polyskills.tests.{cls.__name__}" ) cls.manager = GithubManager( - control = SourceControl(pagination = PAGINATION) + control = SourceControl( + pagination = PAGINATION, verify = _VERIFY + ) ) diff --git a/polyskills/tests/test_cli.py b/polyskills/tests/test_cli.py index a70343e..97fc821 100644 --- a/polyskills/tests/test_cli.py +++ b/polyskills/tests/test_cli.py @@ -560,19 +560,23 @@ def test_manager_supports_agents_library(self) -> None: ) - def test_unsupported_remote_propagates_value_error(self) -> None: + def test_unsupported_remote_exits_with_clean_error(self) -> None: """ - :func:`cli.main` must not swallow the :class:`ValueError` - raised by :func:`cli._resolveManager` for an unknown remote - - the surface error is the user's signal to use a supported - provider. + :func:`cli.main` must render the :class:`ValueError` raised by + :func:`cli._resolveManager` for an unknown remote as a concise + ``[ERROR]`` message on stderr and exit non-zero, instead of + leaking a raw traceback to the user. """ - with self.assertRaises(ValueError): - _run_main([ - "manager", "https://gitlab.com/foo/bar", - "--name", _NAME, "skills" - ]) + code, _, err = _run_main([ + "manager", "https://gitlab.com/foo/bar", + "--name", _NAME, "skills" + ]) + + self.assertEqual(code, 1) + self.assertIn("[ERROR]", err) + self.assertIn("not supported", err) + self.assertNotIn("Traceback", err) class TestDestinationExpansion(unittest.TestCase): diff --git a/polyskills/tests/test_database.py b/polyskills/tests/test_database.py new file mode 100644 index 0000000..12386d9 --- /dev/null +++ b/polyskills/tests/test_database.py @@ -0,0 +1,452 @@ +# -*- encoding: utf-8 -*- + +""" +Offline Tests For The Polyskills Tracking Database +================================================== + +Hermetic :mod:`unittest` cases for the :mod:`polyskills.database` +sub-package. Every case is fully offline - no network is touched - and +each tracker case writes to its own temporary SQLite file passed +explicitly as ``database_path`` so the real user-level database is +never read or written. + +The cases cover four concerns: + + * :class:`TestDatabasePaths` - the OS-independent path resolver and + its ``POLYSKILLS_DB_PATH`` override. + * :class:`TestSchemaAndEngine` - schema materialisation, the SQLite + pragmas, and the ``ON DELETE CASCADE`` behaviour. + * :class:`TestTracker` - the write and read façade: event recording, + strict-normalisation of the lookup rows, derived first / last / + SHA facts, failure capture, and graceful degradation. + * :class:`TestTrackingCliArguments` - argparse construction of the + new ``--no-tracking`` flag and the ``records`` subcommand. +""" + +import os +import shutil +import tempfile +import unittest + +from pathlib import Path +from unittest import mock + +from polyskills import cli +from polyskills import database as db +from polyskills.database import engine as db_engine +from polyskills.database import tracker as db_tracker +from polyskills.database.paths import default_database_path + +# ? a representative, fully-specified fetch used by the tracker cases; +# ? individual tests override single keys through ``_record`` below. +_FETCH = dict( + source_name = "GITHUB", + source_base_url = "https://www.github.com/", + owner = "PyUtility", + repository = "polyskills", + remote_url = "https://github.com/PyUtility/polyskills", + library = "skills", + name = "sql-code-format", + source_dir = "extensions/skills", + install_path = "/home/user/.claude/skills/sql-code-format", + requested_version = "master", + action = "fetch", + status = "success", +) + +_EXPECTED_TABLES = { + "sources", "libraries", "remotes", "extensions", + "installations", "environments", "fetch_events", +} + + +class TestDatabasePaths(unittest.TestCase): + """ + Path-resolution tests for :func:`default_database_path`. + + These cases make no network request and write nothing to disk; they + only assert the resolved path string and the override contract. + """ + + def test_default_is_records_db_under_polyskills_home(self) -> None: + """ + With no override, the path must end in ``.polyskills/records.db`` + and be absolute on every platform. + """ + + with mock.patch.dict(os.environ, {}, clear = False): + os.environ.pop("POLYSKILLS_DB_PATH", None) + path = default_database_path() + + self.assertTrue(path.is_absolute()) + self.assertEqual(path.name, "records.db") + self.assertEqual(path.parent.name, ".polyskills") + + + def test_environment_override_is_honoured(self) -> None: + """ + ``POLYSKILLS_DB_PATH`` must take precedence and be expanded for + the ``~`` home token. + """ + + with mock.patch.dict(os.environ, {}, clear = False): + os.environ["POLYSKILLS_DB_PATH"] = os.path.join("~", "x.db") + path = default_database_path() + + self.assertEqual(path.name, "x.db") + self.assertNotIn("~", path.as_posix()) + + + def test_resolution_is_side_effect_free(self) -> None: + """ + Resolving the default path must not create the parent + directory; materialisation belongs to the engine layer. + """ + + staging = Path(tempfile.mkdtemp()) + target = staging / "nested" / "records.db" + try: + with mock.patch.dict(os.environ, {}, clear = False): + os.environ["POLYSKILLS_DB_PATH"] = str(target) + resolved = default_database_path() + self.assertFalse(resolved.parent.exists()) + finally: + shutil.rmtree(staging, ignore_errors = True) + + +class TestSchemaAndEngine(unittest.TestCase): + """ + Engine and schema tests for :mod:`polyskills.database.engine`. + """ + + def setUp(self) -> None: + """Allocate an isolated temporary database file per test.""" + + self.staging = Path(tempfile.mkdtemp(prefix = "polyskills-db-")) + self.db = self.staging / "records.db" + + + def tearDown(self) -> None: + """Dispose cached engines and remove the staging directory.""" + + db_tracker.reset_cache_for_tests() + shutil.rmtree(self.staging, ignore_errors = True) + + + def test_build_session_factory_creates_every_table(self) -> None: + """ + :func:`build_session_factory` must materialise all seven + normalised tables. + """ + + from sqlalchemy import inspect + + engine, _factory = db_engine.build_session_factory(self.db) + names = set(inspect(engine).get_table_names()) + engine.dispose() + + self.assertEqual(names, _EXPECTED_TABLES) + + + def test_sqlite_pragmas_are_applied(self) -> None: + """ + Every connection must enable foreign keys and use the WAL + journal so the schema's cascade behaves correctly. + """ + + engine, _factory = db_engine.build_session_factory(self.db) + try: + with engine.connect() as connection: + fk = connection.exec_driver_sql( + "PRAGMA foreign_keys" + ).scalar() + journal = connection.exec_driver_sql( + "PRAGMA journal_mode" + ).scalar() + finally: + engine.dispose() + + self.assertEqual(fk, 1) + self.assertEqual(str(journal).lower(), "wal") + + + def test_deleting_installation_cascades_to_events(self) -> None: + """ + Removing an installation must cascade-delete its fetch events + so no orphan audit rows remain. + """ + + from polyskills.database.models import ( + Extension, FetchEvent, Installation, Library, Remote, Source + ) + + engine, factory = db_engine.build_session_factory(self.db) + try: + with factory() as session: + source = Source( + name = "GITHUB", base_url = "https://www.github.com/" + ) + library = Library(name = "skills") + session.add_all([source, library]) + session.flush() + remote = Remote( + source_id = source.id, owner = "PyUtility", + repository = "polyskills", + url = "https://github.com/PyUtility/polyskills" + ) + session.add(remote) + session.flush() + extension = Extension( + remote_id = remote.id, library_id = library.id, + name = "x", source_dir = "extensions/skills" + ) + session.add(extension) + session.flush() + installation = Installation( + extension_id = extension.id, install_path = "/tmp/x" + ) + session.add(installation) + session.flush() + session.add(FetchEvent( + installation_id = installation.id, action = "fetch", + requested_version = "master", status = "success" + )) + session.commit() + + session.delete(installation) + session.commit() + self.assertEqual(session.query(FetchEvent).count(), 0) + finally: + engine.dispose() + + +class TestTracker(unittest.TestCase): + """ + Write and read tests for :mod:`polyskills.database.tracker`. + + Each case targets a private temporary database via the explicit + ``database_path`` argument, so the suite-level redirect is never + relied upon for correctness. + """ + + def setUp(self) -> None: + """Allocate a temp database and clear the engine cache.""" + + self.staging = Path(tempfile.mkdtemp(prefix = "polyskills-trk-")) + self.db = self.staging / "records.db" + db_tracker.reset_cache_for_tests() + db_tracker.set_invocation_context("api") + + + def tearDown(self) -> None: + """Dispose cached engines and remove the staging directory.""" + + db_tracker.reset_cache_for_tests() + shutil.rmtree(self.staging, ignore_errors = True) + + + def _record(self, **overrides) -> object: + """Record a fetch built from ``_FETCH`` plus ``overrides``.""" + + payload = dict(_FETCH) + payload.update(overrides) + return db.record_fetch(database_path = self.db, **payload) + + + def test_record_fetch_persists_and_returns_event_id(self) -> None: + """A successful record must return the new event primary key.""" + + event_id = self._record( + resolved_commit_sha = "abc123", file_count = 3, + total_bytes = 4096, duration_ms = 120 + ) + self.assertIsInstance(event_id, int) + + + def test_repeated_fetch_reuses_rows_and_normalises(self) -> None: + """ + Two fetches of the same extension into the same path must yield + one installation and two events while the lookup tables stay + deduplicated - the core normalisation guarantee. + """ + + from polyskills.database.models import ( + Extension, FetchEvent, Installation, Remote, Source + ) + + self._record(resolved_commit_sha = "aaa") + self._record(resolved_commit_sha = "bbb") + + _engine, factory = db_engine.build_session_factory(self.db) + try: + with factory() as session: + self.assertEqual(session.query(Source).count(), 1) + self.assertEqual(session.query(Remote).count(), 1) + self.assertEqual(session.query(Extension).count(), 1) + self.assertEqual(session.query(Installation).count(), 1) + self.assertEqual(session.query(FetchEvent).count(), 2) + finally: + _engine.dispose() + + + def test_distinct_extensions_same_path_not_conflated(self) -> None: + """ + Two different extensions fetched into the same install path must + yield two installations with independent histories, never one + installation silently relabelled to the second extension. + """ + + self._record( + name = "skill-a", install_path = "/shared/path", + resolved_commit_sha = "aaa" + ) + self._record( + name = "skill-b", install_path = "/shared/path", + resolved_commit_sha = "bbb" + ) + + rows = db.list_installations(database_path = self.db) + shared = [r for r in rows if r["install_path"] == "/shared/path"] + self.assertEqual( + sorted(r["name"] for r in shared), ["skill-a", "skill-b"] + ) + for row in shared: + self.assertEqual(row["fetch_count"], 1) + + + def test_derived_first_last_and_latest_sha(self) -> None: + """ + The read summary must derive first-fetched, last-updated, and + the latest commit SHA from the success events. + """ + + self._record(resolved_commit_sha = "aaa") + self._record(resolved_commit_sha = "bbb") + + rows = db.list_installations(database_path = self.db) + self.assertEqual(len(rows), 1) + row = rows[0] + self.assertEqual(row["fetch_count"], 2) + self.assertEqual(row["resolved_commit_sha"], "bbb") + self.assertIsNotNone(row["first_fetched_at"]) + self.assertIsNotNone(row["last_updated_at"]) + self.assertEqual(row["last_status"], "success") + + + def test_failure_event_recorded_without_derived_timestamps(self) -> None: + """ + A failed-only installation must record the error yet expose no + first-fetched or last-updated timestamp. + """ + + self._record( + name = "broken", install_path = "/tmp/broken", + status = "failed", error = "HTTP 404" + ) + + rows = db.list_installations(database_path = self.db) + match = [r for r in rows if r["name"] == "broken"] + self.assertEqual(len(match), 1) + self.assertEqual(match[0]["last_status"], "failed") + self.assertIsNone(match[0]["first_fetched_at"]) + self.assertIsNone(match[0]["resolved_commit_sha"]) + + + def test_get_installation_includes_event_history(self) -> None: + """ + The detailed view must return one entry per matching + installation and embed its chronological event list. + """ + + self._record(resolved_commit_sha = "aaa") + self._record(resolved_commit_sha = "bbb") + + details = db.get_installation( + "sql-code-format", database_path = self.db + ) + self.assertEqual(len(details), 1) + self.assertEqual(len(details[0]["events"]), 2) + self.assertEqual( + details[0]["events"][0]["resolved_commit_sha"], "aaa" + ) + + + def test_invocation_context_persisted_on_event(self) -> None: + """ + The active invocation context must be stamped onto the event. + """ + + db_tracker.set_invocation_context("cli") + self._record() + + details = db.get_installation( + "sql-code-format", database_path = self.db + ) + self.assertEqual(details[0]["events"][0]["invoked_via"], "cli") + + + def test_record_fetch_is_noop_without_sqlalchemy(self) -> None: + """ + With SQLAlchemy reported absent, recording must degrade to a + silent no-op returning ``None`` instead of raising. + """ + + with mock.patch.object( + db_tracker, "SQLALCHEMY_AVAILABLE", False + ): + result = self._record() + self.assertIsNone(result) + + +class TestTrackingCliArguments(unittest.TestCase): + """ + Hermetic argparse-construction tests for the tracking-related CLI + surface. These never invoke :func:`cli.main` and touch no database. + """ + + def test_manager_exposes_no_tracking_flag(self) -> None: + """``--no-tracking`` must parse to ``True`` on the manager.""" + + parser = cli.buildParser() + args = parser.parse_args([ + "manager", "https://github.com/PyUtility/polyskills", + "--name", "sql-code-format", "--no-tracking", "skills" + ]) + self.assertTrue(args.no_tracking) + + + def test_no_tracking_defaults_to_false(self) -> None: + """Omitting the flag must leave tracking enabled.""" + + parser = cli.buildParser() + args = parser.parse_args([ + "manager", "https://github.com/PyUtility/polyskills", + "--name", "sql-code-format", "skills" + ]) + self.assertFalse(args.no_tracking) + + + def test_records_subcommand_parses_name_and_db(self) -> None: + """The ``records`` subcommand must accept ``--name`` / ``--db``.""" + + parser = cli.buildParser() + args = parser.parse_args([ + "records", "--name", "sql-code-format", "--db", "/tmp/x.db" + ]) + self.assertEqual(args.command, "records") + self.assertEqual(args.name, "sql-code-format") + self.assertEqual(args.db, "/tmp/x.db") + + + def test_records_subcommand_defaults(self) -> None: + """Bare ``records`` must default ``--name`` and ``--db`` to None.""" + + parser = cli.buildParser() + args = parser.parse_args(["records"]) + self.assertEqual(args.command, "records") + self.assertIsNone(args.name) + self.assertIsNone(args.db) + + +if __name__ == "__main__": + unittest.main(verbosity = 2) diff --git a/polyskills/tests/test_security.py b/polyskills/tests/test_security.py new file mode 100644 index 0000000..9b74bee --- /dev/null +++ b/polyskills/tests/test_security.py @@ -0,0 +1,356 @@ +# -*- encoding: utf-8 -*- + +""" +Hermetic Security Regression Tests for the Remote Pipeline +---------------------------------------------------------- + +Offline tests that exercise the security-critical behaviour of +:mod:`polyskills.remote.sources` and :func:`polyskills.cli._resolveVerify` +without touching the network. Every remote ``requests.get`` call is +mocked and archive fixtures are synthesised in memory, so the suite runs +deterministically in CI and locally behind a proxy. + +The cases lock in the guarantees added by the security hardening pass: +TLS verification is forwarded (never silently disabled), crafted archive +members cannot escape the destination directory, downloads and +extraction are size-bounded, and tag pagination is capped. +""" + +import io +import os +import shutil +import tarfile +import tempfile +import unittest + +from pathlib import Path +from typing import Dict, Optional +from unittest import mock + +from polyskills import cli +from polyskills.remote import sources as sources_mod +from polyskills.remote.sources import GithubManager, SourceControl +from polyskills.error.exceptions import ( + ExtractionError, RemoteError, ValidationError +) + +# ? a fixed remote URL that satisfies GithubManager.remotePattern so the +# ? slug extraction succeeds before the mocked download is reached +_REMOTE : str = "https://github.com/owner/repo" + +# ? the synthetic archive root mirrors GitHub's `--` +_ROOT : str = "owner-repo-deadbeef" + + +def _make_tarball( + members : Dict[str, bytes], root : str = _ROOT +) -> bytes: + """ + Build an in-memory ``.tar.gz`` whose every entry is nested under a + single ``root`` directory, mimicking the layout of a GitHub source + tarball. Keys are archive-relative POSIX paths and values are the + raw file contents. + + :type members: Dict[str, bytes] + :param members: Mapping of archive-relative path to file bytes; the + ``root`` prefix is added automatically to each entry. + + :type root: str + :param root: Top-level directory name injected ahead of every + member, defaulting to :data:`_ROOT`. + + :rtype: bytes + :returns: The gzip-compressed tar archive as a byte string. + """ + + buffer = io.BytesIO() + with tarfile.open(fileobj = buffer, mode = "w:gz") as archive: + for name, content in members.items(): + info = tarfile.TarInfo(f"{root}/{name}") + info.size = len(content) + archive.addfile(info, io.BytesIO(content)) + + return buffer.getvalue() + + +class _FakeResponse: + """ + Minimal stand-in for a non-streaming :class:`requests.Response` + used by ``_getTags`` and ``_listExtensions``. + + :type payload: Optional[object] + :param payload: Value returned by :meth:`json`. + + :type links: Optional[dict] + :param links: Value exposed as the ``links`` attribute for + pagination traversal. + """ + + def __init__( + self, payload : Optional[object] = None, + links : Optional[dict] = None + ) -> None: + self._payload = payload if payload is not None else [] + self.links = links if links is not None else {} + + def raise_for_status(self) -> None: + return None + + def json(self) -> object: + return self._payload + + +class _FakeStream: + """ + Streaming :class:`requests.Response` stand-in usable as a context + manager, yielding a fixed archive payload from :meth:`iter_content`. + + :type payload: bytes + :param payload: Raw bytes streamed back to the download loop. + """ + + def __init__(self, payload : bytes) -> None: + self._payload = payload + + def __enter__(self) -> "_FakeStream": + return self + + def __exit__(self, *exc : object) -> bool: + return False + + def raise_for_status(self) -> None: + return None + + def iter_content(self, chunk_size : int = 1): + for idx in range(0, len(self._payload), chunk_size): + yield self._payload[idx : idx + chunk_size] + + +class TestVerifyResolution(unittest.TestCase): + """ + Unit tests for :func:`polyskills.cli._resolveVerify`, the single + point that maps the ``--no-verify`` / ``--request-cert`` flags onto + the ``verify`` value forwarded to :mod:`requests`. + """ + + def test_default_is_secure(self) -> None: + """ + With neither flag the resolver must return ``True`` so requests + verify against the standard trust store. + """ + + self.assertIs(cli._resolveVerify(False, False), True) + + def test_no_verify_disables(self) -> None: + """``--no-verify`` must resolve to ``False``.""" + + self.assertIs(cli._resolveVerify(True, False), False) + + def test_request_cert_pins_certifi_bundle(self) -> None: + """ + ``--request-cert`` must resolve to the bundled certifi CA path + for strict verification. + """ + + import certifi + + self.assertEqual( + cli._resolveVerify(False, True), certifi.where() + ) + + def test_conflicting_flags_raise(self) -> None: + """ + Requesting both flags together must raise + :class:`ValidationError` (also a :class:`ValueError`). + """ + + with self.assertRaises(ValidationError): + cli._resolveVerify(True, True) + + +class TestVerifyForwarding(unittest.TestCase): + """ + Guards that the configured ``verify`` value reaches every + :func:`requests.get` call rather than being silently overridden. + """ + + def test_tags_request_forwards_verify_false(self) -> None: + """ + A manager built with ``verify=False`` must pass that value to + the underlying tags request. + """ + + manager = GithubManager(SourceControl(verify = False)) + with mock.patch.object( + sources_mod.requests, "get", + return_value = _FakeResponse() + ) as fake_get: + manager.get(_REMOTE, mode = "tags") + + self.assertIs(fake_get.call_args.kwargs["verify"], False) + + def test_tags_request_forwards_verify_true(self) -> None: + """ + The secure default (``verify=True``) must likewise be forwarded + verbatim to the tags request. + """ + + manager = GithubManager(SourceControl()) + with mock.patch.object( + sources_mod.requests, "get", + return_value = _FakeResponse() + ) as fake_get: + manager.get(_REMOTE, mode = "tags") + + self.assertIs(fake_get.call_args.kwargs["verify"], True) + + +class TestPaginationCap(unittest.TestCase): + """ + Ensures :meth:`GithubManager._getTags` cannot be trapped in an + endless ``next`` link chain served by a hostile remote. + """ + + def test_endless_pagination_raises_remote_error(self) -> None: + """ + When every response advertises a ``next`` link, the loop must + abort with :class:`RemoteError` once ``_MAX_PAGES`` is reached. + """ + + endless = _FakeResponse( + payload = [], links = {"next": {"url": "https://api/next"}} + ) + with mock.patch.object(sources_mod, "_MAX_PAGES", 3), \ + mock.patch.object( + sources_mod.requests, "get", return_value = endless + ): + with self.assertRaises(RemoteError): + GithubManager().get(_REMOTE, mode = "tags") + + +class TestArchiveExtractionSafety(unittest.TestCase): + """ + Offline extraction tests for :meth:`GithubManager._getExtensions` + covering the happy path and each safety guard, driven by in-memory + archive fixtures and a mocked streaming download. + """ + + def setUp(self) -> None: + """Allocate an isolated staging directory per test.""" + + self.staging = Path( + tempfile.mkdtemp(prefix = "polyskills-sec-") + ) + self.destination = self.staging / "out" + + def tearDown(self) -> None: + """Remove the staging directory created in :meth:`setUp`.""" + + shutil.rmtree(self.staging, ignore_errors = True) + + def _extract(self, payload : bytes, name : str = "skill") -> None: + """ + Drive :meth:`GithubManager.get` in ``extensions`` mode with the + download mocked to stream ``payload``. + + :type payload: bytes + :param payload: The archive bytes returned by the mocked GET. + + :type name: str + :param name: Extension (leaf directory) name to extract. + """ + + with mock.patch.object( + sources_mod.requests, "get", + return_value = _FakeStream(payload) + ): + GithubManager().get( + _REMOTE, mode = "extensions", name = name, + library = "skills", source = "extensions/skills", + destination = self.destination + ) + + def test_happy_path_extracts_files(self) -> None: + """ + A well-formed archive must extract every member under the + target directory into the destination, preserving sub-paths. + """ + + payload = _make_tarball({ + "extensions/skills/skill/SKILL.md": b"---\nname: x\n---\n", + "extensions/skills/skill/ref/general.md": b"hello", + }) + + self._extract(payload) + + self.assertTrue((self.destination / "SKILL.md").is_file()) + self.assertEqual( + (self.destination / "ref" / "general.md").read_bytes(), + b"hello" + ) + + def test_path_traversal_member_is_rejected(self) -> None: + """ + A crafted member carrying ``..`` segments that escape the + destination must raise :class:`ExtractionError` and must not + write the malicious file anywhere outside the destination. + """ + + payload = _make_tarball({ + "extensions/skills/skill/SKILL.md": b"ok", + "extensions/skills/skill/../../../../escape.txt": b"pwned", + }) + + with self.assertRaises(ExtractionError): + self._extract(payload) + + leaked = self.staging.parent / "escape.txt" + self.assertFalse( + leaked.exists(), f"path traversal leaked file to {leaked}" + ) + + def test_download_exceeding_size_cap_is_rejected(self) -> None: + """ + A download larger than ``_MAX_ARCHIVE_BYTES`` must abort with + :class:`ExtractionError` before extraction begins. + """ + + payload = _make_tarball({ + "extensions/skills/skill/SKILL.md": b"x" * 64, + }) + + with mock.patch.object(sources_mod, "_MAX_ARCHIVE_BYTES", 8): + with self.assertRaises(ExtractionError): + self._extract(payload) + + def test_extraction_exceeding_size_cap_is_rejected(self) -> None: + """ + When the cumulative uncompressed size exceeds + ``_MAX_EXTRACT_BYTES`` the extraction must abort with + :class:`ExtractionError`. + """ + + payload = _make_tarball({ + "extensions/skills/skill/SKILL.md": b"x" * 256, + }) + + with mock.patch.object(sources_mod, "_MAX_EXTRACT_BYTES", 16): + with self.assertRaises(ExtractionError): + self._extract(payload) + + def test_missing_extension_raises_not_found(self) -> None: + """ + An archive that contains no entry under the requested target + must raise :class:`FileNotFoundError`. + """ + + payload = _make_tarball({ + "extensions/skills/other/SKILL.md": b"ok", + }) + + with self.assertRaises(FileNotFoundError): + self._extract(payload, name = "skill") + + +if __name__ == "__main__": + unittest.main(verbosity = 2) diff --git a/pyproject.toml b/pyproject.toml index 368d180..30cfb19 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,7 +35,7 @@ classifiers = [ dependencies = [ "requests>=2.31", - "packaging>=23.0", + "sqlalchemy>=2.0", ] [project.scripts]