diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 8452afb..e818a7e 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -1,5 +1,4 @@ name: Run Pre-recorded Tests - on: pull_request: branches: @@ -7,7 +6,6 @@ on: push: branches: - master - jobs: run-tests: runs-on: ubuntu-latest @@ -17,40 +15,26 @@ jobs: steps: - name: Check out code uses: actions/checkout@v6 - - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v6 with: python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - pip install vcrpy pytest==7.4.2 requests pytest-mock python-documentcloud pytest-xdist pytest-recording python-squarelet - + run: pip install -e ".[test]" - name: Run pre-recorded tests - run: | - make test - working-directory: . - + run: make test pylint-and-black: runs-on: ubuntu-latest steps: - name: Check out code uses: actions/checkout@v6 - - name: Set up Python 3.11 uses: actions/setup-python@v6 with: python-version: "3.11" - - - name: Install dependencies for imports - run: | - pip install python-dateutil requests urllib3 fastjsonschema ratelimit listcrunch pyyaml pytest vcrpy python-squarelet - - - name: Install pylint and black - run: | - pip install pylint black - + - name: Install dependencies + run: pip install -e ".[dev,test]" - name: Run pylint and black on ./documentcloud and ./tests run: | - pylint ./documentcloud ./tests; black --check ./documentcloud ./tests + pylint ./documentcloud ./tests + black --check ./documentcloud ./tests \ No newline at end of file diff --git a/docs/changelog.rst b/docs/changelog.rst index 8cea322..8520b34 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -1,6 +1,10 @@ Changelog --------- +4.7.0 +~~~~~ +* Added burst-based sane rate limits to several endpoints. + 4.6.0 ~~~~~ * Added load_run_data and store_run_data on the Add-On class to access AddOn run data. diff --git a/docs/conf.py b/docs/conf.py index 7c76388..e80e36c 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -55,9 +55,9 @@ # built documents. # # The short X.Y version. -version = "4.6" +version = "4.7" # The full version, including alpha/beta/rc tags. -release = "4.6.0" +release = "4.7.0" # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/documentcloud/client.py b/documentcloud/client.py index 1acee53..04ae838 100644 --- a/documentcloud/client.py +++ b/documentcloud/client.py @@ -1,12 +1,12 @@ -# Import SquareletClient from python-squarelet # Standard Library import logging +import time # Third Party +import token_bucket from squarelet import SquareletClient # Local -# Local Imports from .documents import DocumentClient from .organizations import OrganizationClient from .projects import ProjectClient @@ -14,6 +14,22 @@ logger = logging.getLogger("documentcloud") +# Per-endpoint rate limits applied on top of the global squarelet limit. +# Format: (method, url_pattern, rate_per_second, capacity) +# +# Endpoint Rate Burst Notes +# -------- ---- ----- ----- +# GET documents/search 15/min 50 +# POST documents/ 12/min 100 25 docs/bulk call = up to 300 docs/min +# PUT documents/ 12/min 100 25 docs/bulk call = up to 300 docs/min +# GET files/ 15/min 100 PDFs, full text, and other private assets +ENDPOINT_RATE_LIMITS = [ + ("GET", "documents/search", 15 / 60, 50), + ("POST", "documents/", 12 / 60, 100), + ("PUT", "documents/", 12 / 60, 100), + ("GET", "files/", 15 / 60, 100), +] + class DocumentCloud(SquareletClient): """ @@ -51,8 +67,34 @@ def __init__( else: logger.addHandler(logging.NullHandler()) + # Build per-endpoint token bucket rate limiters + storage = token_bucket.MemoryStorage() + self._endpoint_limiters = [ + ( + pattern_method, + pattern, + token_bucket.Limiter(rate=rate, capacity=capacity, storage=storage), + f"{pattern_method}:{pattern}", + ) + for pattern_method, pattern, rate, capacity in ENDPOINT_RATE_LIMITS + ] + # Initialize the sub-clients using SquareletClient self.documents = DocumentClient(self) self.projects = ProjectClient(self) self.users = UserClient(self) self.organizations = OrganizationClient(self) + + def request(self, method, url, raise_error=True, **kwargs): + for pattern_method, pattern, limiter, bucket_key in self._endpoint_limiters: + if pattern_method.upper() == method.upper() and pattern in url: + if not limiter.consume(bucket_key): + logger.warning( + "Rate limit reached for %s %s, throttling...", + method.upper(), + pattern, + ) + while not limiter.consume(bucket_key): + time.sleep(0.1) + return super().request(method, url, raise_error=raise_error, **kwargs) + return super().request(method, url, raise_error=raise_error, **kwargs) diff --git a/documentcloud/documents.py b/documentcloud/documents.py index d126f9d..0a7dd65 100644 --- a/documentcloud/documents.py +++ b/documentcloud/documents.py @@ -7,11 +7,13 @@ import logging import os import re +import time import warnings from functools import partial from urllib.parse import urlparse # Third Party +import token_bucket from requests.exceptions import RequestException # Local @@ -28,6 +30,8 @@ IMAGE_SIZES = ["thumbnail", "small", "normal", "large", "xlarge"] +DEFAULT_USER_AGENT = "python-documentcloud" + class Document(BaseAPIObject): """A single DocumentCloud document""" @@ -164,12 +168,17 @@ def _get_url(self, url, fmt=None): if base_netloc == url_netloc: # if the url host is the same as the base api host, - # sent the request with the client in order to include + # send the request with the client in order to include # authentication credentials response = self._client.get(url, full_url=True) else: - response = requests_retry_session().get( - url, headers={"User-Agent": "python-documentcloud2"} + response = self._client.documents.asset_get( + url, + headers={ + "User-Agent": self._client.session.headers.get( + "User-Agent", DEFAULT_USER_AGENT + ) + }, ) if fmt == "text": return response.content.decode("utf8") @@ -246,6 +255,26 @@ class DocumentClient(BaseAPIClient): api_path = "documents" resource = Document + def __init__(self, client): + super().__init__(client) + # Rate limit for public document asset fetches (S3-hosted). + # Private document assets go through the API client and are limited there. + # Token bucket: burst of 100, sustained at 15/min (0.25/sec). + storage = token_bucket.MemoryStorage() + self._asset_limiter = token_bucket.Limiter( + rate=15 / 60, + capacity=100, + storage=storage, + ) + self._asset_session = requests_retry_session() + + def asset_get(self, url, **kwargs): + if not self._asset_limiter.consume("asset"): + logger.warning("Rate limit reached for asset fetch, throttling...") + while not self._asset_limiter.consume("asset"): + time.sleep(0.1) + return self._asset_session.get(url, **kwargs) + def search(self, query, **params): """Return documents matching a search query""" diff --git a/setup.py b/setup.py index a76b8ed..fb58793 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ setup( name="python-documentcloud", - version="4.6.0", + version="4.7.0", description="A simple Python wrapper for the DocumentCloud API", author="Mitchell Kotler", author_email="mitch@muckrock.com", @@ -27,6 +27,7 @@ "pyyaml", "fastjsonschema", "python-squarelet", + "token-bucket", ), extras_require={ "dev": [ @@ -40,6 +41,7 @@ "test": [ "pytest", "pytest-mock", + "pytest-xdist", "pytest-recording", "vcrpy", ], diff --git a/tests/test_client.py b/tests/test_client.py index 7915f05..76e8bf3 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -9,6 +9,7 @@ import ratelimit # DocumentCloud +from documentcloud import DocumentCloud from documentcloud.constants import RATE_LIMIT from documentcloud.exceptions import APIError, CredentialsFailedError @@ -111,3 +112,77 @@ def test_expired_refresh_token(short_client, record_mode): assert short_client.users.get("me") # check the refresh token was updated assert old_refresh_token != short_client.refresh_token + + +def test_endpoint_rate_limit_burst_exhaustion(): + """Token bucket should block after burst capacity is exhausted""" + client = DocumentCloud() + # Exhaust the search burst (capacity=50) + _pattern_method, _pattern, limiter, bucket_key = client._endpoint_limiters[0] + for _ in range(50): + limiter.consume(bucket_key) + assert not limiter.consume(bucket_key) + + +def test_endpoint_rate_limit_method_specificity(): + """GET and POST to documents/ should use different limiters""" + client = DocumentCloud() + limiters = {(pm, p): lim for pm, p, lim, _ in client._endpoint_limiters} + assert limiters[("GET", "files/")] is not limiters[("POST", "documents/")] + + +def test_endpoint_rate_limit_pattern_ordering(): + """documents/search should match before documents/""" + client = DocumentCloud() + url = "documents/search/" + matched = next( + p for pm, p, _, _ in client._endpoint_limiters if pm == "GET" and p in url + ) + assert matched == "documents/search" + + +def test_asset_rate_limit_burst_exhaustion(): + """Asset token bucket should block after burst capacity is exhausted""" + client = DocumentCloud() + limiter = client.documents._asset_limiter + for _ in range(100): + limiter.consume("asset") + assert not limiter.consume("asset") + + +def test_asset_rate_limit_refills(): + """Asset token bucket should refill over time""" + client = DocumentCloud() + limiter = client.documents._asset_limiter + for _ in range(100): + limiter.consume("asset") + assert not limiter.consume("asset") + time.sleep(5) + assert limiter.consume("asset") + + +def test_endpoint_rate_limit_buckets_are_independent(): + """Exhausting one endpoint's bucket should not affect another""" + client = DocumentCloud() + limiters = {(pm, p): (lim, bk) for pm, p, lim, bk in client._endpoint_limiters} + search_limiter, search_key = limiters[("GET", "documents/search")] + files_limiter, files_key = limiters[("GET", "files/")] + + # Exhaust search bucket + for _ in range(50): + search_limiter.consume(search_key) + assert not search_limiter.consume(search_key) + + # Files bucket should still have tokens + assert files_limiter.consume(files_key) + + +def test_endpoint_rate_limit_no_match_for_unrecognized_url(): + """Unrecognized URLs should not match any endpoint limiter""" + client = DocumentCloud() + url = "users/me/" + matched = next( + (p for pm, p, _, _ in client._endpoint_limiters if p in url), + None, + ) + assert matched is None