Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ jobs:

- name: Install dependencies
run: |
pip install vcrpy pytest==7.4.2 requests pytest-mock python-documentcloud pytest-xdist pytest-recording python-squarelet
pip install vcrpy pytest==7.4.2 requests token-bucket pytest-mock python-documentcloud pytest-xdist pytest-recording python-squarelet

- name: Run pre-recorded tests
run: |
Expand All @@ -45,7 +45,7 @@ jobs:

- name: Install dependencies for imports
run: |
pip install python-dateutil requests urllib3 fastjsonschema ratelimit listcrunch pyyaml pytest vcrpy python-squarelet
pip install python-dateutil requests urllib3 token-bucket fastjsonschema ratelimit listcrunch pyyaml pytest vcrpy python-squarelet

- name: Install pylint and black
run: |
Expand Down
47 changes: 45 additions & 2 deletions documentcloud/client.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,35 @@
# Import SquareletClient from python-squarelet
# Standard Library
import logging
import time

# Third Party
import token_bucket
from squarelet import SquareletClient

# Local
# Local Imports
from .documents import DocumentClient
from .organizations import OrganizationClient
from .projects import ProjectClient
from .users import UserClient

logger = logging.getLogger("documentcloud")

# Per-endpoint throttles, enforced in addition to the global squarelet limit.
# Every entry of ENDPOINT_RATE_LIMITS has the shape
#   (method, url_pattern, rate_per_second, capacity)
# The figures below are written per minute, to match the table, and are
# converted to a per-second rate when the list is built.
#
#   Endpoint               Rate    Burst  Notes
#   ---------------------  ------  -----  -----
#   GET  documents/search  15/min  50
#   POST documents/        12/min  100    25 docs/bulk call = up to 300 docs/min
#   PUT  documents/        12/min  100    25 docs/bulk call = up to 300 docs/min
#   GET  files/            15/min  100    PDFs, full text, and other private assets
ENDPOINT_RATE_LIMITS = [
    (verb, fragment, per_minute / 60, burst)
    for verb, fragment, per_minute, burst in (
        ("GET", "documents/search", 15, 50),
        ("POST", "documents/", 12, 100),
        ("PUT", "documents/", 12, 100),
        ("GET", "files/", 15, 100),
    )
]


class DocumentCloud(SquareletClient):
"""
Expand Down Expand Up @@ -51,8 +67,35 @@ def __init__(
else:
logger.addHandler(logging.NullHandler())

# Build per-endpoint token bucket rate limiters
storage = token_bucket.MemoryStorage()
self._endpoint_limiters = [
(
pattern_method,
pattern,
token_bucket.Limiter(rate=rate, capacity=capacity, storage=storage),
f"{pattern_method}:{pattern}",
)
for pattern_method, pattern, rate, capacity in ENDPOINT_RATE_LIMITS
]

# Initialize the sub-clients using SquareletClient
self.documents = DocumentClient(self)
self.projects = ProjectClient(self)
self.users = UserClient(self)
self.organizations = OrganizationClient(self)

def _base_request(self, method, url, raise_error=True, **kwargs):
    """Forward a request unchanged to the parent class's ``request``.

    No endpoint rate limiting happens here; the arguments are handed to
    ``super().request`` exactly as received and its result is returned.
    """
    parent_request = super().request
    return parent_request(method, url, raise_error=raise_error, **kwargs)

def request(self, method, url, raise_error=True, **kwargs):
    """Send a request, throttled by the first matching endpoint limiter.

    The entries of ``self._endpoint_limiters`` are checked in order. An
    entry matches when its HTTP method equals ``method`` (compared
    case-insensitively) and its URL pattern occurs anywhere in ``url``.
    Only the first matching entry is used.

    When an entry matches, this call blocks, polling every 0.1 seconds,
    until that limiter grants a token, and then sends the request through
    ``_base_request``. When nothing matches, the request goes directly to
    the parent class's ``request``.
    """
    matched = next(
        (
            (bucket, key)
            for verb, fragment, bucket, key in self._endpoint_limiters
            if verb.upper() == method.upper() and fragment in url
        ),
        None,
    )
    if matched is None:
        # No per-endpoint limit applies to this method/URL combination.
        return super().request(method, url, raise_error=raise_error, **kwargs)

    bucket, key = matched
    # Poll until the bucket has a token to spend on this request.
    while not bucket.consume(key):
        time.sleep(0.1)
    return self._base_request(method, url, raise_error=raise_error, **kwargs)
28 changes: 25 additions & 3 deletions documentcloud/documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,13 @@
import logging
import os
import re
import time
import warnings
from functools import partial
from urllib.parse import urlparse

# Third Party
import token_bucket
from requests.exceptions import RequestException

# Local
Expand All @@ -28,6 +30,23 @@

IMAGE_SIZES = ["thumbnail", "small", "normal", "large", "xlarge"]

# Throttle for public document asset downloads, which are served from S3.
# Private document assets are requested through the API client and are
# rate limited there instead.

# One retrying HTTP session, created at import time and reused for every
# public asset fetch.
_asset_session = requests_retry_session()

# Token bucket: bursts of up to 100 requests, refilled at 15 per minute.
_asset_storage = token_bucket.MemoryStorage()
_asset_limiter = token_bucket.Limiter(
    storage=_asset_storage,
    capacity=100,  # largest burst allowed
    rate=15 / 60,  # tokens per second (0.25/sec = 15/min)
)


def _asset_get(url, **kwargs):
    """GET ``url`` on the shared asset session once the limiter allows it.

    Blocks, polling every 0.1 seconds, until ``_asset_limiter`` grants a
    token for the ``"asset"`` bucket. Extra keyword arguments are passed
    through to the session's ``get`` call, whose response is returned.
    """
    while True:
        if _asset_limiter.consume("asset"):
            break
        time.sleep(0.1)
    return _asset_session.get(url, **kwargs)


class Document(BaseAPIObject):
"""A single DocumentCloud document"""
Expand Down Expand Up @@ -164,12 +183,15 @@ def _get_url(self, url, fmt=None):

if base_netloc == url_netloc:
# if the url host is the same as the base api host,
# sent the request with the client in order to include
# send the request with the client in order to include
# authentication credentials
response = self._client.get(url, full_url=True)
else:
response = requests_retry_session().get(
url, headers={"User-Agent": "python-documentcloud2"}
response = _asset_get(
url,
headers={
"User-Agent": self._client.session.headers.get("User-Agent", "")
},
)
if fmt == "text":
return response.content.decode("utf8")
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
"pyyaml",
"fastjsonschema",
"python-squarelet",
"token-bucket"
),
extras_require={
"dev": [
Expand Down