diff --git a/pyiceberg/catalog/rest/auth.py b/pyiceberg/catalog/rest/auth.py index 602074282c..9889bc8004 100644 --- a/pyiceberg/catalog/rest/auth.py +++ b/pyiceberg/catalog/rest/auth.py @@ -123,7 +123,9 @@ def _refresh_token(self) -> None: if self._credential is not None: self._token = self._fetch_access_token(self._credential) - def auth_header(self) -> str: + def auth_header(self) -> str | None: + if self._token is None: + return None return f"Bearer {self._token}" diff --git a/pyiceberg/io/fsspec.py b/pyiceberg/io/fsspec.py index 7749268ff5..4768a914c5 100644 --- a/pyiceberg/io/fsspec.py +++ b/pyiceberg/io/fsspec.py @@ -97,7 +97,7 @@ logger = logging.getLogger(__name__) if TYPE_CHECKING: - from botocore.awsrequest import AWSRequest + from botocore.awsrequest import AWSPreparedRequest, AWSRequest class S3RequestSigner(abc.ABC): @@ -122,7 +122,7 @@ def __init__(self, properties: Properties) -> None: super().__init__(properties) self._session = requests.Session() - def __call__(self, request: "AWSRequest", **_: Any) -> None: + def __call__(self, request: "AWSPreparedRequest", **_: Any) -> None: signer_url = self.properties.get(S3_SIGNER_URI, self.properties[URI]).rstrip("/") # type: ignore signer_endpoint = self.properties.get(S3_SIGNER_ENDPOINT, S3_SIGNER_ENDPOINT_DEFAULT) @@ -154,7 +154,9 @@ def __call__(self, request: "AWSRequest", **_: Any) -> None: raise SignError(f"Failed to sign request {response.status_code}: {signer_body}") from e for key, value in response_json["headers"].items(): - request.headers.add_header(key, ", ".join(value)) + # Use dict-style assignment compatible with both AWSPreparedRequest (before-send) + # and AWSRequest (before-sign), and to replace rather than append duplicate headers. + request.headers[key] = ", ".join(value) request.url = response_json["uri"] @@ -183,9 +185,13 @@ def _s3(properties: Properties) -> AbstractFileSystem: logger.info("Loading signer %s", signer) if signer_cls := SIGNERS.get(signer): signer = signer_cls(properties) - register_events["before-sign.s3"] = signer + # Register on before-send (not before-sign) so the handler fires even when + # signature_version=UNSIGNED — botocore short-circuits RequestSigner.sign() + # before emitting before-sign when UNSIGNED is set, so the signer would + # never be called. before-send fires unconditionally, after signing. + register_events["before-send.s3"] = signer - # Disable the AWS Signer + # Disable botocore's own SigV4 signing; the REST signer adds its own headers. from botocore import UNSIGNED config_kwargs["signature_version"] = UNSIGNED diff --git a/tests/catalog/test_rest_auth.py b/tests/catalog/test_rest_auth.py index ae5d40f5aa..a3f2097290 100644 --- a/tests/catalog/test_rest_auth.py +++ b/tests/catalog/test_rest_auth.py @@ -22,7 +22,14 @@ import requests from requests_mock import Mocker -from pyiceberg.catalog.rest.auth import AuthManagerAdapter, BasicAuthManager, EntraAuthManager, GoogleAuthManager, NoopAuthManager +from pyiceberg.catalog.rest.auth import ( + AuthManagerAdapter, + BasicAuthManager, + EntraAuthManager, + GoogleAuthManager, + LegacyOAuth2AuthManager, + NoopAuthManager, +) TEST_URI = "https://iceberg-test-catalog/" GOOGLE_CREDS_URI = "https://oauth2.googleapis.com/token" @@ -246,6 +253,19 @@ def test_entra_auth_manager_token_failure(mock_default_cred: MagicMock, rest_moc with pytest.raises(Exception, match="Failed to acquire token"): session.get(TEST_URI) - # Verify no requests were made with a blank/missing auth header - history = rest_mock.request_history - assert len(history) == 0 + +def test_legacy_oauth2_auth_header_returns_none_when_no_token() -> None: + """LegacyOAuth2AuthManager.auth_header() must return None (not 'Bearer None') when no + credential or initial_token is provided. Returning 'Bearer None' caused S3V4RestSigner + to forward an invalid Authorization header to the catalog signer endpoint, resulting in + a 403 that was silently swallowed and the S3 request going unsigned.""" + session = requests.Session() + auth_manager = LegacyOAuth2AuthManager(session=session, credential=None, initial_token=None) + assert auth_manager.auth_header() is None + + +def test_legacy_oauth2_auth_header_returns_bearer_token_when_set() -> None: + """LegacyOAuth2AuthManager.auth_header() returns a proper Bearer token when one is present.""" + session = requests.Session() + auth_manager = LegacyOAuth2AuthManager(session=session, credential=None, initial_token="my-token") + assert auth_manager.auth_header() == "Bearer my-token" diff --git a/tests/io/test_fsspec.py b/tests/io/test_fsspec.py index 8739a5964d..01c916f067 100644 --- a/tests/io/test_fsspec.py +++ b/tests/io/test_fsspec.py @@ -23,7 +23,7 @@ from unittest import mock import pytest -from botocore.awsrequest import AWSRequest +from botocore.awsrequest import AWSPreparedRequest, AWSRequest from fsspec.implementations.local import LocalFileSystem from fsspec.spec import AbstractFileSystem from requests_mock import Mocker @@ -1105,3 +1105,70 @@ def auth_header(self) -> str: assert requests_mock.last_request is not None assert requests_mock.last_request.headers["Authorization"] == "Bearer via-manager" assert request.url == new_uri + + +def test_s3v4_rest_signer_with_prepared_request(requests_mock: Mocker) -> None: + """S3V4RestSigner must work with AWSPreparedRequest (the type passed by botocore's + before-send event) so that signing works in both sync and async (aiobotocore) paths.""" + new_uri = "https://other-bucket/data/file.parquet" + requests_mock.post( + f"{TEST_URI}/v1/aws/s3/sign", + json={ + "uri": new_uri, + "headers": { + "Authorization": ["AWS4-HMAC-SHA256 Credential=ASIA.../s3/aws4_request, Signature=abc"], + "X-Amz-Date": ["20221017T102940Z"], + }, + "extensions": {}, + }, + status_code=200, + ) + + prepared = AWSPreparedRequest( + method="PUT", + url="https://bucket/data/file.parquet", + headers={"User-Agent": "botocore/1.43"}, + body=b"", + stream_output=False, + ) + prepared.context = {"client_region": "us-east-1"} + + signer = S3V4RestSigner(properties={"token": "abc", "uri": TEST_URI}) + signer(prepared) + + assert prepared.url == new_uri + assert prepared.headers["Authorization"] == "AWS4-HMAC-SHA256 Credential=ASIA.../s3/aws4_request, Signature=abc" + assert prepared.headers["X-Amz-Date"] == "20221017T102940Z" + + +def test_s3_signer_registered_on_before_send_event() -> None: + """S3V4RestSigner must be registered on the before-send.s3 event, not before-sign.s3. + botocore short-circuits RequestSigner.sign() before emitting before-sign when + signature_version=UNSIGNED is set, so before-sign never fires and the signer is + never called — leaving the S3 request unsigned and causing AccessDenied.""" + from unittest.mock import MagicMock, patch + + properties = { + "uri": "https://catalog", + "s3.access-key-id": "key", + "s3.secret-access-key": "secret", + "s3.endpoint": "https://s3.example.com", + "s3.region": "us-east-1", + "s3.signer": "S3V4RestSigner", + } + + mock_fs = MagicMock() + mock_fs.s3.meta.events = MagicMock() + + with patch("pyiceberg.io.fsspec.S3FileSystem", return_value=mock_fs): + from pyiceberg.io.fsspec import _s3 + + _s3(properties) + + registered_calls = mock_fs.s3.meta.events.register_last.call_args_list + assert len(registered_calls) == 1 + event_name = registered_calls[0][0][0] + assert event_name == "before-send.s3", ( + f"Expected S3V4RestSigner to be registered on 'before-send.s3' but got '{event_name}'. " + "before-sign.s3 is never emitted when signature_version=UNSIGNED." + )