diff --git a/airbyte-integrations/connectors/source-azure-blob-storage/metadata.yaml b/airbyte-integrations/connectors/source-azure-blob-storage/metadata.yaml index e34754827daa..cf24159cb8d8 100644 --- a/airbyte-integrations/connectors/source-azure-blob-storage/metadata.yaml +++ b/airbyte-integrations/connectors/source-azure-blob-storage/metadata.yaml @@ -4,6 +4,12 @@ data: - "${azure_blob_storage_account_name}.blob.core.windows.net" - "${azure_blob_storage_endpoint}" - "login.microsoftonline.com/${credentials.tenant_id}/oauth2/v2.0/token" + resourceRequirements: + jobSpecific: + - jobType: check_connection + resourceRequirements: + memory_limit: 4096Mi + memory_request: 4096Mi ab_internal: ql: 400 sl: 300 @@ -12,7 +18,7 @@ data: connectorSubtype: file connectorType: source definitionId: fdaaba68-4875-4ed9-8fcd-4ae1e0a25093 - dockerImageTag: 0.8.5 + dockerImageTag: 0.8.6 dockerRepository: airbyte/source-azure-blob-storage documentationUrl: https://docs.airbyte.com/integrations/sources/azure-blob-storage externalDocumentationUrls: diff --git a/airbyte-integrations/connectors/source-azure-blob-storage/pyproject.toml b/airbyte-integrations/connectors/source-azure-blob-storage/pyproject.toml index d6f7cb576653..754378d1c74f 100644 --- a/airbyte-integrations/connectors/source-azure-blob-storage/pyproject.toml +++ b/airbyte-integrations/connectors/source-azure-blob-storage/pyproject.toml @@ -3,7 +3,7 @@ requires = [ "poetry-core>=1.0.0",] build-backend = "poetry.core.masonry.api" [tool.poetry] -version = "0.8.5" +version = "0.8.6" name = "source-azure-blob-storage" description = "Source implementation for Azure Blob Storage." authors = [ "Airbyte ",] diff --git a/airbyte-integrations/connectors/source-azure-blob-storage/source_azure_blob_storage/spec.py b/airbyte-integrations/connectors/source-azure-blob-storage/source_azure_blob_storage/spec.py index a70acf8ce11a..6d38e3e6a395 100644 --- a/airbyte-integrations/connectors/source-azure-blob-storage/source_azure_blob_storage/spec.py +++ b/airbyte-integrations/connectors/source-azure-blob-storage/source_azure_blob_storage/spec.py @@ -3,10 +3,11 @@ # +from datetime import datetime from typing import Any, Dict, Literal, Optional, Union import dpath.util -from pydantic.v1 import AnyUrl, BaseModel, Field +from pydantic.v1 import AnyUrl, BaseModel, Field, validator from airbyte_cdk import OneOfOptionConfig from airbyte_cdk.sources.file_based.config.abstract_file_based_spec import AbstractFileBasedSpec, DeliverRawFiles, DeliverRecords @@ -107,6 +108,17 @@ def documentation_url(cls) -> AnyUrl: order=11, ) + end_date: Optional[str] = Field( + title="End Date", + description="UTC date and time in the format 2017-01-25T00:00:00.000000Z. Any file modified after this date will not be replicated.", + examples=["2021-01-01T00:00:00.000000Z"], + format="date-time", + pattern="^[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}.[0-9]{6}Z$", + pattern_descriptor="YYYY-MM-DDTHH:mm:ss.SSSSSSZ", + order=12, + default=None, + ) + delivery_method: Union[DeliverRecords, DeliverRawFiles] = Field( title="Delivery Method", discriminator="delivery_type", @@ -118,6 +130,19 @@ def documentation_url(cls) -> AnyUrl: airbyte_hidden=True, ) + @validator("end_date", allow_reuse=True) + def validate_end_date(cls, value: Optional[str], values: Dict[str, Any]) -> Optional[str]: + if value is None: + return None + start_date = values.get("start_date") + if start_date is None: + return value + start = datetime.strptime(start_date, "%Y-%m-%dT%H:%M:%S.%fZ") + end = datetime.strptime(value, "%Y-%m-%dT%H:%M:%S.%fZ") + if end < start: + raise ValueError("End date must be after start date") + return value + @classmethod def schema(cls, *args: Any, **kwargs: Any) -> Dict[str, Any]: """ diff --git a/airbyte-integrations/connectors/source-azure-blob-storage/source_azure_blob_storage/stream_reader.py b/airbyte-integrations/connectors/source-azure-blob-storage/source_azure_blob_storage/stream_reader.py index 3ff6325f40ab..fe7a7e7d76ee 100644 --- a/airbyte-integrations/connectors/source-azure-blob-storage/source_azure_blob_storage/stream_reader.py +++ b/airbyte-integrations/connectors/source-azure-blob-storage/source_azure_blob_storage/stream_reader.py @@ -2,6 +2,7 @@ import logging +from datetime import datetime from io import IOBase from typing import Any, Iterable, List, Mapping, MutableMapping, Optional, Union @@ -150,16 +151,26 @@ def get_matching_files( ) -> Iterable[AzureBlobStorageUploadableRemoteFile]: prefixes = [prefix] if prefix else self.get_prefixes_from_globs(globs) prefixes = prefixes or [None] + + end_date = None + if self.config and self.config.end_date: + end_date = datetime.strptime(self.config.end_date, self.DATE_TIME_FORMAT) + try: for prefix in prefixes: for blob in self.azure_container_client.list_blobs(name_starts_with=prefix): + last_modified = blob.last_modified.astimezone(pytz.utc).replace(tzinfo=None) + + if end_date and last_modified > end_date: + continue + remote_file = AzureBlobStorageUploadableRemoteFile( uri=blob.name, - last_modified=blob.last_modified.astimezone(pytz.utc).replace(tzinfo=None), + last_modified=last_modified, blob_client=self.azure_blob_service_client, blob_properties=blob, created_at=blob.creation_time.astimezone(pytz.utc).replace(tzinfo=None).strftime("%Y-%m-%dT%H:%M:%S.%fZ"), - updated_at=blob.last_modified.astimezone(pytz.utc).replace(tzinfo=None).strftime("%Y-%m-%dT%H:%M:%S.%fZ"), + updated_at=last_modified.strftime("%Y-%m-%dT%H:%M:%S.%fZ"), ) yield from self.filter_files_by_globs_and_start_date([remote_file], globs) except ResourceNotFoundError as e: diff --git a/docs/integrations/sources/azure-blob-storage.md b/docs/integrations/sources/azure-blob-storage.md index 0ee70e4edcd3..181129c7a8c7 100644 --- a/docs/integrations/sources/azure-blob-storage.md +++ b/docs/integrations/sources/azure-blob-storage.md @@ -98,6 +98,7 @@ In the `Manage / Certificates & secrets` panel, click `Client Secrets` and creat 5. Optionally, enter the **Globs** which dictates which files to be synced. This is a regular expression that allows Airbyte to pattern match the specific files to replicate. If you are replicating all the files within your bucket, use `**` as the pattern. For more precise pattern matching options, refer to the [Path Patterns section](#path-patterns) below. 10. (Optional) Enter the endpoint to use for the data replication. 11. (Optional) Enter the desired start date from which to begin replicating data. +12. (Optional) Enter the desired end date to stop replicating data. This is useful for breaking down large backfills into smaller date ranges. @@ -123,6 +124,7 @@ In the `Manage / Certificates & secrets` panel, click `Client Secrets` and creat 5. Optionally, enter the **Globs** which dictates which files to be synced. This is a regular expression that allows Airbyte to pattern match the specific files to replicate. If you are replicating all the files within your bucket, use `**` as the pattern. For more precise pattern matching options, refer to the [Path Patterns section](#path-patterns) below. 10. (Optional) Enter the endpoint to use for the data replication. 11. (Optional) Enter the desired start date from which to begin replicating data. +12. (Optional) Enter the desired end date to stop replicating data. This is useful for breaking down large backfills into smaller date ranges. ## Supported sync modes @@ -301,6 +303,7 @@ The Azure Blob Storage connector should not encounter any [Microsoft API limitat | Version | Date | Pull Request | Subject | |:-----------|:-----------|:---------------------------------------------------------|:---------------------------------------------------------------------------------------------| +| 0.8.6 | 2025-11-28 | [70246](https://github.com/airbytehq/airbyte/pull/70246) | Increase memory for check operation to 4096Mi | | 0.8.5 | 2025-11-25 | [69910](https://github.com/airbytehq/airbyte/pull/69910) | Update dependencies | | 0.8.4 | 2025-11-18 | [69579](https://github.com/airbytehq/airbyte/pull/69579) | Update dependencies | | 0.8.3 | 2025-11-11 | [69269](https://github.com/airbytehq/airbyte/pull/69269) | Update dependencies |