Skip to content

Commit 1d0890a

Browse files
authored
Support upcoming Scrapy changes (#256)
1 parent da21446 commit 1d0890a

File tree

9 files changed

+224
-176
lines changed

9 files changed

+224
-176
lines changed

CHANGES.rst

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,27 @@
11
Changes
22
=======
33

4+
Unreleased
5+
----------
6+
7+
- Extended :doc:`Scrapy <scrapy:index>` support to :ref:`2.13.0+
8+
<scrapy:release-2.13.0>`.
9+
10+
- Switched the minimum required version of :doc:`python-zyte-api
11+
<python-zyte-api:index>` from ``0.5.1`` to ``0.6.0``.
12+
13+
- Fixed the removal of default request headers (``Accept``,
14+
``Accept-Encoding``, ``Accept-Language``, and ``User-Agent``) not working
15+
for request copies (e.g. redirects or retries).
16+
17+
- The default value of the :setting:`ZYTE_API_FALLBACK_HTTP_HANDLER` and
18+
:setting:`ZYTE_API_FALLBACK_HTTPS_HANDLER` settings is now as expected even
19+
when not using the add-on.
20+
21+
- The scrapy-zyte-api download handlers now support fallback download
22+
handlers that do not define a ``close()`` method.
23+
24+
425
0.29.0 (2025-03-20)
526
-------------------
627

docs/reference/settings.rst

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,26 @@ Default: ``False``
146146
See :ref:`request-automatic`.
147147

148148

149+
.. setting:: ZYTE_API_FALLBACK_HTTP_HANDLER
150+
151+
ZYTE_API_FALLBACK_HTTP_HANDLER
152+
==============================
153+
154+
Default: :setting:`DOWNLOAD_HANDLERS["http"] <scrapy:DOWNLOAD_HANDLERS>`
155+
156+
Download handler to use for HTTP requests that do not go through Zyte API.
157+
158+
159+
.. setting:: ZYTE_API_FALLBACK_HTTPS_HANDLER
160+
161+
ZYTE_API_FALLBACK_HTTPS_HANDLER
162+
===============================
163+
164+
Default: :setting:`DOWNLOAD_HANDLERS["https"] <scrapy:DOWNLOAD_HANDLERS>`
165+
166+
Download handler to use for HTTPS requests that do not go through Zyte API.
167+
168+
149169
.. setting:: ZYTE_API_FALLBACK_REQUEST_FINGERPRINTER_CLASS
150170

151171
ZYTE_API_FALLBACK_REQUEST_FINGERPRINTER_CLASS

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ requires-python = ">=3.9"
2727
dependencies = [
2828
"packaging>=20.0",
2929
"scrapy>=2.0.1",
30-
"zyte-api>=0.5.1",
30+
"zyte-api>=0.6.0",
3131
]
3232

3333
[project.optional-dependencies]

scrapy_zyte_api/_middlewares.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -176,9 +176,19 @@ def __init__(self, crawler):
176176
def _get_header_set(request):
177177
return {header.strip().lower() for header in request.headers}
178178

179-
def process_start_requests(self, start_requests, spider):
179+
async def process_start(self, start):
180180
# Mark start requests and reports to the downloader middleware the
181181
# number of them once all have been processed.
182+
count = 0
183+
async for item_or_request in start:
184+
if isinstance(item_or_request, Request):
185+
count += 1
186+
item_or_request.meta["is_start_request"] = True
187+
self._process_output_request(item_or_request, None)
188+
yield item_or_request
189+
self._send_signal(_start_requests_processed, count=count)
190+
191+
def process_start_requests(self, start_requests, spider):
182192
count = 0
183193
for item_or_request in start_requests:
184194
if isinstance(item_or_request, Request):
@@ -189,7 +199,8 @@ def process_start_requests(self, start_requests, spider):
189199
self._send_signal(_start_requests_processed, count=count)
190200

191201
def _process_output_request(self, request, spider):
192-
request.meta["_pre_mw_headers"] = self._get_header_set(request)
202+
if "_pre_mw_headers" not in request.meta:
203+
request.meta["_pre_mw_headers"] = self._get_header_set(request)
193204
self.slot_request(request, spider)
194205

195206
def _process_output_item_or_request(self, item_or_request, spider):

scrapy_zyte_api/handler.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -323,7 +323,7 @@ def _truncate_params(self, params):
323323

324324
@inlineCallbacks
325325
def close(self) -> Generator:
326-
if self._fallback_handler:
326+
if self._fallback_handler and hasattr(self._fallback_handler, "close"):
327327
yield self._fallback_handler.close()
328328
yield deferred_from_coro(self._close())
329329

@@ -347,7 +347,10 @@ def __init__(
347347
):
348348
super().__init__(settings, crawler, client)
349349
self._fallback_handler = self._create_handler(
350-
settings.get("ZYTE_API_FALLBACK_HTTP_HANDLER")
350+
settings.get(
351+
"ZYTE_API_FALLBACK_HTTP_HANDLER",
352+
settings.getwithbase("DOWNLOAD_HANDLERS")["http"],
353+
)
351354
)
352355

353356

@@ -357,5 +360,8 @@ def __init__(
357360
):
358361
super().__init__(settings, crawler, client)
359362
self._fallback_handler = self._create_handler(
360-
settings.get("ZYTE_API_FALLBACK_HTTPS_HANDLER")
363+
settings.get(
364+
"ZYTE_API_FALLBACK_HTTPS_HANDLER",
365+
settings.getwithbase("DOWNLOAD_HANDLERS")["https"],
366+
)
361367
)

scrapy_zyte_api/utils.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,9 @@
1616
_SCRAPY_2_4_0 = Version("2.4.0")
1717
_SCRAPY_2_5_0 = Version("2.5.0")
1818
_SCRAPY_2_6_0 = Version("2.6.0")
19+
_SCRAPY_2_10_0 = Version("2.10.0")
1920
_SCRAPY_2_12_0 = Version("2.12.0")
21+
_SCRAPY_2_13_0 = Version("2.13.0")
2022

2123
# Need to install an asyncio reactor before download handler imports to work
2224
# around:
@@ -25,6 +27,8 @@
2527
# https://github.com/scrapy/scrapy/commit/e4bdd1cb958b7d89b86ea66f0af1cec2d91a6d44
2628
_NEEDS_EARLY_REACTOR = _SCRAPY_2_4_0 <= _SCRAPY_VERSION < _SCRAPY_2_6_0
2729

30+
_ADDON_SUPPORT = _SCRAPY_VERSION >= _SCRAPY_2_10_0
31+
_ASYNC_START_SUPPORT = _SCRAPY_VERSION >= _SCRAPY_2_13_0
2832
_AUTOTHROTTLE_DONT_ADJUST_DELAY_SUPPORT = _SCRAPY_VERSION >= _SCRAPY_2_12_0
2933
_DOWNLOAD_NEEDS_SPIDER = _SCRAPY_VERSION < _SCRAPY_2_6_0
3034
_RAW_CLASS_SETTING_SUPPORT = _SCRAPY_VERSION >= _SCRAPY_2_4_0

tests/test_addon.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from pytest_twisted import ensureDeferred
55
from scrapy import Request
66
from scrapy.core.downloader.handlers.http import HTTP10DownloadHandler
7+
from scrapy.settings.default_settings import TWISTED_REACTOR
78
from scrapy.utils.test import get_crawler
89

910
from scrapy_zyte_api import (
@@ -154,11 +155,14 @@ def _test_setting_changes(initial_settings, expected_settings):
154155
ScrapyZyteAPISpiderMiddleware: 100,
155156
ScrapyZyteAPIRefererSpiderMiddleware: 1000,
156157
},
157-
"TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
158158
"ZYTE_API_FALLBACK_HTTPS_HANDLER": "scrapy.core.downloader.handlers.http.HTTPDownloadHandler",
159159
"ZYTE_API_FALLBACK_HTTP_HANDLER": "scrapy.core.downloader.handlers.http.HTTPDownloadHandler",
160160
"ZYTE_API_TRANSPARENT_MODE": True,
161161
}
162+
if TWISTED_REACTOR != "twisted.internet.asyncioreactor.AsyncioSelectorReactor":
163+
BASE_EXPECTED["TWISTED_REACTOR"] = (
164+
"twisted.internet.asyncioreactor.AsyncioSelectorReactor"
165+
)
162166

163167

164168
@pytest.mark.skipif(

0 commit comments

Comments (0)