Skip to content

Commit fe819a2

Browse files
authored
Merge pull request #260 from scrapy-plugins/poetless
Do not use poet for request fingerprinting unless configured
2 parents 2df87b5 + d4d5048 commit fe819a2

File tree

2 files changed: 57 additions and 20 deletions.

scrapy_zyte_api/_request_fingerprinter.py

Lines changed: 27 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -32,32 +32,44 @@ class ScrapyZyteAPIRequestFingerprinter:
3232
def from_crawler(cls, crawler):
3333
return cls(crawler)
3434

35-
def __init__(self, crawler):
36-
settings = crawler.settings
35+
@staticmethod
36+
def _poet_is_configured(settings):
3737
try:
38-
from scrapy_poet import ScrapyPoetRequestFingerprinter
38+
from scrapy_poet import InjectionMiddleware
3939
except ImportError:
40-
self._has_poet = False
41-
RequestFingerprinter = ScrapyRequestFingerprinter
40+
return False
41+
for k, v in settings.get("DOWNLOADER_MIDDLEWARES", {}).items():
42+
if issubclass(load_object(k), InjectionMiddleware):
43+
return v is not None
44+
return False
45+
46+
def __init__(self, crawler):
47+
settings = crawler.settings
48+
self._fallback_fingerprinter_is_poets = poet_is_configured = (
49+
self._poet_is_configured(settings)
50+
)
51+
if poet_is_configured:
52+
from scrapy_poet import (
53+
ScrapyPoetRequestFingerprinter as DefaultFallbackRequestFingerprinter,
54+
)
4255
else:
43-
self._has_poet = True
44-
RequestFingerprinter = ScrapyPoetRequestFingerprinter
56+
DefaultFallbackRequestFingerprinter = ScrapyRequestFingerprinter
4557
self._fallback_request_fingerprinter = _build_from_crawler(
4658
load_object(
4759
settings.get(
4860
"ZYTE_API_FALLBACK_REQUEST_FINGERPRINTER_CLASS",
49-
RequestFingerprinter,
61+
DefaultFallbackRequestFingerprinter,
5062
)
5163
),
5264
crawler,
5365
)
54-
if self._has_poet and not isinstance(
55-
self._fallback_request_fingerprinter, cast(type, RequestFingerprinter)
66+
if poet_is_configured and not isinstance(
67+
self._fallback_request_fingerprinter,
68+
cast(type, DefaultFallbackRequestFingerprinter),
5669
):
5770
logger.warning(
58-
f"You have scrapy-poet installed, but your custom value "
59-
f"for the ZYTE_API_FALLBACK_REQUEST_FINGERPRINTER_CLASS "
60-
f"setting "
71+
f"scrapy-poet is enabled, but your custom value for the "
72+
f"ZYTE_API_FALLBACK_REQUEST_FINGERPRINTER_CLASS setting "
6173
f"({settings['ZYTE_API_FALLBACK_REQUEST_FINGERPRINTER_CLASS']!r})"
6274
f" does not point to a subclass of "
6375
f"scrapy_poet.ScrapyPoetRequestFingerprinter. For request "
@@ -69,7 +81,7 @@ def __init__(self, crawler):
6981
f"to the SCRAPY_POET_REQUEST_FINGERPRINTER_BASE_CLASS "
7082
f"setting instead."
7183
)
72-
self._has_poet = False
84+
self._fallback_fingerprinter_is_poets = False
7385
self._cache: "WeakKeyDictionary[Request, bytes]" = WeakKeyDictionary()
7486
self._param_parser = _ParamParser(crawler, cookies_enabled=False)
7587
self._crawler = crawler
@@ -124,7 +136,7 @@ def fingerprint(self, request):
124136
api_params.setdefault("sessionContext", session_pool)
125137
self._normalize_params(api_params)
126138
fingerprint = json.dumps(api_params, sort_keys=True).encode()
127-
if self._has_poet:
139+
if self._fallback_fingerprinter_is_poets:
128140
deps_key = self._fallback_request_fingerprinter.get_deps_key(
129141
request
130142
)

tests/test_request_fingerprinter.py

Lines changed: 30 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -57,17 +57,42 @@ def fingerprint(self, request):
5757
assert fingerprinter.fingerprint(request) == b"foo"
5858
request = Request("https://example.com", meta={"zyte_api": True})
5959
assert fingerprinter.fingerprint(request) != b"foo"
60-
try:
61-
import scrapy_poet # noqa: F401
62-
except ImportError:
63-
pass
64-
else:
60+
if scrapy_poet is not None:
6561
assert (
6662
"does not point to a subclass of scrapy_poet.ScrapyPoetRequestFingerprinter"
6763
in caplog.text
6864
)
6965

7066

67+
@pytest.mark.skipif(scrapy_poet is None, reason="scrapy-poet is not installed")
68+
@ensureDeferred
69+
async def test_poet_installed_but_disabled(caplog):
70+
"""If the scrapy-poet package is installed but its main middleware,
71+
InjectionMiddleware, is not set in DOWNLOADER_MIDDLEWARES, do not try to
72+
use its API for request fingerprinting."""
73+
from web_poet import WebPage
74+
75+
no_deps_request = Request("https://example.com")
76+
77+
class DepsSpider(Spider):
78+
name = "deps"
79+
80+
def __init__(self, *args, **kwargs):
81+
self.deps_request = Request("https://example.com", callback=self.parse_deps)
82+
83+
def parse_deps(self, response, a: WebPage):
84+
pass
85+
86+
crawler = await get_crawler(
87+
spider_cls=DepsSpider, settings={"ZYTE_API_TRANSPARENT_MODE": True}, poet=False
88+
)
89+
fingerprinter = crawler.request_fingerprinter
90+
91+
no_deps_fp = fingerprinter.fingerprint(no_deps_request)
92+
deps_fp = fingerprinter.fingerprint(crawler.spider.deps_request)
93+
assert no_deps_fp == deps_fp
94+
95+
7196
@ensureDeferred
7297
async def test_fallback_default():
7398
crawler = await get_crawler()

0 commit comments

Comments
 (0)