Skip to content

Commit 7efdb44

Browse files
committed
issuer: allow to use a different item field for selecting slot
1 parent 74218ec commit 7efdb44

File tree

1 file changed

+30
-8
lines changed

1 file changed

+30
-8
lines changed

shub_workflow/issuer/__init__.py

Lines changed: 30 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,23 @@
88
import logging
99
from datetime import datetime, timedelta
1010
from collections import defaultdict, Counter
11-
from typing import List, Optional, Dict, Set, NewType, Tuple, Iterable, Any, Generic, TypeVar, TypedDict, Union, Type
11+
from typing import (
12+
List,
13+
Optional,
14+
Dict,
15+
Set,
16+
NewType,
17+
Tuple,
18+
Iterable,
19+
Any,
20+
Generic,
21+
TypeVar,
22+
TypedDict,
23+
Union,
24+
Type,
25+
Mapping,
26+
cast,
27+
)
1228

1329
import dateparser
1430
from typing_extensions import NotRequired
@@ -24,7 +40,7 @@ def hash_mod(text, divisor):
2440
returns the module of dividing text md5 hash over given divisor
2541
"""
2642
if isinstance(text, str):
27-
text = text.encode('utf8')
43+
text = text.encode("utf8")
2844
md5 = hashlib.md5()
2945
md5.update(text)
3046
digest = md5.hexdigest()
@@ -130,9 +146,10 @@ def process_item(self, item: ITEMTYPE, input_source: InputSource):
130146

131147
self.processed_count += 1
132148

133-
def get_output_slot_for_item(self, item: ITEMTYPE) -> Union[Slot, None]:
149+
def get_output_slot_for_item(self, item: ITEMTYPE, input_field="id") -> Union[Slot, None]:
150+
input_value = cast(Mapping[str, Any], item)[input_field]
134151
return (
135-
Slot(str(hash_mod(item["id"], self.parallel_outputs)))
152+
Slot(str(hash_mod(input_value, self.parallel_outputs)))
136153
if self.output_slot is None and self.parallel_outputs > 1
137154
else self.output_slot
138155
)
@@ -391,7 +408,7 @@ def add_argparser_options(self):
391408
super().add_argparser_options()
392409
self.argparser.add_argument(
393410
"target",
394-
help="Which spiders to target. In format <type>:<name>, where type can be spider, canonical or class."
411+
help="Which spiders to target. In format <type>:<name>, where type can be spider, canonical or class.",
395412
)
396413

397414
def parse_args(self):
@@ -407,9 +424,14 @@ def get_new_inputs(self) -> Iterable[Tuple[InputSource, Tuple[JobDict, SpiderNam
407424
for spidername in self.spider_loader.list():
408425
canonical_name = self.get_canonical_spidername(SpiderName(spidername))
409426
spidercls = self.spider_loader.load(spidername)
410-
if self._target_type == "spider" and spidername == self._target_name or \
411-
self._target_type == "canonical" and canonical_name == self._target_name or \
412-
self._target_type == "class" and self._target_name in [c.__name__ for c in spidercls.mro()]:
427+
if (
428+
self._target_type == "spider"
429+
and spidername == self._target_name
430+
or self._target_type == "canonical"
431+
and canonical_name == self._target_name
432+
or self._target_type == "class"
433+
and self._target_name in [c.__name__ for c in spidercls.mro()]
434+
):
413435
for jdict in self.get_jobs(
414436
spider=spidername, state=["finished"], meta=["spider_args"], lacks_tag=self.CONSUMED_TAG
415437
):

0 commit comments

Comments
 (0)