88import logging
99from datetime import datetime , timedelta
1010from collections import defaultdict , Counter
11- from typing import List , Optional , Dict , Set , NewType , Tuple , Iterable , Any , Generic , TypeVar , TypedDict , Union , Type
11+ from typing import (
12+ List ,
13+ Optional ,
14+ Dict ,
15+ Set ,
16+ NewType ,
17+ Tuple ,
18+ Iterable ,
19+ Any ,
20+ Generic ,
21+ TypeVar ,
22+ TypedDict ,
23+ Union ,
24+ Type ,
25+ Mapping ,
26+ cast ,
27+ )
1228
1329import dateparser
1430from typing_extensions import NotRequired
@@ -24,7 +40,7 @@ def hash_mod(text, divisor):
2440 returns the module of dividing text md5 hash over given divisor
2541 """
2642 if isinstance (text , str ):
27- text = text .encode (' utf8' )
43+ text = text .encode (" utf8" )
2844 md5 = hashlib .md5 ()
2945 md5 .update (text )
3046 digest = md5 .hexdigest ()
@@ -130,9 +146,10 @@ def process_item(self, item: ITEMTYPE, input_source: InputSource):
130146
131147 self .processed_count += 1
132148
133- def get_output_slot_for_item (self , item : ITEMTYPE ) -> Union [Slot , None ]:
149+ def get_output_slot_for_item (self , item : ITEMTYPE , input_field = "id" ) -> Union [Slot , None ]:
150+ input_value = cast (Mapping [str , Any ], item )[input_field ]
134151 return (
135- Slot (str (hash_mod (item [ "id" ] , self .parallel_outputs )))
152+ Slot (str (hash_mod (input_value , self .parallel_outputs )))
136153 if self .output_slot is None and self .parallel_outputs > 1
137154 else self .output_slot
138155 )
@@ -391,7 +408,7 @@ def add_argparser_options(self):
391408 super ().add_argparser_options ()
392409 self .argparser .add_argument (
393410 "target" ,
394- help = "Which spiders to target. In format <type>:<name>, where type can be spider, canonical or class."
411+ help = "Which spiders to target. In format <type>:<name>, where type can be spider, canonical or class." ,
395412 )
396413
397414 def parse_args (self ):
@@ -407,9 +424,14 @@ def get_new_inputs(self) -> Iterable[Tuple[InputSource, Tuple[JobDict, SpiderNam
407424 for spidername in self .spider_loader .list ():
408425 canonical_name = self .get_canonical_spidername (SpiderName (spidername ))
409426 spidercls = self .spider_loader .load (spidername )
410- if self ._target_type == "spider" and spidername == self ._target_name or \
411- self ._target_type == "canonical" and canonical_name == self ._target_name or \
412- self ._target_type == "class" and self ._target_name in [c .__name__ for c in spidercls .mro ()]:
427+ if (
428+ self ._target_type == "spider"
429+ and spidername == self ._target_name
430+ or self ._target_type == "canonical"
431+ and canonical_name == self ._target_name
432+ or self ._target_type == "class"
433+ and self ._target_name in [c .__name__ for c in spidercls .mro ()]
434+ ):
413435 for jdict in self .get_jobs (
414436 spider = spidername , state = ["finished" ], meta = ["spider_args" ], lacks_tag = self .CONSUMED_TAG
415437 ):
0 commit comments