@@ -25,8 +25,11 @@ def reset(function):
2525 def inner (self , * args , ** kwargs ):
2626 if not isinstance (self .url , bool ):
2727 self .id_ = None
28+ self .data = None
29+ self .comment = []
30+ self .cursor = 0
2831 self .max_cursor = 0
29- self .list = None # 未处理的数据
32+ self .list = [] # 未处理的数据
3033 self .name = None # 账号昵称
3134 self .video_data = [] # 视频ID数据
3235 self .image_data = [] # 图集ID数据
@@ -48,8 +51,8 @@ def inner(self, *args, **kwargs):
4851 return inner
4952
5053
51- def retry (max_num = 3 ):
52- """发生错误时尝试重新执行"""
54+ def retry (max_num = 10 ):
55+ """发生错误时尝试重新执行,装饰的函数需要返回布尔值 """
5356
5457 def inner (function ):
5558 def execute (self , * args , ** kwargs ):
@@ -73,17 +76,27 @@ class UserData:
7376 r"^https://www\.douyin\.com/user/([a-zA-z0-9-_]+)(?:\?modal_id=([0-9]{19}))?.*$" ) # 账号链接
7477 works_link = re .compile (
7578 r"^https://www\.douyin\.com/(?:video|note)/([0-9]{19})$" ) # 作品链接
76- live_link = re .compile (r"^https://live\.douyin\.com/([0-9]+)$ " ) # 直播链接
# Live-stream link: captures the numeric room ID; a "?query" suffix is optional.
# The tail is an optional group anchored with "$" so the greedy ID group never
# has to give up its last digit to satisfy a mandatory trailing character.
live_link = re.compile(
    r"^https://live\.douyin\.com/([0-9]+)(?:\?.*)?$")
7780 live_api = "https://live.douyin.com/webcast/room/web/enter/" # 直播API
comment_api = "https://www.douyin.com/aweme/v1/web/comment/list/"  # comment list API
reply_api = "https://www.douyin.com/aweme/v1/web/comment/list/reply/"  # comment reply API
# Example parameters for the comment reply API:
#   "item_id": "7248064381664136486",
#   "comment_id": "7248089935747449604",
7887 clean = Cleaner () # 过滤非法字符
88+ max_comment = 256 # 评论字数限制
7989
8090 def __init__ (self , log : LoggerManager ):
8191 self .xb = XBogus () # 加密参数对象
8292 self .log = log # 日志记录对象
93+ self .data = None # 数据记录对象,仅评论抓取调用
8394 self ._cookie = False # 是否设置了Cookie
8495 self .id_ = None # sec_uid or item_ids
85- self .max_cursor = 0
86- self .list = None # 未处理的数据
96+ self .comment = [] # 评论数据
97+ self .cursor = 0 # 评论页使用
98+ self .max_cursor = 0 # 发布页和喜欢页使用
99+ self .list = [] # 未处理的数据
87100 self .name = None # 账号昵称
88101 self .video_data = [] # 视频ID数据
89102 self .image_data = [] # 图集ID数据
@@ -94,6 +107,7 @@ def __init__(self, log: LoggerManager):
94107 self ._url = None # 账号链接
95108 self ._api = None # 批量下载类型
96109 self ._proxies = None # 代理
110+ self ._time = None # 创建时间格式
97111
98112 @property
99113 def url (self ):
@@ -199,6 +213,24 @@ def proxies(self, value):
199213 "ftp" : None ,
200214 }
201215
216+ @property
217+ def time (self ):
218+ return self ._time
219+
220+ @time .setter
221+ def time (self , value ):
222+ if value :
223+ try :
224+ _ = time .strftime (value , time .localtime ())
225+ self ._time = value
226+ self .log .info (f"时间格式设置成功: { value } " , False )
227+ except ValueError :
228+ self .log .warning (f"时间格式错误: { value } ,将使用默认时间格式(年-月-日 时.分.秒)" )
229+ self ._time = "%Y-%m-%d %H.%M.%S"
230+ else :
231+ self .log .warning ("错误的时间格式,将使用默认时间格式(年-月-日 时.分.秒)" )
232+ self ._time = "%Y-%m-%d %H.%M.%S"
233+
202234 @retry (max_num = 5 )
203235 def get_id (self , value = "sec_user_id" , url = None ):
204236 """获取账号ID或者作品ID"""
@@ -252,26 +284,23 @@ def get_user_data(self):
252284 proxies = self .proxies ,
253285 timeout = 10 )
254286 except requests .exceptions .ReadTimeout :
255- print ("请求超时! " )
287+ self . log . error ("请求超时" )
256288 return False
257289 sleep ()
258290 if response .status_code == 200 :
259291 try :
260292 data = response .json ()
261293 except requests .exceptions .JSONDecodeError :
262- self .list = []
263294 self .log .error ("数据接口返回内容异常!疑似接口失效" , False )
264295 return False
265296 try :
266297 self .max_cursor = data ['max_cursor' ]
267298 self .list = data ["aweme_list" ]
268299 return True
269300 except KeyError :
270- self .list = []
271301 self .log .error (f"响应内容异常: { data } " , False )
272302 return False
273303 else :
274- self .list = []
275304 self .log .error (f"响应码异常:{ response .status_code } ,获取JSON数据失败" )
276305 return False
277306
@@ -325,28 +354,31 @@ def get_nickname(self):
325354 self .name = str (time .time ())[:10 ]
326355 self .log .warning (
327356 f"请求超时,获取账号昵称失败,本次运行将默认使用当前时间戳作为帐号昵称: { self .name } " )
328- return
357+ return False
329358 if response .status_code == 200 :
330359 try :
331360 data = response .json ()
332361 except requests .exceptions .JSONDecodeError :
333362 self .name = str (time .time ())[:10 ]
334363 self .log .warning (
335364 f"数据接口返回内容异常,获取账号昵称失败,本次运行将默认使用当前时间戳作为帐号昵称: { self .name } " )
336- return
365+ return False
337366 try :
338367 self .name = self .clean .filter (
339368 data ["aweme_list" ][0 ]["author" ]["nickname" ]) or str (
340369 time .time ())[
341370 :10 ]
371+ return True
342372 except KeyError :
343373 self .name = str (time .time ())[:10 ]
344374 self .log .warning (
345375 f"响应内容异常,获取账号昵称失败,本次运行将默认使用当前时间戳作为帐号昵称: { self .name } " )
376+ return False
346377 else :
347378 self .name = str (time .time ())[:10 ]
348379 self .log .warning (
349380 f"响应码异常:{ response .status_code } ,获取账号昵称失败,本次运行将默认使用当前时间戳作为帐号昵称: { self .name } " )
381+ return False
350382
351383 def early_stop (self ):
352384 """如果获取数据的发布日期已经早于限制日期,就不需要再获取下一页的数据了"""
@@ -391,9 +423,6 @@ def run(self, index: int):
391423 @check_cookie
392424 def run_alone (self , text : str ):
393425 """单独下载模式"""
394- if not self .cookie :
395- self .log .warning ("请检查Cookie是否正确" )
396- return False
397426 url = self .check_url (text )
398427 if not url :
399428 self .log .warning ("无效的作品链接" )
@@ -455,7 +484,7 @@ def get_live_data(self, link: str):
455484 proxies = self .proxies )
456485 return response .json ()
457486 except requests .exceptions .ReadTimeout :
458- print ("请求超时! " )
487+ self . log . warning ("请求超时" )
459488 return False
460489 except requests .exceptions .JSONDecodeError :
461490 self .log .warning ("直播数据接口返回内容格式错误" )
@@ -472,40 +501,78 @@ def deal_live_data(self, data):
472501 cover = data ["data" ]["data" ][0 ]["cover" ]["url_list" ][0 ]
473502 return nickname , title , url , cover
474503
@reset
@check_cookie
def run_comment(self, id_: str, data):
    """Fetch the comments of one work page by page and hand them to ``data``.

    Loops until ``self.finish`` becomes true, which ``deal_comment`` sets
    when it finds ``self.comment`` empty.

    Args:
        id_: work ID, passed on to ``get_comment``.
        data: recorder object stored on ``self.data``; ``deal_comment`` calls
            its ``save`` method once per comment row.
    """
    self.data = data
    while not self.finish:
        # get_comment only overwrites self.comment on success. Clear it first
        # so a failed request ends the loop instead of re-saving the previous
        # page over and over.
        self.comment = []
        self.get_comment(id_)
        self.deal_comment()
475511
476- class CommentData :
477- comment_api = "https://www.douyin.com/aweme/v1/web/comment/list/"
478- params = {
479- "device_platform" : "webapp" ,
480- "aid" : "6383" ,
481- "channel" : "channel_pc_web" ,
482- "aweme_id" : "7246706009123720503" ,
483- "cursor" : "0" ,
484- "count" : "20" ,
485- "item_type" : "0" ,
486- "insert_ids" : "" ,
487- "rcFT" : "" ,
488- "pc_client_type" : "1" ,
489- "version_code" : "170400" ,
490- "version_name" : "17.4.0" ,
491- "cookie_enabled" : "true" ,
492- "screen_width" : "1536" ,
493- "screen_height" : "864" ,
494- "browser_language" : "zh-CN" ,
495- "browser_platform" : "Win32" ,
496- "browser_name" : "Edge" ,
497- "browser_version" : "114.0.1823.58" ,
498- "browser_online" : "true" ,
499- "engine_name" : "Blink" ,
500- "engine_version" : "114.0.0.0" ,
501- "os_name" : "Windows" ,
502- "os_version" : "10" ,
503- "cpu_core_num" : "16" ,
504- "device_memory" : "8" ,
505- "platform" : "PC" ,
506- "downlink" : "10" ,
507- "effective_type" : "4g" ,
508- "round_trip_time" : "50" ,
509- "webid" : "7248584490175383100" ,
510- "msToken" : "FX-6vWAx3sPmINCegC_qzzS46gfcN9LHHoaaKBtf8DYrBSmGXT803q4j0uzx0fDkFFUj1bPrkfA6O1tBTwUJi4RZGz3OkqEqI8RtIBu1X1NBeT60BHItrM2gK3jRVdI=" ,
511- "X-Bogus" : "DFSzswVL6lJANSwctnrmvGUClLxV" }
@retry(max_num=5)
def get_comment(self, id_: str):
    """Request one page of comments for a work and cache the result.

    On success the ``comments`` list of the response is stored in
    ``self.comment`` and its ``cursor`` value in ``self.cursor``.

    Args:
        id_: work ID, sent as the ``aweme_id`` query parameter.

    Returns:
        True when the page was parsed; False on a read timeout, a non-200
        status code, a non-JSON body, or a response missing expected keys.
    """
    query = self.deal_params({
        "aid": "6383",
        "aweme_id": id_,
        "cursor": self.cursor,
        "count": "20",
        "cookie_enabled": "true",
        "platform": "PC",
    })
    try:
        response = requests.get(
            self.comment_api,
            params=query,
            headers=self.headers,
            proxies=self.proxies,
            timeout=10)
    except requests.exceptions.ReadTimeout:
        self.log.error("请求超时")
        return False
    sleep()

    # Guard clauses: bail out on each failure mode in turn.
    if response.status_code != 200:
        self.log.error(f"响应码异常:{response.status_code} ,获取JSON数据失败")
        return False
    try:
        payload = response.json()
    except requests.exceptions.JSONDecodeError:
        self.log.error("数据接口返回内容异常!疑似接口失效", False)
        return False
    try:
        self.comment = payload["comments"]
        self.cursor = payload["cursor"]
    except KeyError:
        self.log.error(f"响应内容异常: {payload} ", False)
        return False
    return True
549+
def deal_comment(self):
    """Turn the cached comment page into rows, then log and save each row.

    An empty ``self.comment`` is treated as the end of the data:
    ``self.finish`` is set to True and nothing is saved.

    Row layout: comment ID, comment time, nickname, IP location, comment
    text, like count, reply count, reply ID.
    """
    if not self.comment:
        self.log.info("该作品的评论数据获取结束")
        self.finish = True
        return
    for item in self.comment:
        created = time.strftime(
            self.time, time.localtime(item["create_time"]))
        row = [
            item["cid"],
            created,
            item["user"]["nickname"],
            item["ip_label"],
            item["text"][:self.max_comment],  # cap the stored comment length
            str(item["digg_count"]),
            str(item["reply_comment_total"]),
            item["reply_id"],
        ]
        # Stringify only for the log line, so a non-str field (e.g. a null
        # ip_label) cannot raise TypeError; the saved row is left untouched.
        self.log.info("评论: " + ", ".join(map(str, row)))
        self.data.save(row)
0 commit comments