
Commit 91c7a2d

Add comment data scraping for posts
1 parent 3f1b61d commit 91c7a2d

File tree: 5 files changed, +202 -85 lines changed


README.md

Lines changed: 6 additions & 5 deletions
@@ -8,10 +8,10 @@
         <img src="https://img.shields.io/github/v/release/JoeanAmier/TikTokDownloader" alt="TikTokDownloader">
     </a>
     <img src="https://img.shields.io/badge/%E8%BD%BB%E9%87%8F%E7%BA%A7-%E5%B7%A5%E5%85%B7-green" alt="轻量级工具">
-    <img src="https://img.shields.io/badge/%E7%BE%A4%E8%81%8A-830227445-a6559d" alt="QQ群聊">
+    <img src="https://img.shields.io/badge/QQ%E7%BE%A4%E8%81%8A-830227445-b44c97" alt="QQ群聊">
 </div>
 <br>
-<p>🔥 <b>Douyin video/gallery/live download tool:</b> built on the Requests module; batch-download posts from a Douyin account's posted or liked page; download individual posts from a Douyin link; get Douyin live stream URLs; download Douyin live videos.</p>
+<p>🔥 <b>Douyin video/gallery/live download tool:</b> built on the Requests module; batch-download posts from a Douyin account's posted or liked page; download individual posts from a Douyin link; get Douyin live stream URLs; download Douyin live videos; scrape comment data of posts.</p>
 <p>⭐ <b>When using this project's code, please comply with the <a href="https://github.com/JoeanAmier/TikTokDownloader/blob/master/license">GNU General Public License v3.0</a> open-source license.</b></p>
 <hr>
 

@@ -27,15 +27,16 @@
 * ✅ Get live stream URLs
 * ✅ Download Douyin live videos
 * ✅ Web UI
-* ☑️ Scrape comment data of posts
+* ✅ Scrape comment data of posts
 * ☑️ Download TikTok videos/galleries without watermark
 
 # 📈 Project Status
 
 * 🟢 The source code published in [Releases](https://github.com/JoeanAmier/TikTokDownloader/releases/latest) has been tested and all features work
 * 🟢 Web UI completed
+* 🟢 Comment data scraping completed
+* 🟡 Comment reply scraping planned
 * 🟡 Multiprocessing mode planned, to speed up batch downloads across multiple accounts
-* 🟡 Comment scraping planned
 * 🔴 The latest source code may contain unstable bugs
 * 🔴 No bugs affecting normal use have been found so far; if you hit one, please report it so the author can fix it
 

@@ -130,7 +131,7 @@ TikTokDownloader
 | time | str | Publish-time format, default: year-month-day hour.minute.second<br>(Note: file names on Windows cannot contain the colon ":") |
 | split | str | Separator used in file names, default: "-" |
 | music | list\[bool\] | Whether to download the music of videos and galleries, default: False |
-| save | str | Save format for detailed data; set to an empty string to disable saving<br>Currently supported: csv, xlsx, sql (SQLite) |
+| save | str | Save format for post and comment data; set to an empty string to disable saving<br>Currently supported: csv, xlsx, sql (SQLite) |
 | cookie | list\[str\] | Douyin web Cookie, required<br>Can be written to the configuration file with Cookie_tool.py |
 | dynamic | list\[bool\] | Whether to download dynamic cover images, default: False |
 | original | list\[bool\] | Whether to download static cover images, default: False |
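
For orientation, here is a minimal sketch of a configuration carrying the fields from the table above. The file name `settings.json` and the way the project loads it are assumptions for illustration, not taken from this commit.

```python
# Hypothetical illustration only: the settings file name and loading code
# are assumptions, not shown in this commit.
import json

settings = {
    "time": "%Y-%m-%d %H.%M.%S",  # publish-time format; avoid ":" on Windows
    "split": "-",                 # separator used when building file names
    "music": False,               # download the audio track of videos/galleries
    "save": "csv",                # "", "csv", "xlsx" or "sql" (SQLite)
    "cookie": "your Douyin web cookie",  # required; see Cookie_tool.py
    "dynamic": False,             # download dynamic cover images
    "original": False,            # download static cover images
}

with open("settings.json", "w", encoding="utf-8") as f:
    json.dump(settings, f, ensure_ascii=False, indent=4)
```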

src/DataAcquirer.py

Lines changed: 119 additions & 52 deletions
@@ -25,8 +28,11 @@ def reset(function):
     def inner(self, *args, **kwargs):
         if not isinstance(self.url, bool):
             self.id_ = None
+            self.data = None
+            self.comment = []
+            self.cursor = 0
             self.max_cursor = 0
-            self.list = None  # unprocessed data
+            self.list = []  # unprocessed data
             self.name = None  # account nickname
             self.video_data = []  # video ID data
             self.image_data = []  # gallery ID data
@@ -48,8 +51,8 @@ def inner(self, *args, **kwargs):
         return inner
 
 
-def retry(max_num=3):
-    """Retry when an error occurs"""
+def retry(max_num=10):
+    """Retry when an error occurs; the decorated function must return a boolean"""
 
     def inner(function):
         def execute(self, *args, **kwargs):
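
The hunk above only shows the decorator's signature and docstring. A minimal sketch of a retry decorator with this contract (re-invoke a bool-returning method up to `max_num` times) might look like the following; the loop body is an assumption, not the commit's actual implementation.

```python
# Sketch of a retry decorator for bool-returning methods; the body is assumed
# from the signature and docstring shown in the diff.
def retry(max_num=10):
    """Retry when an error occurs; the decorated function must return a boolean."""

    def inner(function):
        def execute(self, *args, **kwargs):
            result = False
            for _ in range(max_num):
                result = function(self, *args, **kwargs)
                if result:  # a truthy return value means success, stop retrying
                    break
            return result

        return execute

    return inner
```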
@@ -73,17 +76,27 @@ class UserData:
         r"^https://www\.douyin\.com/user/([a-zA-z0-9-_]+)(?:\?modal_id=([0-9]{19}))?.*$")  # account link
     works_link = re.compile(
         r"^https://www\.douyin\.com/(?:video|note)/([0-9]{19})$")  # post link
-    live_link = re.compile(r"^https://live\.douyin\.com/([0-9]+)$")  # live link
+    live_link = re.compile(r"^https://live\.douyin\.com/([0-9]+)\?*.+")  # live link
     live_api = "https://live.douyin.com/webcast/room/web/enter/"  # live API
+    comment_api = "https://www.douyin.com/aweme/v1/web/comment/list/"  # comment API
+    reply_api = "https://www.douyin.com/aweme/v1/web/comment/list/reply/"  # comment reply API
+    """Comment reply API parameters
+    "item_id": "7248064381664136486",
+    "comment_id": "7248089935747449604",
+    """
     clean = Cleaner()  # filters illegal characters
+    max_comment = 256  # comment length limit
 
     def __init__(self, log: LoggerManager):
         self.xb = XBogus()  # encrypted-parameter object
         self.log = log  # logger object
+        self.data = None  # data recorder object, used only for comment scraping
         self._cookie = False  # whether a Cookie has been set
         self.id_ = None  # sec_uid or item_ids
-        self.max_cursor = 0
-        self.list = None  # unprocessed data
+        self.comment = []  # comment data
+        self.cursor = 0  # used for comment pages
+        self.max_cursor = 0  # used for posted and liked pages
+        self.list = []  # unprocessed data
         self.name = None  # account nickname
         self.video_data = []  # video ID data
         self.image_data = []  # gallery ID data
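
The docstring above records the two extra parameters the reply endpoint expects. A hedged sketch of what a request against `reply_api` might look like, using the sample IDs from that docstring; the headers are placeholders, and a real request presumably needs the same Cookie and X-Bogus signing as `comment_api`, since reply scraping is still only planned in this commit.

```python
# Illustrative only: header values are placeholders, and signing parameters
# (X-Bogus, msToken) used by the project are omitted here.
import requests

reply_api = "https://www.douyin.com/aweme/v1/web/comment/list/reply/"
params = {
    "aid": "6383",
    "item_id": "7248064381664136486",     # post ID (sample value from the docstring)
    "comment_id": "7248089935747449604",  # parent comment ID (sample value)
    "cursor": 0,
    "count": "20",
}
headers = {
    "User-Agent": "Mozilla/5.0",
    "Cookie": "your Douyin web cookie",
}
response = requests.get(reply_api, params=params, headers=headers, timeout=10)
print(response.status_code)
```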
@@ -94,6 +107,7 @@ def __init__(self, log: LoggerManager):
         self._url = None  # account link
         self._api = None  # batch download type
         self._proxies = None  # proxies
+        self._time = None  # creation-time format
 
     @property
     def url(self):
@@ -199,6 +213,24 @@ def proxies(self, value):
             "ftp": None,
         }
 
+    @property
+    def time(self):
+        return self._time
+
+    @time.setter
+    def time(self, value):
+        if value:
+            try:
+                _ = time.strftime(value, time.localtime())
+                self._time = value
+                self.log.info(f"Time format set successfully: {value}", False)
+            except ValueError:
+                self.log.warning(f"Invalid time format: {value}; the default format (year-month-day hour.minute.second) will be used")
+                self._time = "%Y-%m-%d %H.%M.%S"
+        else:
+            self.log.warning("Invalid time format; the default format (year-month-day hour.minute.second) will be used")
+            self._time = "%Y-%m-%d %H.%M.%S"
+
     @retry(max_num=5)
     def get_id(self, value="sec_user_id", url=None):
         """Get the account ID or post ID"""
@@ -252,26 +284,23 @@ def get_user_data(self):
                 proxies=self.proxies,
                 timeout=10)
         except requests.exceptions.ReadTimeout:
-            print("Request timed out")
+            self.log.error("Request timed out")
             return False
         sleep()
         if response.status_code == 200:
             try:
                 data = response.json()
             except requests.exceptions.JSONDecodeError:
-                self.list = []
                 self.log.error("Data API returned abnormal content! The endpoint may have stopped working", False)
                 return False
             try:
                 self.max_cursor = data['max_cursor']
                 self.list = data["aweme_list"]
                 return True
             except KeyError:
-                self.list = []
                 self.log.error(f"Abnormal response content: {data}", False)
                 return False
         else:
-            self.list = []
             self.log.error(f"Abnormal status code: {response.status_code}, failed to get JSON data")
             return False
 
@@ -325,28 +354,31 @@ def get_nickname(self):
             self.name = str(time.time())[:10]
             self.log.warning(
                 f"Request timed out, failed to get the account nickname; the current timestamp will be used as the nickname for this run: {self.name}")
-            return
+            return False
         if response.status_code == 200:
             try:
                 data = response.json()
             except requests.exceptions.JSONDecodeError:
                 self.name = str(time.time())[:10]
                 self.log.warning(
                     f"Data API returned abnormal content, failed to get the account nickname; the current timestamp will be used as the nickname for this run: {self.name}")
-                return
+                return False
             try:
                 self.name = self.clean.filter(
                     data["aweme_list"][0]["author"]["nickname"]) or str(
                     time.time())[
                     :10]
+                return True
             except KeyError:
                 self.name = str(time.time())[:10]
                 self.log.warning(
                     f"Abnormal response content, failed to get the account nickname; the current timestamp will be used as the nickname for this run: {self.name}")
+                return False
         else:
             self.name = str(time.time())[:10]
             self.log.warning(
                 f"Abnormal status code: {response.status_code}, failed to get the account nickname; the current timestamp will be used as the nickname for this run: {self.name}")
+            return False
 
     def early_stop(self):
         """If the publish dates of the fetched data are already earlier than the cutoff date, there is no need to fetch the next page"""
@@ -391,9 +423,6 @@ def run(self, index: int):
     @check_cookie
     def run_alone(self, text: str):
         """Standalone download mode"""
-        if not self.cookie:
-            self.log.warning("Please check whether the Cookie is correct")
-            return False
         url = self.check_url(text)
         if not url:
             self.log.warning("Invalid post link")
@@ -455,7 +484,7 @@ def get_live_data(self, link: str):
                 proxies=self.proxies)
             return response.json()
         except requests.exceptions.ReadTimeout:
-            print("Request timed out")
+            self.log.warning("Request timed out")
             return False
         except requests.exceptions.JSONDecodeError:
             self.log.warning("Live data API returned content in an unexpected format")
@@ -472,40 +501,78 @@ def deal_live_data(self, data):
         cover = data["data"]["data"][0]["cover"]["url_list"][0]
         return nickname, title, url, cover
 
+    @reset
+    @check_cookie
+    def run_comment(self, id_: str, data):
+        self.data = data
+        while not self.finish:
+            self.get_comment(id_)
+            self.deal_comment()
 
-class CommentData:
-    comment_api = "https://www.douyin.com/aweme/v1/web/comment/list/"
-    params = {
-        "device_platform": "webapp",
-        "aid": "6383",
-        "channel": "channel_pc_web",
-        "aweme_id": "7246706009123720503",
-        "cursor": "0",
-        "count": "20",
-        "item_type": "0",
-        "insert_ids": "",
-        "rcFT": "",
-        "pc_client_type": "1",
-        "version_code": "170400",
-        "version_name": "17.4.0",
-        "cookie_enabled": "true",
-        "screen_width": "1536",
-        "screen_height": "864",
-        "browser_language": "zh-CN",
-        "browser_platform": "Win32",
-        "browser_name": "Edge",
-        "browser_version": "114.0.1823.58",
-        "browser_online": "true",
-        "engine_name": "Blink",
-        "engine_version": "114.0.0.0",
-        "os_name": "Windows",
-        "os_version": "10",
-        "cpu_core_num": "16",
-        "device_memory": "8",
-        "platform": "PC",
-        "downlink": "10",
-        "effective_type": "4g",
-        "round_trip_time": "50",
-        "webid": "7248584490175383100",
-        "msToken": "FX-6vWAx3sPmINCegC_qzzS46gfcN9LHHoaaKBtf8DYrBSmGXT803q4j0uzx0fDkFFUj1bPrkfA6O1tBTwUJi4RZGz3OkqEqI8RtIBu1X1NBeT60BHItrM2gK3jRVdI=",
-        "X-Bogus": "DFSzswVL6lJANSwctnrmvGUClLxV"}
+    @retry(max_num=5)
+    def get_comment(self, id_: str):
+        params = {
+            "aid": "6383",
+            "aweme_id": id_,
+            "cursor": self.cursor,
+            "count": "20",
+            "cookie_enabled": "true",
+            "platform": "PC", }
+        params = self.deal_params(params)
+        try:
+            response = requests.get(
+                self.comment_api,
+                params=params,
+                headers=self.headers,
+                proxies=self.proxies,
+                timeout=10)
+        except requests.exceptions.ReadTimeout:
+            self.log.error("Request timed out")
+            return False
+        sleep()
+        if response.status_code == 200:
+            try:
+                data = response.json()
+            except requests.exceptions.JSONDecodeError:
+                self.log.error("Data API returned abnormal content! The endpoint may have stopped working", False)
+                return False
+            try:
+                self.comment = data["comments"]
+                self.cursor = data["cursor"]
+                return True
+            except KeyError:
+                self.log.error(f"Abnormal response content: {data}", False)
+                return False
+        else:
+            self.log.error(f"Abnormal status code: {response.status_code}, failed to get JSON data")
+            return False
+
+    def deal_comment(self):
+        if not self.comment:
+            self.log.info("Finished fetching comment data for this post")
+            self.finish = True
+            return
+        for item in self.comment:
+            """Data format: comment ID, comment time, user nickname, IP location, comment text, like count, reply count, reply ID"""
+            create_time = time.strftime(
+                self.time,
+                time.localtime(
+                    item["create_time"]))
+            ip_label = item["ip_label"]
+            text = item["text"][:self.max_comment]
+            nickname = item["user"]["nickname"]
+            digg_count = str(item["digg_count"])
+            cid = item["cid"]
+            reply_comment_total = str(item["reply_comment_total"])
+            reply_id = item["reply_id"]
+            result = [
+                cid,
+                create_time,
+                nickname,
+                ip_label,
+                text,
+                digg_count,
+                reply_comment_total,
+                reply_id]
+            self.log.info("Comment: " + ", ".join(result))
+            self.data.save(result)
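
`run_comment` drives the loop above until `finish` is set, handing each parsed row to `data.save()`. A minimal usage sketch with a toy CSV recorder follows; the import paths, the cookie and time setters, and the `CsvRecorder` class itself are assumptions for illustration rather than code from this commit.

```python
# Illustrative driver for the new comment scraper; import paths and the
# CsvRecorder class are assumptions, not part of this commit.
import csv

from src.DataAcquirer import UserData
from src.Recorder import LoggerManager  # assumed module path for LoggerManager


class CsvRecorder:
    """Toy recorder exposing the save() method that run_comment calls."""

    def __init__(self, path):
        self.file = open(path, "a", encoding="utf-8-sig", newline="")
        self.writer = csv.writer(self.file)

    def save(self, row):
        # row layout (see deal_comment): comment ID, comment time, nickname,
        # IP location, comment text, like count, reply count, reply ID
        self.writer.writerow(row)


acquirer = UserData(LoggerManager())
acquirer.cookie = "your Douyin web cookie"  # setter name assumed
acquirer.time = "%Y-%m-%d %H.%M.%S"         # validated by the time setter above
acquirer.run_comment("7248064381664136486", CsvRecorder("comments.csv"))
```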

src/DataDownloader.py

Lines changed: 17 additions & 20 deletions
@@ -238,25 +238,22 @@ def get_data(self, item):
         }
         xb = self.xb.get_x_bogus(urlencode(params))
         params["X-Bogus"] = xb
-        for _ in range(3):  # retry when the returned data is empty
-            try:
-                response = requests.get(
-                    self.item_ids_api,
-                    params=params,
-                    proxies=self.proxies,
-                    headers=self.headers, timeout=10)
-                sleep()
-                if response.status_code == 200 and response.text:
-                    try:
-                        return response.json()["aweme_detail"]
-                    except (KeyError, IndexError):
-                        self.log.error(f"Abnormal response content: {response.json()}", False)
-                        return False
-            except requests.exceptions.ReadTimeout:
-                continue
-        self.log.error(
-            f"Failed to get item_list for resource {item}")
-        return False
+        try:
+            response = requests.get(
+                self.item_ids_api,
+                params=params,
+                proxies=self.proxies,
+                headers=self.headers, timeout=10)
+            sleep()
+            if response.status_code == 200 and response.text:
+                try:
+                    return response.json()["aweme_detail"]
+                except (KeyError, IndexError):
+                    self.log.error(f"Abnormal response content: {response.json()}", False)
+                    return False
+        except requests.exceptions.ReadTimeout:
+            self.log.error(f"Request timed out; failed to get item_list for resource {item}")
+            return False
 
     def get_info(self, data, type_):
         """
@@ -444,7 +441,7 @@ def run_alone(self, id_: str, download=True):
         self.create_folder(self.folder)
         data = self.get_data(id_)
         if not data:
-            self.log.warning("Failed to get detailed information of the post")
+            self.log.warning("Failed to get detailed information of the post")
             return False
         self.nickname = self.clean.filter(data["author"]["nickname"])
         if data["images"]:
