@@ -2,16 +2,20 @@ import type { NextApiRequest, NextApiResponse } from 'next';
22import { jsonRes } from '@fastgpt/service/common/response' ;
33import { authCert } from '@fastgpt/service/support/permission/auth/common' ;
44import { addHours } from 'date-fns' ;
5- import { MongoImage } from '@fastgpt/service/common/file/image/schema' ;
6- import { MongoDatasetCollection } from '@fastgpt/service/core/dataset/collection/schema' ;
75import {
86 checkInvalidDatasetFiles ,
97 checkInvalidDatasetData ,
108 checkInvalidVector
119} from '@/service/common/system/cronTask' ;
10+ import dayjs from 'dayjs' ;
11+ import { retryFn } from '@fastgpt/global/common/system/utils' ;
12+ import { NextAPI } from '@/service/middleware/entry' ;
13+ import { useIPFrequencyLimit } from '@fastgpt/service/common/middle/reqFrequencyLimit' ;
14+ import { MongoImage } from '@fastgpt/service/common/file/image/schema' ;
15+ import { MongoDatasetCollection } from '@fastgpt/service/core/dataset/collection/schema' ;
1216
1317let deleteImageAmount = 0 ;
14- async function checkInvalidImg ( start : Date , end : Date , limit = 50 ) {
18+ async function checkInvalidImg ( start : Date , end : Date ) {
1519 const images = await MongoImage . find (
1620 {
1721 createTime : {
@@ -52,22 +56,46 @@ async function checkInvalidImg(start: Date, end: Date, limit = 50) {
5256 console . log ( `检测完成,共删除 ${ deleteImageAmount } 个无效图片` ) ;
5357}
5458
55- /* pg 中的数据搬到 mongo dataset.datas 中,并做映射 */
56- export default async function handler ( req : NextApiRequest , res : NextApiResponse ) {
59+ async function handler ( req : NextApiRequest , res : NextApiResponse ) {
60+ deleteImageAmount = 0 ;
5761 try {
5862 await authCert ( { req, authRoot : true } ) ;
5963 const { start = - 2 , end = - 360 * 24 } = req . body as { start : number ; end : number } ;
6064
6165 ( async ( ) => {
6266 try {
6367 console . log ( '执行脏数据清理任务' ) ;
64- // 360天 ~ 2小时前
65- const endTime = addHours ( new Date ( ) , start ) ;
66- const startTime = addHours ( new Date ( ) , end ) ;
67- await checkInvalidDatasetFiles ( startTime , endTime ) ;
68- await checkInvalidImg ( startTime , endTime ) ;
69- await checkInvalidDatasetData ( startTime , endTime ) ;
70- await checkInvalidVector ( startTime , endTime ) ;
68+
69+ // Split time range into 6-hour chunks to avoid processing too much data at once
70+ const totalHours = Math . abs ( start - end ) ;
71+ const chunkHours = 6 ;
72+ const chunks = Math . ceil ( totalHours / chunkHours ) ;
73+
74+ console . log (
75+ `Total time range: ${ totalHours } hours, split into ${ chunks } chunks of ${ chunkHours } hours each`
76+ ) ;
77+
78+ for ( let i = 0 ; i < chunks ; i ++ ) {
79+ const chunkStart = start - i * chunkHours ;
80+ const chunkEnd = Math . max ( start - ( i + 1 ) * chunkHours , end ) ;
81+
82+ const chunkEndTime = addHours ( new Date ( ) , chunkStart ) ;
83+ const chunkStartTime = addHours ( new Date ( ) , chunkEnd ) ;
84+
85+ console . log (
86+ `Processing chunk ${ i + 1 } /${ chunks } : ${ dayjs ( chunkStartTime ) . format (
87+ 'YYYY-MM-DD HH:mm'
88+ ) } to ${ dayjs ( chunkEndTime ) . format ( 'YYYY-MM-DD HH:mm' ) } `
89+ ) ;
90+
91+ await retryFn ( ( ) => checkInvalidDatasetFiles ( chunkStartTime , chunkEndTime ) ) ;
92+ await retryFn ( ( ) => checkInvalidImg ( chunkStartTime , chunkEndTime ) ) ;
93+ await retryFn ( ( ) => checkInvalidDatasetData ( chunkStartTime , chunkEndTime ) ) ;
94+ await retryFn ( ( ) => checkInvalidVector ( chunkStartTime , chunkEndTime ) ) ;
95+
96+ console . log ( `Chunk ${ i + 1 } /${ chunks } completed` ) ;
97+ }
98+
7199 console . log ( '执行脏数据清理任务完毕' ) ;
72100 } catch ( error ) {
73101 console . log ( '执行脏数据清理任务出错了' ) ;
@@ -86,3 +114,5 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse)
86114 } ) ;
87115 }
88116}
117+
// Default export: `handler` is passed to `NextAPI` after an IP frequency limiter
// configured with id 'admin-api', seconds: 60, limit: 1.
// NOTE(review): presumably this caps callers at one request per IP per 60 seconds —
// confirm against the implementation of useIPFrequencyLimit.
118+ export default NextAPI ( useIPFrequencyLimit ( { id : 'admin-api' , seconds : 60 , limit : 1 } ) , handler ) ;
0 commit comments