Skip to content

Commit 6b2ea69

Browse files
authored
feat: operation index (#5056)
* feat: operation index * fix: delete update vector * perf: Clear invalid data * perf: index * perf: clear invalid data * index
1 parent 6060543 commit 6b2ea69

File tree

8 files changed

+84
-31
lines changed

8 files changed

+84
-31
lines changed

docSite/content/zh-cn/docs/development/upgrading/4913.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,4 +21,5 @@ weight: 787
2121

2222
1. 对话日志,日期范围选择问题。
2323
2. API 调用时,传入的 system 提示词可能会重复。
24-
3. AI 对话/工具调用,未选择文件链接时,也会从历史记录读取文件。
24+
3. AI 对话/工具调用,未选择文件链接时,也会从历史记录读取文件。
25+
4. 手动更新知识库索引时,错误地删除旧索引,导致手动索引无效。

packages/service/common/file/image/schema.ts

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,12 @@ try {
2323
ImageSchema.index({ type: 1 });
2424
// delete related img
2525
ImageSchema.index({ teamId: 1, 'metadata.relatedId': 1 });
26+
27+
// Cron clear invalid img
28+
ImageSchema.index(
29+
{ createTime: 1 },
30+
{ partialFilterExpression: { 'metadata.relatedId': { $exists: true } } }
31+
);
2632
} catch (error) {
2733
console.log(error);
2834
}

packages/service/core/dataset/collection/schema.ts

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,12 @@ try {
132132
}
133133
}
134134
);
135+
136+
// Clear invalid image
137+
DatasetCollectionSchema.index({
138+
teamId: 1,
139+
'metadata.relatedImgId': 1
140+
});
135141
} catch (error) {
136142
console.log(error);
137143
}

packages/service/core/dataset/data/schema.ts

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -103,12 +103,14 @@ try {
103103
});
104104
// Recall vectors after data matching
105105
DatasetDataSchema.index({ teamId: 1, datasetId: 1, collectionId: 1, 'indexes.dataId': 1 });
106-
DatasetDataSchema.index({ updateTime: 1 });
107106
// rebuild data
108107
DatasetDataSchema.index({ rebuilding: 1, teamId: 1, datasetId: 1 });
109108

110109
// 为查询 initJieba 字段不存在的数据添加索引
111110
DatasetDataSchema.index({ initJieba: 1, updateTime: 1 });
111+
112+
// Cron clear invalid data
113+
DatasetDataSchema.index({ updateTime: 1 });
112114
} catch (error) {
113115
console.log(error);
114116
}

packages/service/support/operationLog/schema.ts

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ import {
66
TeamMemberCollectionName
77
} from '@fastgpt/global/support/user/team/constant';
88

9-
export const OperationLogCollectionName = 'operationLog';
9+
export const OperationLogCollectionName = 'operationLogs';
1010

1111
const OperationLogSchema = new Schema({
1212
tmbId: {
@@ -34,6 +34,9 @@ const OperationLogSchema = new Schema({
3434
}
3535
});
3636

37+
OperationLogSchema.index({ teamId: 1, tmbId: 1, event: 1 });
38+
OperationLogSchema.index({ timestamp: 1 }, { expireAfterSeconds: 14 * 24 * 60 * 60 }); // Auto delete after 14 days
39+
3740
export const MongoOperationLog = getMongoLogModel<OperationLogSchema>(
3841
OperationLogCollectionName,
3942
OperationLogSchema

projects/app/src/pages/api/admin/clearInvalidData.ts

Lines changed: 42 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2,16 +2,20 @@ import type { NextApiRequest, NextApiResponse } from 'next';
22
import { jsonRes } from '@fastgpt/service/common/response';
33
import { authCert } from '@fastgpt/service/support/permission/auth/common';
44
import { addHours } from 'date-fns';
5-
import { MongoImage } from '@fastgpt/service/common/file/image/schema';
6-
import { MongoDatasetCollection } from '@fastgpt/service/core/dataset/collection/schema';
75
import {
86
checkInvalidDatasetFiles,
97
checkInvalidDatasetData,
108
checkInvalidVector
119
} from '@/service/common/system/cronTask';
10+
import dayjs from 'dayjs';
11+
import { retryFn } from '@fastgpt/global/common/system/utils';
12+
import { NextAPI } from '@/service/middleware/entry';
13+
import { useIPFrequencyLimit } from '@fastgpt/service/common/middle/reqFrequencyLimit';
14+
import { MongoImage } from '@fastgpt/service/common/file/image/schema';
15+
import { MongoDatasetCollection } from '@fastgpt/service/core/dataset/collection/schema';
1216

1317
let deleteImageAmount = 0;
14-
async function checkInvalidImg(start: Date, end: Date, limit = 50) {
18+
async function checkInvalidImg(start: Date, end: Date) {
1519
const images = await MongoImage.find(
1620
{
1721
createTime: {
@@ -52,22 +56,46 @@ async function checkInvalidImg(start: Date, end: Date, limit = 50) {
5256
console.log(`检测完成,共删除 ${deleteImageAmount} 个无效图片`);
5357
}
5458

55-
/* pg 中的数据搬到 mongo dataset.datas 中,并做映射 */
56-
export default async function handler(req: NextApiRequest, res: NextApiResponse) {
59+
async function handler(req: NextApiRequest, res: NextApiResponse) {
60+
deleteImageAmount = 0;
5761
try {
5862
await authCert({ req, authRoot: true });
5963
const { start = -2, end = -360 * 24 } = req.body as { start: number; end: number };
6064

6165
(async () => {
6266
try {
6367
console.log('执行脏数据清理任务');
64-
// 360天 ~ 2小时前
65-
const endTime = addHours(new Date(), start);
66-
const startTime = addHours(new Date(), end);
67-
await checkInvalidDatasetFiles(startTime, endTime);
68-
await checkInvalidImg(startTime, endTime);
69-
await checkInvalidDatasetData(startTime, endTime);
70-
await checkInvalidVector(startTime, endTime);
68+
69+
// Split time range into 6-hour chunks to avoid processing too much data at once
70+
const totalHours = Math.abs(start - end);
71+
const chunkHours = 6;
72+
const chunks = Math.ceil(totalHours / chunkHours);
73+
74+
console.log(
75+
`Total time range: ${totalHours} hours, split into ${chunks} chunks of ${chunkHours} hours each`
76+
);
77+
78+
for (let i = 0; i < chunks; i++) {
79+
const chunkStart = start - i * chunkHours;
80+
const chunkEnd = Math.max(start - (i + 1) * chunkHours, end);
81+
82+
const chunkEndTime = addHours(new Date(), chunkStart);
83+
const chunkStartTime = addHours(new Date(), chunkEnd);
84+
85+
console.log(
86+
`Processing chunk ${i + 1}/${chunks}: ${dayjs(chunkStartTime).format(
87+
'YYYY-MM-DD HH:mm'
88+
)} to ${dayjs(chunkEndTime).format('YYYY-MM-DD HH:mm')}`
89+
);
90+
91+
await retryFn(() => checkInvalidDatasetFiles(chunkStartTime, chunkEndTime));
92+
await retryFn(() => checkInvalidImg(chunkStartTime, chunkEndTime));
93+
await retryFn(() => checkInvalidDatasetData(chunkStartTime, chunkEndTime));
94+
await retryFn(() => checkInvalidVector(chunkStartTime, chunkEndTime));
95+
96+
console.log(`Chunk ${i + 1}/${chunks} completed`);
97+
}
98+
7199
console.log('执行脏数据清理任务完毕');
72100
} catch (error) {
73101
console.log('执行脏数据清理任务出错了');
@@ -86,3 +114,5 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse)
86114
});
87115
}
88116
}
117+
118+
export default NextAPI(useIPFrequencyLimit({ id: 'admin-api', seconds: 60, limit: 1 }), handler);

projects/app/src/service/common/system/cronTask.ts

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -145,16 +145,20 @@ export async function checkInvalidDatasetData(start: Date, end: Date) {
145145
datasetId: item.datasetId,
146146
collectionId: item.collectionId
147147
});
148-
await MongoDatasetDataText.deleteMany({
149-
teamId: item.teamId,
150-
datasetId: item.datasetId,
151-
collectionId: item.collectionId
152-
});
153-
await deleteDatasetDataVector({
154-
teamId: item.teamId,
155-
datasetIds: [item.datasetId],
156-
collectionIds: [item.collectionId]
157-
});
148+
149+
await Promise.all([
150+
MongoDatasetDataText.deleteMany({
151+
teamId: item.teamId,
152+
datasetId: item.datasetId,
153+
collectionId: item.collectionId
154+
}),
155+
deleteDatasetDataVector({
156+
teamId: item.teamId,
157+
datasetIds: [item.datasetId],
158+
collectionIds: [item.collectionId]
159+
})
160+
]);
161+
158162
await MongoDatasetData.deleteMany({
159163
teamId: item.teamId,
160164
datasetId: item.datasetId,

projects/app/src/service/core/dataset/data/controller.ts

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -318,6 +318,11 @@ export async function updateData2Dataset({
318318
}
319319
}
320320

321+
const deleteVectorIdList = patchResult
322+
.filter((item) => item.type === 'delete' || item.type === 'update')
323+
.map((item) => item.index.dataId)
324+
.filter(Boolean) as string[];
325+
321326
// 4. Update mongo updateTime(便于脏数据检查器识别)
322327
const updateTime = mongoData.updateTime;
323328
mongoData.updateTime = new Date();
@@ -377,14 +382,10 @@ export async function updateData2Dataset({
377382
);
378383

379384
// Delete vector
380-
const deleteIdList = patchResult
381-
.filter((item) => item.type === 'delete' || item.type === 'update')
382-
.map((item) => item.index.dataId)
383-
.filter(Boolean) as string[];
384-
if (deleteIdList.length > 0) {
385+
if (deleteVectorIdList.length > 0) {
385386
await deleteDatasetDataVector({
386387
teamId: mongoData.teamId,
387-
idList: deleteIdList
388+
idList: deleteVectorIdList
388389
});
389390
}
390391
});

0 commit comments

Comments
 (0)