@@ -7,6 +7,10 @@ export const CUSTOM_SPLIT_SIGN = '-----CUSTOM_SPLIT_SIGN-----';
77type SplitProps = {
88 text : string ;
99 chunkSize : number ;
10+
11+ paragraphChunkDeep ?: number ; // Paragraph deep
12+ paragraphChunkMinSize ?: number ; // Paragraph min size, if too small, it will merge
13+
1014 maxSize ?: number ;
1115 overlapRatio ?: number ;
1216 customReg ?: string [ ] ;
@@ -108,6 +112,8 @@ const commonSplit = (props: SplitProps): SplitResponse => {
108112 let {
109113 text = '' ,
110114 chunkSize,
115+ paragraphChunkDeep = 5 ,
116+ paragraphChunkMinSize = 100 ,
111117 maxSize = defaultMaxChunkSize ,
112118 overlapRatio = 0.15 ,
113119 customReg = [ ]
@@ -123,7 +129,7 @@ const commonSplit = (props: SplitProps): SplitResponse => {
123129 text = text . replace ( / ( ` ` ` [ \s \S ] * ?` ` ` | ~ ~ ~ [ \s \S ] * ?~ ~ ~ ) / g, function ( match ) {
124130 return match . replace ( / \n / g, codeBlockMarker ) ;
125131 } ) ;
126- // 2. 表格处理 - 单独提取表格出来,进行表头合并
132+ // 2. Markdown table handling - extract the tables separately and merge in their header rows
127133 const tableReg =
128134 / ( \n \| (?: (?: [ ^ \n | ] + \| ) { 1 , } ) \n \| (?: [: \- \s] + \| ) { 1 , } \n (?: \| (?: [ ^ \n | ] + \| ) * \n ? ) * ) (?: \n | $ ) / g;
129135 const tableDataList = text . match ( tableReg ) ;
@@ -143,25 +149,40 @@ const commonSplit = (props: SplitProps): SplitResponse => {
143149 text = text . replace ( / ( \r ? \n | \r ) { 3 , } / g, '\n\n\n' ) ;
144150
145151 // The larger maxLen is, the next sentence is less likely to trigger splitting
146- const markdownIndex = 4 ;
147- const forbidOverlapIndex = 8 ;
// Number of caller-supplied custom rules; they occupy the first slots of stepReges.
const customRegLen = customReg.length;

/**
 * Builds one split rule per Markdown heading level, from `#` up to `deep` hashes.
 *
 * @param deep Requested heading depth. Missing, zero, NaN or negative values
 *   produce no heading rules; values above 8 are capped at 8.
 * @returns Rules ordered from the shallowest heading to the deepest, each
 *   limited to `chunkSize`.
 */
const markdownHeaderRules = ((deep?: number): { reg: RegExp; maxLen: number }[] => {
  if (!deep || deep <= 0) return [];

  const maxDeep = Math.min(deep, 8); // Maximum 8 levels
  const rules: { reg: RegExp; maxLen: number }[] = [];

  for (let i = 1; i <= maxDeep; i++) {
    const hashSymbols = '#'.repeat(i);
    rules.push({
      reg: new RegExp(`^(${hashSymbols}\\s[^\\n]+\\n)`, 'gm'),
      maxLen: chunkSize
    });
  }

  return rules;
})(paragraphChunkDeep);

// Index (relative to the end of the custom rules) of the last heading rule.
// Derived from the rules actually generated rather than from the raw
// paragraphChunkDeep, so it stays correct when the depth is capped at 8 or
// rejected as non-positive. It is -1 when there are no heading rules.
const markdownIndex = markdownHeaderRules.length - 1;
// The four rules that follow the heading rules (code block, table, blank line,
// single newline) also forbid overlap.
const forbidOverlapIndex = customRegLen + markdownIndex + 4;
148172
149173 const stepReges : { reg : RegExp | string ; maxLen : number } [ ] = [
150174 ...customReg . map ( ( text ) => ( {
151175 reg : text . replaceAll ( '\\n' , '\n' ) ,
152176 maxLen : chunkSize
153177 } ) ) ,
154- { reg : / ^ ( # \s [ ^ \n ] + \n ) / gm, maxLen : chunkSize } ,
155- { reg : / ^ ( # # \s [ ^ \n ] + \n ) / gm, maxLen : chunkSize } ,
156- { reg : / ^ ( # # # \s [ ^ \n ] + \n ) / gm, maxLen : chunkSize } ,
157- { reg : / ^ ( # # # # \s [ ^ \n ] + \n ) / gm, maxLen : chunkSize } ,
158- { reg : / ^ ( # # # # # \s [ ^ \n ] + \n ) / gm, maxLen : chunkSize } ,
178+ ...markdownHeaderRules ,
159179
160180 { reg : / ( [ \n ] ( ` ` ` [ \s \S ] * ?` ` ` | ~ ~ ~ [ \s \S ] * ?~ ~ ~ ) ) / g, maxLen : maxSize } , // code block
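181+ // Keep tables intact where possible. NOTE(review): the original comment said "HTML Table tag", but the rule below matches Markdown tables - confirm whether an HTML table rule is missing.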
161182 {
162183 reg : / ( \n \| (?: (?: [ ^ \n | ] + \| ) { 1 , } ) \n \| (?: [: \- \s] + \| ) { 1 , } \n (?: \| (?: [ ^ \n | ] + \| ) * \n ) * ) / g,
163- maxLen : Math . min ( chunkSize * 1.5 , maxSize )
164- } , // Table 尽可能保证完整性
184+ maxLen : chunkSize
185+ } , // Markdown Table 尽可能保证完整性
165186 { reg : / ( \n { 2 , } ) / g, maxLen : chunkSize } ,
166187 { reg : / ( [ \n ] ) / g, maxLen : chunkSize } ,
167188 // ------ There's no overlap on the top
@@ -172,12 +193,10 @@ const commonSplit = (props: SplitProps): SplitResponse => {
172193 { reg : / ( [ , ] | , \s ) / g, maxLen : chunkSize }
173194 ] ;
174195
175- const customRegLen = customReg . length ;
176196 const checkIsCustomStep = ( step : number ) => step < customRegLen ;
177197 const checkIsMarkdownSplit = ( step : number ) =>
178198 step >= customRegLen && step <= markdownIndex + customRegLen ;
179-
180- const checkForbidOverlap = ( step : number ) => step <= forbidOverlapIndex + customRegLen ;
199+ const checkForbidOverlap = ( step : number ) => step <= forbidOverlapIndex ;
181200
182201 // if use markdown title split, Separate record title
183202 const getSplitTexts = ( { text, step } : { text : string ; step : number } ) => {
@@ -301,6 +320,7 @@ const commonSplit = (props: SplitProps): SplitResponse => {
301320 const splitTexts = getSplitTexts ( { text, step } ) ;
302321
303322 const chunks : string [ ] = [ ] ;
323+
304324 for ( let i = 0 ; i < splitTexts . length ; i ++ ) {
305325 const item = splitTexts [ i ] ;
306326
@@ -443,7 +463,6 @@ const commonSplit = (props: SplitProps): SplitResponse => {
443463 */
444464export const splitText2Chunks = ( props : SplitProps ) : SplitResponse => {
445465 let { text = '' } = props ;
446- const start = Date . now ( ) ;
447466 const splitWithCustomSign = text . split ( CUSTOM_SPLIT_SIGN ) ;
448467
449468 const splitResult = splitWithCustomSign . map ( ( item ) => {
0 commit comments