mirror of
https://github.com/langgenius/dify.git
synced 2026-01-28 06:32:22 +08:00
223 lines
6.6 KiB
TypeScript
223 lines
6.6 KiB
TypeScript
import type { ParentMode, PreProcessingRule, ProcessRule, Rules } from '@/models/datasets'
|
|
import { useCallback, useState } from 'react'
|
|
import { ChunkingMode, ProcessMode } from '@/models/datasets'
|
|
import escape from './escape'
|
|
import unescape from './unescape'
|
|
|
|
// Constants
|
|
export const DEFAULT_SEGMENT_IDENTIFIER = '\\n\\n'
|
|
export const DEFAULT_MAXIMUM_CHUNK_LENGTH = 1024
|
|
export const DEFAULT_OVERLAP = 50
|
|
export const MAXIMUM_CHUNK_TOKEN_LENGTH = Number.parseInt(
|
|
globalThis.document?.body?.getAttribute('data-public-indexing-max-segmentation-tokens-length') || '4000',
|
|
10,
|
|
)
|
|
|
|
export type ParentChildConfig = {
|
|
chunkForContext: ParentMode
|
|
parent: {
|
|
delimiter: string
|
|
maxLength: number
|
|
}
|
|
child: {
|
|
delimiter: string
|
|
maxLength: number
|
|
}
|
|
}
|
|
|
|
export const defaultParentChildConfig: ParentChildConfig = {
|
|
chunkForContext: 'paragraph',
|
|
parent: {
|
|
delimiter: '\\n\\n',
|
|
maxLength: 1024,
|
|
},
|
|
child: {
|
|
delimiter: '\\n',
|
|
maxLength: 512,
|
|
},
|
|
}
|
|
|
|
export type UseSegmentationStateOptions = {
|
|
initialSegmentationType?: ProcessMode
|
|
}
|
|
|
|
export const useSegmentationState = (options: UseSegmentationStateOptions = {}) => {
|
|
const { initialSegmentationType } = options
|
|
|
|
// Segmentation type (general or parent-child)
|
|
const [segmentationType, setSegmentationType] = useState<ProcessMode>(
|
|
initialSegmentationType ?? ProcessMode.general,
|
|
)
|
|
|
|
// General chunking settings
|
|
const [segmentIdentifier, doSetSegmentIdentifier] = useState(DEFAULT_SEGMENT_IDENTIFIER)
|
|
const [maxChunkLength, setMaxChunkLength] = useState(DEFAULT_MAXIMUM_CHUNK_LENGTH)
|
|
const [limitMaxChunkLength, setLimitMaxChunkLength] = useState(MAXIMUM_CHUNK_TOKEN_LENGTH)
|
|
const [overlap, setOverlap] = useState(DEFAULT_OVERLAP)
|
|
|
|
// Pre-processing rules
|
|
const [rules, setRules] = useState<PreProcessingRule[]>([])
|
|
const [defaultConfig, setDefaultConfig] = useState<Rules>()
|
|
|
|
// Parent-child config
|
|
const [parentChildConfig, setParentChildConfig] = useState<ParentChildConfig>(defaultParentChildConfig)
|
|
|
|
// Escaped segment identifier setter
|
|
const setSegmentIdentifier = useCallback((value: string, canEmpty?: boolean) => {
|
|
if (value) {
|
|
doSetSegmentIdentifier(escape(value))
|
|
}
|
|
else {
|
|
doSetSegmentIdentifier(canEmpty ? '' : DEFAULT_SEGMENT_IDENTIFIER)
|
|
}
|
|
}, [])
|
|
|
|
// Rule toggle handler
|
|
const toggleRule = useCallback((id: string) => {
|
|
setRules(prev => prev.map(rule =>
|
|
rule.id === id ? { ...rule, enabled: !rule.enabled } : rule,
|
|
))
|
|
}, [])
|
|
|
|
// Reset to defaults
|
|
const resetToDefaults = useCallback(() => {
|
|
if (defaultConfig) {
|
|
setSegmentIdentifier(defaultConfig.segmentation.separator)
|
|
setMaxChunkLength(defaultConfig.segmentation.max_tokens)
|
|
setOverlap(defaultConfig.segmentation.chunk_overlap!)
|
|
setRules(defaultConfig.pre_processing_rules)
|
|
}
|
|
setParentChildConfig(defaultParentChildConfig)
|
|
}, [defaultConfig, setSegmentIdentifier])
|
|
|
|
// Apply config from document detail
|
|
const applyConfigFromRules = useCallback((rulesConfig: Rules, isHierarchical: boolean) => {
|
|
const separator = rulesConfig.segmentation.separator
|
|
const max = rulesConfig.segmentation.max_tokens
|
|
const chunkOverlap = rulesConfig.segmentation.chunk_overlap
|
|
|
|
setSegmentIdentifier(separator)
|
|
setMaxChunkLength(max)
|
|
setOverlap(chunkOverlap!)
|
|
setRules(rulesConfig.pre_processing_rules)
|
|
setDefaultConfig(rulesConfig)
|
|
|
|
if (isHierarchical) {
|
|
setParentChildConfig({
|
|
chunkForContext: rulesConfig.parent_mode || 'paragraph',
|
|
parent: {
|
|
delimiter: escape(rulesConfig.segmentation.separator),
|
|
maxLength: rulesConfig.segmentation.max_tokens,
|
|
},
|
|
child: {
|
|
delimiter: escape(rulesConfig.subchunk_segmentation!.separator),
|
|
maxLength: rulesConfig.subchunk_segmentation!.max_tokens,
|
|
},
|
|
})
|
|
}
|
|
}, [setSegmentIdentifier])
|
|
|
|
// Get process rule for API
|
|
const getProcessRule = useCallback((docForm: ChunkingMode): ProcessRule => {
|
|
if (docForm === ChunkingMode.parentChild) {
|
|
return {
|
|
rules: {
|
|
pre_processing_rules: rules,
|
|
segmentation: {
|
|
separator: unescape(parentChildConfig.parent.delimiter),
|
|
max_tokens: parentChildConfig.parent.maxLength,
|
|
},
|
|
parent_mode: parentChildConfig.chunkForContext,
|
|
subchunk_segmentation: {
|
|
separator: unescape(parentChildConfig.child.delimiter),
|
|
max_tokens: parentChildConfig.child.maxLength,
|
|
},
|
|
},
|
|
mode: 'hierarchical',
|
|
} as ProcessRule
|
|
}
|
|
|
|
return {
|
|
rules: {
|
|
pre_processing_rules: rules,
|
|
segmentation: {
|
|
separator: unescape(segmentIdentifier),
|
|
max_tokens: maxChunkLength,
|
|
chunk_overlap: overlap,
|
|
},
|
|
},
|
|
mode: segmentationType,
|
|
} as ProcessRule
|
|
}, [rules, parentChildConfig, segmentIdentifier, maxChunkLength, overlap, segmentationType])
|
|
|
|
// Update parent config field
|
|
const updateParentConfig = useCallback((field: 'delimiter' | 'maxLength', value: string | number) => {
|
|
setParentChildConfig((prev) => {
|
|
let newValue: string | number
|
|
if (field === 'delimiter')
|
|
newValue = value ? escape(value as string) : ''
|
|
else
|
|
newValue = value
|
|
return {
|
|
...prev,
|
|
parent: { ...prev.parent, [field]: newValue },
|
|
}
|
|
})
|
|
}, [])
|
|
|
|
// Update child config field
|
|
const updateChildConfig = useCallback((field: 'delimiter' | 'maxLength', value: string | number) => {
|
|
setParentChildConfig((prev) => {
|
|
let newValue: string | number
|
|
if (field === 'delimiter')
|
|
newValue = value ? escape(value as string) : ''
|
|
else
|
|
newValue = value
|
|
return {
|
|
...prev,
|
|
child: { ...prev.child, [field]: newValue },
|
|
}
|
|
})
|
|
}, [])
|
|
|
|
// Set chunk for context mode
|
|
const setChunkForContext = useCallback((mode: ParentMode) => {
|
|
setParentChildConfig(prev => ({ ...prev, chunkForContext: mode }))
|
|
}, [])
|
|
|
|
return {
|
|
// General chunking state
|
|
segmentationType,
|
|
setSegmentationType,
|
|
segmentIdentifier,
|
|
setSegmentIdentifier,
|
|
maxChunkLength,
|
|
setMaxChunkLength,
|
|
limitMaxChunkLength,
|
|
setLimitMaxChunkLength,
|
|
overlap,
|
|
setOverlap,
|
|
|
|
// Rules
|
|
rules,
|
|
setRules,
|
|
defaultConfig,
|
|
setDefaultConfig,
|
|
toggleRule,
|
|
|
|
// Parent-child config
|
|
parentChildConfig,
|
|
setParentChildConfig,
|
|
updateParentConfig,
|
|
updateChildConfig,
|
|
setChunkForContext,
|
|
|
|
// Actions
|
|
resetToDefaults,
|
|
applyConfigFromRules,
|
|
getProcessRule,
|
|
}
|
|
}
|
|
|
|
export type SegmentationState = ReturnType<typeof useSegmentationState>
|