feat(embedding-process): implement embedding process components and polling logic (#30622)
Some checks are pending
autofix.ci / autofix (push) Waiting to run
Build and Push API & Web / build (api, DIFY_API_IMAGE_NAME, linux/amd64, build-api-amd64) (push) Waiting to run
Build and Push API & Web / build (api, DIFY_API_IMAGE_NAME, linux/arm64, build-api-arm64) (push) Waiting to run
Build and Push API & Web / build (web, DIFY_WEB_IMAGE_NAME, linux/amd64, build-web-amd64) (push) Waiting to run
Build and Push API & Web / build (web, DIFY_WEB_IMAGE_NAME, linux/arm64, build-web-arm64) (push) Waiting to run
Build and Push API & Web / create-manifest (api, DIFY_API_IMAGE_NAME, merge-api-images) (push) Blocked by required conditions
Build and Push API & Web / create-manifest (web, DIFY_WEB_IMAGE_NAME, merge-web-images) (push) Blocked by required conditions
Main CI Pipeline / Check Changed Files (push) Waiting to run
Main CI Pipeline / API Tests (push) Blocked by required conditions
Main CI Pipeline / Web Tests (push) Blocked by required conditions
Main CI Pipeline / Style Check (push) Waiting to run
Main CI Pipeline / VDB Tests (push) Blocked by required conditions
Main CI Pipeline / DB Migration Test (push) Blocked by required conditions

Co-authored-by: CodingOnStar <hanxujiang@dify.ai>
This commit is contained in:
Coding On Star 2026-01-09 10:21:27 +08:00 committed by GitHub
parent 9848823dcd
commit 98df99b0ca
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 2086 additions and 325 deletions

File diff suppressed because it is too large Load Diff

View File

@ -1,47 +1,29 @@
import type { FC } from 'react'
import type {
DataSourceInfo,
FullDocumentDetail,
IndexingStatusResponse,
LegacyDataSourceInfo,
ProcessRuleResponse,
} from '@/models/datasets'
import type { FullDocumentDetail } from '@/models/datasets'
import type { RETRIEVE_METHOD } from '@/types/app'
import {
RiArrowRightLine,
RiCheckboxCircleFill,
RiErrorWarningFill,
RiLoader2Fill,
RiTerminalBoxLine,
} from '@remixicon/react'
import Image from 'next/image'
import Link from 'next/link'
import { useRouter } from 'next/navigation'
import * as React from 'react'
import { useCallback, useEffect, useMemo, useRef, useState } from 'react'
import { useMemo } from 'react'
import { useTranslation } from 'react-i18next'
import Button from '@/app/components/base/button'
import Divider from '@/app/components/base/divider'
import { ZapFast } from '@/app/components/base/icons/src/vender/solid/general'
import NotionIcon from '@/app/components/base/notion-icon'
import Tooltip from '@/app/components/base/tooltip'
import PriorityLabel from '@/app/components/billing/priority-label'
import { Plan } from '@/app/components/billing/type'
import UpgradeBtn from '@/app/components/billing/upgrade-btn'
import { FieldInfo } from '@/app/components/datasets/documents/detail/metadata'
import { useProviderContext } from '@/context/provider-context'
import { useDatasetApiAccessUrl } from '@/hooks/use-api-access-url'
import { DataSourceType, ProcessMode } from '@/models/datasets'
import { fetchIndexingStatusBatch as doFetchIndexingStatus } from '@/service/datasets'
import { useProcessRule } from '@/service/knowledge/use-dataset'
import { useInvalidDocumentList } from '@/service/knowledge/use-document'
import { RETRIEVE_METHOD } from '@/types/app'
import { sleep } from '@/utils'
import { cn } from '@/utils/classnames'
import DocumentFileIcon from '../../common/document-file-icon'
import { indexMethodIcon, retrievalIcon } from '../icons'
import { IndexingType } from '../step-two'
import IndexingProgressItem from './indexing-progress-item'
import RuleDetail from './rule-detail'
import UpgradeBanner from './upgrade-banner'
import { useIndexingStatusPolling } from './use-indexing-status-polling'
import { createDocumentLookup } from './utils'
type Props = {
type EmbeddingProcessProps = {
datasetId: string
batchId: string
documents?: FullDocumentDetail[]
@ -49,220 +31,14 @@ type Props = {
retrievalMethod?: RETRIEVE_METHOD
}
const RuleDetail: FC<{
sourceData?: ProcessRuleResponse
indexingType?: string
retrievalMethod?: RETRIEVE_METHOD
}> = ({ sourceData, indexingType, retrievalMethod }) => {
// Status header component
const StatusHeader: FC<{ isEmbedding: boolean, isCompleted: boolean }> = ({
isEmbedding,
isCompleted,
}) => {
const { t } = useTranslation()
const segmentationRuleMap = {
mode: t('embedding.mode', { ns: 'datasetDocuments' }),
segmentLength: t('embedding.segmentLength', { ns: 'datasetDocuments' }),
textCleaning: t('embedding.textCleaning', { ns: 'datasetDocuments' }),
}
const getRuleName = (key: string) => {
if (key === 'remove_extra_spaces')
return t('stepTwo.removeExtraSpaces', { ns: 'datasetCreation' })
if (key === 'remove_urls_emails')
return t('stepTwo.removeUrlEmails', { ns: 'datasetCreation' })
if (key === 'remove_stopwords')
return t('stepTwo.removeStopwords', { ns: 'datasetCreation' })
}
const isNumber = (value: unknown) => {
return typeof value === 'number'
}
const getValue = useCallback((field: string) => {
let value: string | number | undefined = '-'
const maxTokens = isNumber(sourceData?.rules?.segmentation?.max_tokens)
? sourceData.rules.segmentation.max_tokens
: value
const childMaxTokens = isNumber(sourceData?.rules?.subchunk_segmentation?.max_tokens)
? sourceData.rules.subchunk_segmentation.max_tokens
: value
switch (field) {
case 'mode':
value = !sourceData?.mode
? value
: sourceData.mode === ProcessMode.general
? (t('embedding.custom', { ns: 'datasetDocuments' }) as string)
: `${t('embedding.hierarchical', { ns: 'datasetDocuments' })} · ${sourceData?.rules?.parent_mode === 'paragraph'
? t('parentMode.paragraph', { ns: 'dataset' })
: t('parentMode.fullDoc', { ns: 'dataset' })}`
break
case 'segmentLength':
value = !sourceData?.mode
? value
: sourceData.mode === ProcessMode.general
? maxTokens
: `${t('embedding.parentMaxTokens', { ns: 'datasetDocuments' })} ${maxTokens}; ${t('embedding.childMaxTokens', { ns: 'datasetDocuments' })} ${childMaxTokens}`
break
default:
value = !sourceData?.mode
? value
: sourceData?.rules?.pre_processing_rules?.filter(rule =>
rule.enabled).map(rule => getRuleName(rule.id)).join(',')
break
}
return value
}, [sourceData])
return (
<div className="flex flex-col gap-1">
{Object.keys(segmentationRuleMap).map((field) => {
return (
<FieldInfo
key={field}
label={segmentationRuleMap[field as keyof typeof segmentationRuleMap]}
displayedValue={String(getValue(field))}
/>
)
})}
<FieldInfo
label={t('stepTwo.indexMode', { ns: 'datasetCreation' })}
displayedValue={t(`stepTwo.${indexingType === IndexingType.ECONOMICAL ? 'economical' : 'qualified'}`, { ns: 'datasetCreation' }) as string}
valueIcon={(
<Image
className="size-4"
src={
indexingType === IndexingType.ECONOMICAL
? indexMethodIcon.economical
: indexMethodIcon.high_quality
}
alt=""
/>
)}
/>
<FieldInfo
label={t('form.retrievalSetting.title', { ns: 'datasetSettings' })}
// displayedValue={t(`datasetSettings.form.retrievalSetting.${retrievalMethod}`) as string}
displayedValue={t(`retrieval.${indexingType === IndexingType.ECONOMICAL ? 'keyword_search' : retrievalMethod ?? 'semantic_search'}.title`, { ns: 'dataset' })}
valueIcon={(
<Image
className="size-4"
src={
retrievalMethod === RETRIEVE_METHOD.fullText
? retrievalIcon.fullText
: retrievalMethod === RETRIEVE_METHOD.hybrid
? retrievalIcon.hybrid
: retrievalIcon.vector
}
alt=""
/>
)}
/>
</div>
)
}
const EmbeddingProcess: FC<Props> = ({ datasetId, batchId, documents = [], indexingType, retrievalMethod }) => {
const { t } = useTranslation()
const { enableBilling, plan } = useProviderContext()
const getFirstDocument = documents[0]
const [indexingStatusBatchDetail, setIndexingStatusDetail] = useState<IndexingStatusResponse[]>([])
const fetchIndexingStatus = async () => {
const status = await doFetchIndexingStatus({ datasetId, batchId })
setIndexingStatusDetail(status.data)
return status.data
}
const [isStopQuery, setIsStopQuery] = useState(false)
const isStopQueryRef = useRef(isStopQuery)
useEffect(() => {
isStopQueryRef.current = isStopQuery
}, [isStopQuery])
const stopQueryStatus = () => {
setIsStopQuery(true)
}
const startQueryStatus = async () => {
if (isStopQueryRef.current)
return
try {
const indexingStatusBatchDetail = await fetchIndexingStatus()
const isCompleted = indexingStatusBatchDetail.every(indexingStatusDetail => ['completed', 'error', 'paused'].includes(indexingStatusDetail.indexing_status))
if (isCompleted) {
stopQueryStatus()
return
}
await sleep(2500)
await startQueryStatus()
}
catch {
await sleep(2500)
await startQueryStatus()
}
}
useEffect(() => {
setIsStopQuery(false)
startQueryStatus()
return () => {
stopQueryStatus()
}
}, [])
// get rule
const { data: ruleDetail } = useProcessRule(getFirstDocument?.id)
const router = useRouter()
const invalidDocumentList = useInvalidDocumentList()
const navToDocumentList = () => {
invalidDocumentList()
router.push(`/datasets/${datasetId}/documents`)
}
const apiReferenceUrl = useDatasetApiAccessUrl()
const isEmbedding = useMemo(() => {
return indexingStatusBatchDetail.some(indexingStatusDetail => ['indexing', 'splitting', 'parsing', 'cleaning'].includes(indexingStatusDetail?.indexing_status || ''))
}, [indexingStatusBatchDetail])
const isEmbeddingCompleted = useMemo(() => {
return indexingStatusBatchDetail.every(indexingStatusDetail => ['completed', 'error', 'paused'].includes(indexingStatusDetail?.indexing_status || ''))
}, [indexingStatusBatchDetail])
const getSourceName = (id: string) => {
const doc = documents.find(document => document.id === id)
return doc?.name
}
const getFileType = (name?: string) => name?.split('.').pop() || 'txt'
const getSourcePercent = (detail: IndexingStatusResponse) => {
const completedCount = detail.completed_segments || 0
const totalCount = detail.total_segments || 0
if (totalCount === 0)
return 0
const percent = Math.round(completedCount * 100 / totalCount)
return percent > 100 ? 100 : percent
}
const getSourceType = (id: string) => {
const doc = documents.find(document => document.id === id)
return doc?.data_source_type as DataSourceType
}
const isLegacyDataSourceInfo = (info: DataSourceInfo): info is LegacyDataSourceInfo => {
return info != null && typeof (info as LegacyDataSourceInfo).upload_file === 'object'
}
const getIcon = (id: string) => {
const doc = documents.find(document => document.id === id)
const info = doc?.data_source_info
if (info && isLegacyDataSourceInfo(info))
return info.notion_page_icon
return undefined
}
const isSourceEmbedding = (detail: IndexingStatusResponse) =>
['indexing', 'splitting', 'parsing', 'cleaning', 'waiting'].includes(detail.indexing_status || '')
return (
<>
<div className="flex flex-col gap-y-3">
<div className="system-md-semibold-uppercase flex items-center gap-x-1 text-text-secondary">
{isEmbedding && (
<>
@ -270,99 +46,22 @@ const EmbeddingProcess: FC<Props> = ({ datasetId, batchId, documents = [], index
<span>{t('embedding.processing', { ns: 'datasetDocuments' })}</span>
</>
)}
{isEmbeddingCompleted && t('embedding.completed', { ns: 'datasetDocuments' })}
</div>
{
enableBilling && plan.type !== Plan.team && (
<div className="flex h-14 items-center rounded-xl border-[0.5px] border-black/5 bg-white p-3 shadow-md">
<div className="flex h-8 w-8 shrink-0 items-center justify-center rounded-lg bg-[#FFF6ED]">
<ZapFast className="h-4 w-4 text-[#FB6514]" />
</div>
<div className="mx-3 grow text-[13px] font-medium text-gray-700">
{t('plansCommon.documentProcessingPriorityUpgrade', { ns: 'billing' })}
</div>
<UpgradeBtn loc="knowledge-speed-up" />
{isCompleted && t('embedding.completed', { ns: 'datasetDocuments' })}
</div>
)
}
<div className="flex flex-col gap-0.5 pb-2">
{indexingStatusBatchDetail.map(indexingStatusDetail => (
<div
key={indexingStatusDetail.id}
className={cn(
'relative h-[26px] overflow-hidden rounded-md bg-components-progress-bar-bg',
indexingStatusDetail.indexing_status === 'error' && 'bg-state-destructive-hover-alt',
)}
>
{isSourceEmbedding(indexingStatusDetail) && (
<div
className="absolute left-0 top-0 h-full min-w-0.5 border-r-[2px] border-r-components-progress-bar-progress-highlight bg-components-progress-bar-progress"
style={{ width: `${getSourcePercent(indexingStatusDetail)}%` }}
/>
)}
<div className="z-[1] flex h-full items-center gap-1 pl-[6px] pr-2">
{getSourceType(indexingStatusDetail.id) === DataSourceType.FILE && (
<DocumentFileIcon
size="sm"
className="shrink-0"
name={getSourceName(indexingStatusDetail.id)}
extension={getFileType(getSourceName(indexingStatusDetail.id))}
/>
)}
{getSourceType(indexingStatusDetail.id) === DataSourceType.NOTION && (
<NotionIcon
className="shrink-0"
type="page"
src={getIcon(indexingStatusDetail.id)}
/>
)}
<div className="flex w-0 grow items-center gap-1" title={getSourceName(indexingStatusDetail.id)}>
<div className="system-xs-medium truncate text-text-secondary">
{getSourceName(indexingStatusDetail.id)}
</div>
{
enableBilling && (
<PriorityLabel className="ml-0" />
)
}
</div>
{isSourceEmbedding(indexingStatusDetail) && (
<div className="shrink-0 text-xs text-text-secondary">{`${getSourcePercent(indexingStatusDetail)}%`}</div>
)}
{indexingStatusDetail.indexing_status === 'error' && (
<Tooltip
popupClassName="px-4 py-[14px] max-w-60 body-xs-regular text-text-secondary border-[0.5px] border-components-panel-border rounded-xl"
offset={4}
popupContent={indexingStatusDetail.error}
>
<span>
<RiErrorWarningFill className="size-4 shrink-0 text-text-destructive" />
</span>
</Tooltip>
)}
{indexingStatusDetail.indexing_status === 'completed' && (
<RiCheckboxCircleFill className="size-4 shrink-0 text-text-success" />
)}
</div>
</div>
))}
</div>
<Divider type="horizontal" className="my-0 bg-divider-subtle" />
<RuleDetail
sourceData={ruleDetail}
indexingType={indexingType}
retrievalMethod={retrievalMethod}
/>
</div>
// Action buttons component
const ActionButtons: FC<{
apiReferenceUrl: string
onNavToDocuments: () => void
}> = ({ apiReferenceUrl, onNavToDocuments }) => {
const { t } = useTranslation()
return (
<div className="mt-6 flex items-center gap-x-2 py-2">
<Link
href={apiReferenceUrl}
target="_blank"
rel="noopener noreferrer"
>
<Button
className="w-fit gap-x-0.5 px-3"
>
<Link href={apiReferenceUrl} target="_blank" rel="noopener noreferrer">
<Button className="w-fit gap-x-0.5 px-3">
<RiTerminalBoxLine className="size-4" />
<span className="px-0.5">Access the API</span>
</Button>
@ -370,12 +69,83 @@ const EmbeddingProcess: FC<Props> = ({ datasetId, batchId, documents = [], index
<Button
className="w-fit gap-x-0.5 px-3"
variant="primary"
onClick={navToDocumentList}
onClick={onNavToDocuments}
>
<span className="px-0.5">{t('stepThree.navTo', { ns: 'datasetCreation' })}</span>
<RiArrowRightLine className="size-4 stroke-current stroke-1" />
</Button>
</div>
)
}
const EmbeddingProcess: FC<EmbeddingProcessProps> = ({
datasetId,
batchId,
documents = [],
indexingType,
retrievalMethod,
}) => {
const { enableBilling, plan } = useProviderContext()
const router = useRouter()
const invalidDocumentList = useInvalidDocumentList()
const apiReferenceUrl = useDatasetApiAccessUrl()
// Polling hook for indexing status
const { statusList, isEmbedding, isEmbeddingCompleted } = useIndexingStatusPolling({
datasetId,
batchId,
})
// Get process rule for the first document
const firstDocumentId = documents[0]?.id
const { data: ruleDetail } = useProcessRule(firstDocumentId)
// Document lookup utilities - memoized for performance
const documentLookup = useMemo(
() => createDocumentLookup(documents),
[documents],
)
const handleNavToDocuments = () => {
invalidDocumentList()
router.push(`/datasets/${datasetId}/documents`)
}
const showUpgradeBanner = enableBilling && plan.type !== Plan.team
return (
<>
<div className="flex flex-col gap-y-3">
<StatusHeader isEmbedding={isEmbedding} isCompleted={isEmbeddingCompleted} />
{showUpgradeBanner && <UpgradeBanner />}
<div className="flex flex-col gap-0.5 pb-2">
{statusList.map(detail => (
<IndexingProgressItem
key={detail.id}
detail={detail}
name={documentLookup.getName(detail.id)}
sourceType={documentLookup.getSourceType(detail.id)}
notionIcon={documentLookup.getNotionIcon(detail.id)}
enableBilling={enableBilling}
/>
))}
</div>
<Divider type="horizontal" className="my-0 bg-divider-subtle" />
<RuleDetail
sourceData={ruleDetail}
indexingType={indexingType}
retrievalMethod={retrievalMethod}
/>
</div>
<ActionButtons
apiReferenceUrl={apiReferenceUrl}
onNavToDocuments={handleNavToDocuments}
/>
</>
)
}

View File

@ -0,0 +1,120 @@
import type { FC } from 'react'
import type { IndexingStatusResponse } from '@/models/datasets'
import {
RiCheckboxCircleFill,
RiErrorWarningFill,
} from '@remixicon/react'
import NotionIcon from '@/app/components/base/notion-icon'
import Tooltip from '@/app/components/base/tooltip'
import PriorityLabel from '@/app/components/billing/priority-label'
import { DataSourceType } from '@/models/datasets'
import { cn } from '@/utils/classnames'
import DocumentFileIcon from '../../common/document-file-icon'
import { getFileType, getSourcePercent, isSourceEmbedding } from './utils'
type IndexingProgressItemProps = {
detail: IndexingStatusResponse
name?: string
sourceType?: DataSourceType
notionIcon?: string
enableBilling?: boolean
}
// Status icon component for completed/error states
const StatusIcon: FC<{ status: string, error?: string }> = ({ status, error }) => {
if (status === 'completed')
return <RiCheckboxCircleFill className="size-4 shrink-0 text-text-success" />
if (status === 'error') {
return (
<Tooltip
popupClassName="px-4 py-[14px] max-w-60 body-xs-regular text-text-secondary border-[0.5px] border-components-panel-border rounded-xl"
offset={4}
popupContent={error}
>
<span>
<RiErrorWarningFill className="size-4 shrink-0 text-text-destructive" />
</span>
</Tooltip>
)
}
return null
}
// Source type icon component
const SourceTypeIcon: FC<{
sourceType?: DataSourceType
name?: string
notionIcon?: string
}> = ({ sourceType, name, notionIcon }) => {
if (sourceType === DataSourceType.FILE) {
return (
<DocumentFileIcon
size="sm"
className="shrink-0"
name={name}
extension={getFileType(name)}
/>
)
}
if (sourceType === DataSourceType.NOTION) {
return (
<NotionIcon
className="shrink-0"
type="page"
src={notionIcon}
/>
)
}
return null
}
const IndexingProgressItem: FC<IndexingProgressItemProps> = ({
detail,
name,
sourceType,
notionIcon,
enableBilling,
}) => {
const isEmbedding = isSourceEmbedding(detail)
const percent = getSourcePercent(detail)
const isError = detail.indexing_status === 'error'
return (
<div
className={cn(
'relative h-[26px] overflow-hidden rounded-md bg-components-progress-bar-bg',
isError && 'bg-state-destructive-hover-alt',
)}
>
{isEmbedding && (
<div
className="absolute left-0 top-0 h-full min-w-0.5 border-r-[2px] border-r-components-progress-bar-progress-highlight bg-components-progress-bar-progress"
style={{ width: `${percent}%` }}
/>
)}
<div className="z-[1] flex h-full items-center gap-1 pl-[6px] pr-2">
<SourceTypeIcon
sourceType={sourceType}
name={name}
notionIcon={notionIcon}
/>
<div className="flex w-0 grow items-center gap-1" title={name}>
<div className="system-xs-medium truncate text-text-secondary">
{name}
</div>
{enableBilling && <PriorityLabel className="ml-0" />}
</div>
{isEmbedding && (
<div className="shrink-0 text-xs text-text-secondary">{`${percent}%`}</div>
)}
<StatusIcon status={detail.indexing_status} error={detail.error} />
</div>
</div>
)
}
export default IndexingProgressItem

View File

@ -0,0 +1,133 @@
import type { FC } from 'react'
import type { ProcessRuleResponse } from '@/models/datasets'
import Image from 'next/image'
import { useCallback } from 'react'
import { useTranslation } from 'react-i18next'
import { FieldInfo } from '@/app/components/datasets/documents/detail/metadata'
import { ProcessMode } from '@/models/datasets'
import { RETRIEVE_METHOD } from '@/types/app'
import { indexMethodIcon, retrievalIcon } from '../icons'
import { IndexingType } from '../step-two'
type RuleDetailProps = {
sourceData?: ProcessRuleResponse
indexingType?: string
retrievalMethod?: RETRIEVE_METHOD
}
// Lookup table for pre-processing rule names
const PRE_PROCESSING_RULE_KEYS = {
remove_extra_spaces: 'stepTwo.removeExtraSpaces',
remove_urls_emails: 'stepTwo.removeUrlEmails',
remove_stopwords: 'stepTwo.removeStopwords',
} as const
// Lookup table for retrieval method icons
const RETRIEVAL_ICON_MAP: Partial<Record<RETRIEVE_METHOD, string>> = {
[RETRIEVE_METHOD.fullText]: retrievalIcon.fullText,
[RETRIEVE_METHOD.hybrid]: retrievalIcon.hybrid,
[RETRIEVE_METHOD.semantic]: retrievalIcon.vector,
[RETRIEVE_METHOD.invertedIndex]: retrievalIcon.fullText,
[RETRIEVE_METHOD.keywordSearch]: retrievalIcon.fullText,
}
const isNumber = (value: unknown): value is number => typeof value === 'number'
const RuleDetail: FC<RuleDetailProps> = ({ sourceData, indexingType, retrievalMethod }) => {
const { t } = useTranslation()
const segmentationRuleLabels = {
mode: t('embedding.mode', { ns: 'datasetDocuments' }),
segmentLength: t('embedding.segmentLength', { ns: 'datasetDocuments' }),
textCleaning: t('embedding.textCleaning', { ns: 'datasetDocuments' }),
}
const getRuleName = useCallback((key: string): string | undefined => {
const translationKey = PRE_PROCESSING_RULE_KEYS[key as keyof typeof PRE_PROCESSING_RULE_KEYS]
return translationKey ? t(translationKey, { ns: 'datasetCreation' }) : undefined
}, [t])
const getModeValue = useCallback((): string => {
if (!sourceData?.mode)
return '-'
if (sourceData.mode === ProcessMode.general)
return t('embedding.custom', { ns: 'datasetDocuments' })
const parentModeLabel = sourceData.rules?.parent_mode === 'paragraph'
? t('parentMode.paragraph', { ns: 'dataset' })
: t('parentMode.fullDoc', { ns: 'dataset' })
return `${t('embedding.hierarchical', { ns: 'datasetDocuments' })} · ${parentModeLabel}`
}, [sourceData, t])
const getSegmentLengthValue = useCallback((): string | number => {
if (!sourceData?.mode)
return '-'
const maxTokens = isNumber(sourceData.rules?.segmentation?.max_tokens)
? sourceData.rules.segmentation.max_tokens
: '-'
if (sourceData.mode === ProcessMode.general)
return maxTokens
const childMaxTokens = isNumber(sourceData.rules?.subchunk_segmentation?.max_tokens)
? sourceData.rules.subchunk_segmentation.max_tokens
: '-'
return `${t('embedding.parentMaxTokens', { ns: 'datasetDocuments' })} ${maxTokens}; ${t('embedding.childMaxTokens', { ns: 'datasetDocuments' })} ${childMaxTokens}`
}, [sourceData, t])
const getTextCleaningValue = useCallback((): string => {
if (!sourceData?.mode)
return '-'
const enabledRules = sourceData.rules?.pre_processing_rules?.filter(rule => rule.enabled) || []
const ruleNames = enabledRules
.map((rule) => {
const name = getRuleName(rule.id)
return typeof name === 'string' ? name : ''
})
.filter(name => name)
return ruleNames.length > 0 ? ruleNames.join(',') : '-'
}, [sourceData, getRuleName])
const fieldValueGetters: Record<string, () => string | number> = {
mode: getModeValue,
segmentLength: getSegmentLengthValue,
textCleaning: getTextCleaningValue,
}
const isEconomical = indexingType === IndexingType.ECONOMICAL
const indexMethodIconSrc = isEconomical ? indexMethodIcon.economical : indexMethodIcon.high_quality
const indexModeLabel = t(`stepTwo.${isEconomical ? 'economical' : 'qualified'}`, { ns: 'datasetCreation' })
const effectiveRetrievalMethod = isEconomical ? 'keyword_search' : (retrievalMethod ?? 'semantic_search')
const retrievalLabel = t(`retrieval.${effectiveRetrievalMethod}.title`, { ns: 'dataset' })
const retrievalIconSrc = RETRIEVAL_ICON_MAP[retrievalMethod as keyof typeof RETRIEVAL_ICON_MAP] ?? retrievalIcon.vector
return (
<div className="flex flex-col gap-1">
{Object.keys(segmentationRuleLabels).map(field => (
<FieldInfo
key={field}
label={segmentationRuleLabels[field as keyof typeof segmentationRuleLabels]}
displayedValue={String(fieldValueGetters[field]())}
/>
))}
<FieldInfo
label={t('stepTwo.indexMode', { ns: 'datasetCreation' })}
displayedValue={indexModeLabel}
valueIcon={<Image className="size-4" src={indexMethodIconSrc} alt="" />}
/>
<FieldInfo
label={t('form.retrievalSetting.title', { ns: 'datasetSettings' })}
displayedValue={retrievalLabel}
valueIcon={<Image className="size-4" src={retrievalIconSrc} alt="" />}
/>
</div>
)
}
export default RuleDetail

View File

@ -0,0 +1,22 @@
import type { FC } from 'react'
import { useTranslation } from 'react-i18next'
import { ZapFast } from '@/app/components/base/icons/src/vender/solid/general'
import UpgradeBtn from '@/app/components/billing/upgrade-btn'
const UpgradeBanner: FC = () => {
const { t } = useTranslation()
return (
<div className="flex h-14 items-center rounded-xl border-[0.5px] border-black/5 bg-white p-3 shadow-md">
<div className="flex h-8 w-8 shrink-0 items-center justify-center rounded-lg bg-[#FFF6ED]">
<ZapFast className="h-4 w-4 text-[#FB6514]" />
</div>
<div className="mx-3 grow text-[13px] font-medium text-gray-700">
{t('plansCommon.documentProcessingPriorityUpgrade', { ns: 'billing' })}
</div>
<UpgradeBtn loc="knowledge-speed-up" />
</div>
)
}
export default UpgradeBanner

View File

@ -0,0 +1,90 @@
import type { IndexingStatusResponse } from '@/models/datasets'
import { useEffect, useRef, useState } from 'react'
import { fetchIndexingStatusBatch } from '@/service/datasets'
const POLLING_INTERVAL = 2500
const COMPLETED_STATUSES = ['completed', 'error', 'paused'] as const
const EMBEDDING_STATUSES = ['indexing', 'splitting', 'parsing', 'cleaning', 'waiting'] as const
type IndexingStatusPollingParams = {
datasetId: string
batchId: string
}
type IndexingStatusPollingResult = {
statusList: IndexingStatusResponse[]
isEmbedding: boolean
isEmbeddingCompleted: boolean
}
const isStatusCompleted = (status: string): boolean =>
COMPLETED_STATUSES.includes(status as typeof COMPLETED_STATUSES[number])
const isAllCompleted = (statusList: IndexingStatusResponse[]): boolean =>
statusList.every(item => isStatusCompleted(item.indexing_status))
/**
* Custom hook for polling indexing status with automatic stop on completion.
* Handles the polling lifecycle and provides derived states for UI rendering.
*/
export const useIndexingStatusPolling = ({
datasetId,
batchId,
}: IndexingStatusPollingParams): IndexingStatusPollingResult => {
const [statusList, setStatusList] = useState<IndexingStatusResponse[]>([])
const isStopPollingRef = useRef(false)
useEffect(() => {
// Reset polling state on mount
isStopPollingRef.current = false
let timeoutId: ReturnType<typeof setTimeout> | null = null
const fetchStatus = async (): Promise<IndexingStatusResponse[]> => {
const response = await fetchIndexingStatusBatch({ datasetId, batchId })
setStatusList(response.data)
return response.data
}
const poll = async (): Promise<void> => {
if (isStopPollingRef.current)
return
try {
const data = await fetchStatus()
if (isAllCompleted(data)) {
isStopPollingRef.current = true
return
}
}
catch {
// Continue polling on error
}
if (!isStopPollingRef.current) {
timeoutId = setTimeout(() => {
poll()
}, POLLING_INTERVAL)
}
}
poll()
return () => {
isStopPollingRef.current = true
if (timeoutId)
clearTimeout(timeoutId)
}
}, [datasetId, batchId])
const isEmbedding = statusList.some(item =>
EMBEDDING_STATUSES.includes(item?.indexing_status as typeof EMBEDDING_STATUSES[number]),
)
const isEmbeddingCompleted = statusList.length > 0 && isAllCompleted(statusList)
return {
statusList,
isEmbedding,
isEmbeddingCompleted,
}
}

View File

@ -0,0 +1,64 @@
import type {
DataSourceInfo,
DataSourceType,
FullDocumentDetail,
IndexingStatusResponse,
LegacyDataSourceInfo,
} from '@/models/datasets'
const EMBEDDING_STATUSES = ['indexing', 'splitting', 'parsing', 'cleaning', 'waiting'] as const
/**
* Type guard for legacy data source info with upload_file property
*/
export const isLegacyDataSourceInfo = (info: DataSourceInfo): info is LegacyDataSourceInfo => {
return info != null && typeof (info as LegacyDataSourceInfo).upload_file === 'object'
}
/**
* Check if a status indicates the source is being embedded
*/
export const isSourceEmbedding = (detail: IndexingStatusResponse): boolean =>
EMBEDDING_STATUSES.includes(detail.indexing_status as typeof EMBEDDING_STATUSES[number])
/**
* Calculate the progress percentage for a document
*/
export const getSourcePercent = (detail: IndexingStatusResponse): number => {
const completedCount = detail.completed_segments || 0
const totalCount = detail.total_segments || 0
if (totalCount === 0)
return 0
const percent = Math.round(completedCount * 100 / totalCount)
return Math.min(percent, 100)
}
/**
* Get file extension from filename, defaults to 'txt'
*/
export const getFileType = (name?: string): string =>
name?.split('.').pop() || 'txt'
/**
* Document lookup utilities - provides document info by ID from a list
*/
export const createDocumentLookup = (documents: FullDocumentDetail[]) => {
const documentMap = new Map(documents.map(doc => [doc.id, doc]))
return {
getDocument: (id: string) => documentMap.get(id),
getName: (id: string) => documentMap.get(id)?.name,
getSourceType: (id: string) => documentMap.get(id)?.data_source_type as DataSourceType | undefined,
getNotionIcon: (id: string) => {
const info = documentMap.get(id)?.data_source_info
if (info && isLegacyDataSourceInfo(info))
return info.notion_page_icon
return undefined
},
}
}