dify/api/core/rag/rerank/weight_rerank.py
lyzno1 5bbf685035
feat: fix i18n missing keys and merge upstream/main (#24615)
Signed-off-by: -LAN- <laipz8200@outlook.com>
Signed-off-by: kenwoodjw <blackxin55+@gmail.com>
Signed-off-by: Yongtao Huang <yongtaoh2022@gmail.com>
Signed-off-by: yihong0618 <zouzou0208@gmail.com>
Signed-off-by: zhanluxianshen <zhanluxianshen@163.com>
Co-authored-by: -LAN- <laipz8200@outlook.com>
Co-authored-by: GuanMu <ballmanjq@gmail.com>
Co-authored-by: Davide Delbianco <davide.delbianco@outlook.com>
Co-authored-by: NeatGuyCoding <15627489+NeatGuyCoding@users.noreply.github.com>
Co-authored-by: kenwoodjw <blackxin55+@gmail.com>
Co-authored-by: Yongtao Huang <yongtaoh2022@gmail.com>
Co-authored-by: Yongtao Huang <99629139+hyongtao-db@users.noreply.github.com>
Co-authored-by: Qiang Lee <18018968632@163.com>
Co-authored-by: 李强04 <liqiang04@gaotu.cn>
Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
Co-authored-by: Asuka Minato <i@asukaminato.eu.org>
Co-authored-by: Matri Qi <matrixdom@126.com>
Co-authored-by: huayaoyue6 <huayaoyue@163.com>
Co-authored-by: Bowen Liang <liangbowen@gf.com.cn>
Co-authored-by: znn <jubinkumarsoni@gmail.com>
Co-authored-by: crazywoola <427733928@qq.com>
Co-authored-by: crazywoola <100913391+crazywoola@users.noreply.github.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: yihong <zouzou0208@gmail.com>
Co-authored-by: Muke Wang <shaodwaaron@gmail.com>
Co-authored-by: wangmuke <wangmuke@kingsware.cn>
Co-authored-by: Wu Tianwei <30284043+WTW0313@users.noreply.github.com>
Co-authored-by: quicksand <quicksandzn@gmail.com>
Co-authored-by: 非法操作 <hjlarry@163.com>
Co-authored-by: zxhlyh <jasonapring2015@outlook.com>
Co-authored-by: Eric Guo <eric.guocz@gmail.com>
Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: jiangbo721 <jiangbo721@163.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: hjlarry <25834719+hjlarry@users.noreply.github.com>
Co-authored-by: lxsummer <35754229+lxjustdoit@users.noreply.github.com>
Co-authored-by: 湛露先生 <zhanluxianshen@163.com>
Co-authored-by: Guangdong Liu <liugddx@gmail.com>
Co-authored-by: QuantumGhost <obelisk.reg+git@gmail.com>
Co-authored-by: Claude <noreply@anthropic.com>
Co-authored-by: Yessenia-d <yessenia.contact@gmail.com>
Co-authored-by: huangzhuo1949 <167434202+huangzhuo1949@users.noreply.github.com>
Co-authored-by: huangzhuo <huangzhuo1@xiaomi.com>
Co-authored-by: 17hz <0x149527@gmail.com>
Co-authored-by: Amy <1530140574@qq.com>
Co-authored-by: Joel <iamjoel007@gmail.com>
Co-authored-by: Nite Knite <nkCoding@gmail.com>
Co-authored-by: Yeuoly <45712896+Yeuoly@users.noreply.github.com>
Co-authored-by: Petrus Han <petrus.hanks@gmail.com>
Co-authored-by: iamjoel <2120155+iamjoel@users.noreply.github.com>
Co-authored-by: Kalo Chin <frog.beepers.0n@icloud.com>
Co-authored-by: Ujjwal Maurya <ujjwalsbx@gmail.com>
Co-authored-by: Maries <xh001x@hotmail.com>
2025-08-27 15:07:28 +08:00

193 lines
7.0 KiB
Python

import math
from collections import Counter
from typing import Optional
import numpy as np
from core.model_manager import ModelManager
from core.model_runtime.entities.model_entities import ModelType
from core.rag.datasource.keyword.jieba.jieba_keyword_table_handler import JiebaKeywordTableHandler
from core.rag.embedding.cached_embedding import CacheEmbedding
from core.rag.models.document import Document
from core.rag.rerank.entity.weight import VectorSetting, Weights
from core.rag.rerank.rerank_base import BaseRerankRunner
class WeightRerankRunner(BaseRerankRunner):
def __init__(self, tenant_id: str, weights: Weights) -> None:
self.tenant_id = tenant_id
self.weights = weights
def run(
self,
query: str,
documents: list[Document],
score_threshold: Optional[float] = None,
top_n: Optional[int] = None,
user: Optional[str] = None,
) -> list[Document]:
"""
Run rerank model
:param query: search query
:param documents: documents for reranking
:param score_threshold: score threshold
:param top_n: top n
:param user: unique user id if needed
:return:
"""
unique_documents = []
doc_ids = set()
for document in documents:
if (
document.provider == "dify"
and document.metadata is not None
and document.metadata["doc_id"] not in doc_ids
):
doc_ids.add(document.metadata["doc_id"])
unique_documents.append(document)
else:
if document not in unique_documents:
unique_documents.append(document)
documents = unique_documents
query_scores = self._calculate_keyword_score(query, documents)
query_vector_scores = self._calculate_cosine(self.tenant_id, query, documents, self.weights.vector_setting)
rerank_documents = []
for document, query_score, query_vector_score in zip(documents, query_scores, query_vector_scores):
score = (
self.weights.vector_setting.vector_weight * query_vector_score
+ self.weights.keyword_setting.keyword_weight * query_score
)
if score_threshold and score < score_threshold:
continue
if document.metadata is not None:
document.metadata["score"] = score
rerank_documents.append(document)
rerank_documents.sort(key=lambda x: x.metadata["score"] if x.metadata else 0, reverse=True)
return rerank_documents[:top_n] if top_n else rerank_documents
def _calculate_keyword_score(self, query: str, documents: list[Document]) -> list[float]:
"""
Calculate BM25 scores
:param query: search query
:param documents: documents for reranking
:return:
"""
keyword_table_handler = JiebaKeywordTableHandler()
query_keywords = keyword_table_handler.extract_keywords(query, None)
documents_keywords = []
for document in documents:
# get the document keywords
document_keywords = keyword_table_handler.extract_keywords(document.page_content, None)
if document.metadata is not None:
document.metadata["keywords"] = document_keywords
documents_keywords.append(document_keywords)
# Counter query keywords(TF)
query_keyword_counts = Counter(query_keywords)
# total documents
total_documents = len(documents)
# calculate all documents' keywords IDF
all_keywords = set()
for document_keywords in documents_keywords:
all_keywords.update(document_keywords)
keyword_idf = {}
for keyword in all_keywords:
# calculate include query keywords' documents
doc_count_containing_keyword = sum(1 for doc_keywords in documents_keywords if keyword in doc_keywords)
# IDF
keyword_idf[keyword] = math.log((1 + total_documents) / (1 + doc_count_containing_keyword)) + 1
query_tfidf = {}
for keyword, count in query_keyword_counts.items():
tf = count
idf = keyword_idf.get(keyword, 0)
query_tfidf[keyword] = tf * idf
# calculate all documents' TF-IDF
documents_tfidf = []
for document_keywords in documents_keywords:
document_keyword_counts = Counter(document_keywords)
document_tfidf = {}
for keyword, count in document_keyword_counts.items():
tf = count
idf = keyword_idf.get(keyword, 0)
document_tfidf[keyword] = tf * idf
documents_tfidf.append(document_tfidf)
def cosine_similarity(vec1, vec2):
intersection = set(vec1.keys()) & set(vec2.keys())
numerator = sum(vec1[x] * vec2[x] for x in intersection)
sum1 = sum(vec1[x] ** 2 for x in vec1)
sum2 = sum(vec2[x] ** 2 for x in vec2)
denominator = math.sqrt(sum1) * math.sqrt(sum2)
if not denominator:
return 0.0
else:
return float(numerator) / denominator
similarities = []
for document_tfidf in documents_tfidf:
similarity = cosine_similarity(query_tfidf, document_tfidf)
similarities.append(similarity)
# for idx, similarity in enumerate(similarities):
# print(f"Document {idx + 1} similarity: {similarity}")
return similarities
def _calculate_cosine(
self, tenant_id: str, query: str, documents: list[Document], vector_setting: VectorSetting
) -> list[float]:
"""
Calculate Cosine scores
:param query: search query
:param documents: documents for reranking
:return:
"""
query_vector_scores = []
model_manager = ModelManager()
embedding_model = model_manager.get_model_instance(
tenant_id=tenant_id,
provider=vector_setting.embedding_provider_name,
model_type=ModelType.TEXT_EMBEDDING,
model=vector_setting.embedding_model_name,
)
cache_embedding = CacheEmbedding(embedding_model)
query_vector = cache_embedding.embed_query(query)
for document in documents:
# calculate cosine similarity
if document.metadata and "score" in document.metadata:
query_vector_scores.append(document.metadata["score"])
else:
# transform to NumPy
vec1 = np.array(query_vector)
vec2 = np.array(document.vector)
# calculate dot product
dot_product = np.dot(vec1, vec2)
# calculate norm
norm_vec1 = np.linalg.norm(vec1)
norm_vec2 = np.linalg.norm(vec2)
# calculate cosine similarity
cosine_sim = dot_product / (norm_vec1 * norm_vec2)
query_vector_scores.append(cosine_sim)
return query_vector_scores