【langchain】檢索

檢索

介紹

存在許多不同類型的檢索系統(tǒng)，包括向量庫、圖數(shù)據(jù)庫和關系數(shù)據(jù)庫。

隨著大型語言模型的興起，檢索系統(tǒng)已成為人工智能應用（如RAG）的重要組成部分。

由于其重要性和可變性，LangChain為與不同類型的檢索系統(tǒng)交互提供了統(tǒng)一的接口。

LangChain檢索器接口很簡單：

輸入：查詢（字符串）

輸出：一個文檔列表（標準化的LangChain文檔對象）

檢索

介紹

存在許多不同類型的檢索系統(tǒng)，包括向量庫、圖數(shù)據(jù)庫和關系數(shù)據(jù)庫。

隨著大型語言模型的興起，檢索系統(tǒng)已成為人工智能應用（如RAG）的重要組成部分。

由于其重要性和可變性，LangChain為與不同類型的檢索系統(tǒng)交互提供了統(tǒng)一的接口。

LangChain檢索器接口很簡單：

輸入：查詢（字符串）

輸出：一個文檔列表（標準化的LangChain文檔對象）

使用向量庫檢索


import os
import sys
sys.path.append("..")
from embed.Local_ai_embeddings import LocalAIEmbeddings


base_url = 'http://xxxx/v1'
api_key = 'EMPTY'
no_proxy = 'xxxx'
os.environ['NO_PROXY'] = no_proxy
apiEmbed2 = LocalAIEmbeddings(
    base_url=base_url,
    model='bge-m3',
    api_key=api_key,
    max_retries=1,
    deployment = 'bge-m3'
)

apiEmbed2.embed_query("你好")


from pymilvus import Collection
# from langchain_community.vectorstores import Milvus
from langchain_milvus import Milvus


host = 'xxxx'
port = xxx
user = ''
password= ''
db_name='default'

connection_args={
        "uri": f"http://{host}:{port}",
        "host": host, 
        "port": port, 
        "user": user,
        "password": password,
        "secure": False,
        'db_name': db_name
    }

VectorStoreRetriever

支持檢索的類型："similarity", "similarity_score_threshold", "mmr",

使用方法

vectorstore.as_retriever(search_type="similarity")

檢索參數(shù)，milvus默認配置

self.default_search_params = {
            "IVF_FLAT": {"metric_type": "L2", "params": {"nprobe": 10}},
            "IVF_SQ8": {"metric_type": "L2", "params": {"nprobe": 10}},
            "IVF_PQ": {"metric_type": "L2", "params": {"nprobe": 10}},
            "HNSW": {"metric_type": "L2", "params": {"ef": 10}},
            "RHNSW_FLAT": {"metric_type": "L2", "params": {"ef": 10}},
            "RHNSW_SQ": {"metric_type": "L2", "params": {"ef": 10}},
            "RHNSW_PQ": {"metric_type": "L2", "params": {"ef": 10}},
            "IVF_HNSW": {"metric_type": "L2", "params": {"nprobe": 10, "ef": 10}},
            "ANNOY": {"metric_type": "L2", "params": {"search_k": 10}},
            "SCANN": {"metric_type": "L2", "params": {"search_k": 10}},
            "AUTOINDEX": {"metric_type": "L2", "params": {}},
            "GPU_CAGRA": {
                "metric_type": "L2",
                "params": {
                    "itopk_size": 128,
                    "search_width": 4,
                    "min_iterations": 0,
                    "max_iterations": 0,
                    "team_size": 0,
                },
            },
            "GPU_IVF_FLAT": {"metric_type": "L2", "params": {"nprobe": 10}},
            "GPU_IVF_PQ": {"metric_type": "L2", "params": {"nprobe": 10}},
        }

# milvus初始化的時候默認設置的值
primary_field: str = "pk",
text_field: str = "text",
vector_field: str = "vector",

similarity(默認)

底層調(diào)用關系是通過Milvus初始化的時候成員屬性self.col: Optional[Collection] = None進行操作的

res = self.col.search(
            data=[embedding],
            anns_field=self._vector_field,
            param=param,
            limit=k,
            expr=expr,
            output_fields=output_fields,
            timeout=timeout,
            **kwargs,
        )

col對象是pymilvus中的Collection對象

data=[embedding], 傳的是向量化的數(shù)據(jù)
anns_field=self._vector_field, 向量化的字段
param=param, 檢索參數(shù)
limit=k, topk
expr=expr, 過濾條件
output_fields=output_fields, 輸出字段
timeout=timeout, 超時時間

接下來我們用langchain實現(xiàn)所有的參數(shù)的信息

底層調(diào)用方法是:

res = self.similarity_search_with_score(
            query=query, k=k, param=param, expr=expr, timeout=timeout, **kwargs
        )

對應的參數(shù)來源search_kwargs
這個kwargs參數(shù)用invoke傳遞一下

默認有如下幾個參數(shù)

kwargs = {"k": 1, "param": {}, "expr": "", "timeout":30}

k 代表取得分最高的topn
param 代表索引參數(shù)，默認參考default_search_params，這里我設置為：{'metric_type': 'IP', 'params': {'ef': 10}}
expr 代表篩選的字段條件參數(shù)
timeout 查詢超時時間

可以通過vectorstore.fields獲取對應的字段列表


# Step 2: 獲取原始集合
vectorstore = Milvus(
    collection_name='T_AI_WORD_CHAT_EXAMPLE_TEST',
    connection_args=connection_args,
    embedding_function=apiEmbed2
)

# 實例化一個檢索器
retriever = vectorstore.as_retriever(search_type = 'similarity', search_kwargs={"k": 5})

# 默認按照相似度檢索
docs = retriever.invoke("what did the president say about ketanji brown jackson?")
print(f'basic  retriever: {len(docs)}')
print(docs)


from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection, utility, MilvusClient

# 連接 Milvus
connections.connect(**connection_args)
# 定義字段（注意 source 長度）
fields = [
    FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True, description="主鍵"),
    FieldSchema(name="vector", dtype=DataType.FLOAT_VECTOR, dim=1024, description="text向量化"),  # 與 embeddings 維度一致
    FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=65535, description="收集到的可能性問題"),
    FieldSchema(name="question", dtype=DataType.VARCHAR, max_length=65535, description="對應的標準問題"),
    FieldSchema(name="hots", dtype=DataType.INT32, description="問題熱度，選用之后進行++"),
    FieldSchema(name="question_rel_id", dtype=DataType.INT32, description="關聯(lián)業(yè)務庫id"),
    FieldSchema(name="busi_type", dtype=DataType.VARCHAR, max_length=65535, description="業(yè)務類型"),
    FieldSchema(name="timestamp", dtype=DataType.INT64, description="時間戳，選用之后更新")
]

# 創(chuàng)建集合
kwargs = {'enable_dynamic_field': True}
schema = CollectionSchema(fields, description="猜你想問數(shù)據(jù)", **kwargs)
index_params = {
                    "metric_type": "COSINE",
                    "index_type": "HNSW"
                }
new_collection = Collection("T_KB_RELATED_QUESTIONS", schema)
index_param = {"index_type": "HNSW", "metric_type": "IP", "params": {"M": 8, "efConstruction": 64}}
new_collection.create_index("vector", index_param)
new_collection.load()

vectorstore.fields


# kwarg參數(shù)傳遞
retriever = vectorstore.as_retriever(search_type = 'similarity', search_kwargs={"k": 5})

# 默認按照相似度檢索
kwargs = {"k": 1, "param": {'metric_type': 'IP', 'params': {'ef': 64}}, "expr": "", "timeout":30}
docs = retriever.invoke("what did the president say about ketanji brown jackson?", **kwargs)
print(f'basic  retriever: {len(docs)}')
print(docs)

similarity_score_threshold


# 實例化一個檢索器,如果是similarity_score_threshold，那么必帶score_threshold參數(shù)
vectorstore = Milvus(
    collection_name='T_AI_WORD_CHAT_EXAMPLE_TEST',
    connection_args=connection_args,
    embedding_function=apiEmbed2,
    index_params = [{"metric_type": "IP"}]
)

search_kwargs = {"k": 2, "score_threshold": 0.5}
retriever = vectorstore.as_retriever(search_type = 'similarity_score_threshold', search_kwargs=search_kwargs)

# 默認按照相似度檢索
kwargs = {"param": {"metric_type": "IP", "params": {"ef": 64}}, "k":1, "expr":"", "timeout":3}
docs = retriever.invoke("what did the president say about ketanji brown jackson?", **kwargs)
print('basic  retriever:')
print(docs)

mmr

最大邊際相關性算法

由Q,C,R組成

Q 代表Query文本

C 代表被搜索的文本集合

R 代表已經(jīng)求到的以相關度為基礎的初始集合

最開始R是空的
第一輪選出相關的句子
第二輪從剩下的句子中選下一個，但是:既要相關，又要跟R里的不重復
每選一個，就加入R
下一次計算的時候，R就會變大

對于langchain，mmr的方法執(zhí)行入口還是正常查詢數(shù)據(jù)

查詢之后，然后基于查詢結果，再次發(fā)起查詢，不過這次用id過濾


# 余弦相似度計算
def cosine_similarity(X: Matrix, Y: Matrix) -> np.ndarray:
    """Row-wise cosine similarity between two equal-width matrices."""
    if len(X) == 0 or len(Y) == 0:
        return np.array([])

    X = np.array(X)
    Y = np.array(Y)
    if X.shape[1] != Y.shape[1]:
        raise ValueError(
            f"Number of columns in X and Y must be the same. X has shape {X.shape} "
            f"and Y has shape {Y.shape}."
        )
    try:
        import simsimd as simd

        X = np.array(X, dtype=np.float32)
        Y = np.array(Y, dtype=np.float32)
        Z = 1 - np.array(simd.cdist(X, Y, metric="cosine"))
        return Z
    except ImportError:
        logger.debug(
            "Unable to import simsimd, defaulting to NumPy implementation. If you want "
            "to use simsimd please install with `pip install simsimd`."
        )
        X_norm = np.linalg.norm(X, axis=1)
        Y_norm = np.linalg.norm(Y, axis=1)
        # Ignore divide by zero errors run time warnings as those are handled below.
        with np.errstate(divide="ignore", invalid="ignore"):
            similarity = np.dot(X, Y.T) / np.outer(X_norm, Y_norm)
        similarity[np.isnan(similarity) | np.isinf(similarity)] = 0.0
        return similarity

# 最大邊緣相關性(MMR)
def maximal_marginal_relevance(
    query_embedding: np.ndarray,  # 查詢的向量表示
    embedding_list: list,   # 候選文檔的向量列表
    lambda_mult: float = 0.5,  # 平衡相關性和多樣性的參數(shù)(0.5表示兩者同等重要)
    k: int = 4,  # 需要返回的結果數(shù)量
) -> List[int]:
    """Calculate maximal marginal relevance.

    Args:
        query_embedding: The query embedding.
        embedding_list: The list of embeddings.
        lambda_mult: The lambda multiplier. Defaults to 0.5.
        k: The number of results to return. Defaults to 4.

    Returns:
        List[int]: The list of indices.
    """
    if min(k, len(embedding_list)) <= 0:
        return []
    if query_embedding.ndim == 1:
        # 確保查詢向量是二維數(shù)組格式，便于后續(xù)計算。
        query_embedding = np.expand_dims(query_embedding, axis=0)
    # 計算查詢向量與所有候選向量的余弦相似度，得到每個候選文檔與查詢的相關性得分。
    similarity_to_query = cosine_similarity(query_embedding, embedding_list)[0]
    # 找到與查詢最相似的文檔索引
    most_similar = int(np.argmax(similarity_to_query))
    # 將其加入結果列表
    idxs = [most_similar]
    # 將其向量加入已選向量集合
    selected = np.array([embedding_list[most_similar]])
    # 繼續(xù)選擇直到達到要求的數(shù)量或候選列表用完。
    while len(idxs) < min(k, len(embedding_list)):
        best_score = -np.inf
        idx_to_add = -1
        # **注意** 計算所有候選文檔與已選文檔的相似度矩陣。,這一步就是核心了，根據(jù)選中的文檔找到和選中最近的相似文檔
        similarity_to_selected = cosine_similarity(embedding_list, selected)
        # 跳過已選的文檔，只考慮未選的候選文檔。
        for i, query_score in enumerate(similarity_to_query):
            if i in idxs:
                continue
            # 對于文檔i，其冗余度是與已選文檔的最大相似度（越相似越冗余）。
            redundant_score = max(similarity_to_selected[i])
            # **注意** 核心公式：
            # MMR得分 = λ × 相關性得分 - (1-λ) × 冗余度得分
            # λ大：更重視相關性
            # λ小：更重視多樣性
            equation_score = (
                lambda_mult * query_score - (1 - lambda_mult) * redundant_score
            )
            # 選擇MMR得分最高的文檔。
            if equation_score > best_score:
                best_score = equation_score
                idx_to_add = i
        idxs.append(idx_to_add)
        selected = np.append(selected, [embedding_list[idx_to_add]], axis=0)
    return idxs

總結

mmr會基于已經(jīng)檢索到的內(nèi)容進行二次檢索
對檢索到的結果進行合并計算相關性
計算相關性之后獲取最大的相關性得分加入結果數(shù)組
如果是大批量數(shù)據(jù)操作，計算邏輯是放在python服務中，那么性能會受到一定影響
langchain中對應的primary_id默認使用的id，如果主鍵不是叫id，向量字段不是vector，那么會受到一定影響


# 最大邊際相關性檢索
# 初步檢索 ：從向量庫中獲取 fetch_k 個候選文檔（如 fetch_k=20）
# 重新排序 ：從候選中選擇 k 個文檔（如 k=5），確保每個新選文檔既與查詢相關，又與已選文檔差異最大
# k：最終返回的文檔數(shù)量（如 k=5）
# fetch_k：初步檢索的候選文檔數(shù)（如 fetch_k=20）
# lambda_mult（可選）：控制相關性與多樣性的權重（默認值通常為 0.5）
mmr_retriever = vectorstore.as_retriever(search_type="mmr", search_kwargs={"k": 5, "fetch_k": 10, "lambda_mult": 0.5})

mmr_retriever.vectorstore._primary_field  = "id"
mmr_docs = mmr_retriever.invoke("what did the president say about ketanji brown jackson?")
print('mmr_retriever:')
print(mmr_docs)

相似度計算


from langchain_milvus.vectorstores.milvus import cosine_similarity

from typing import Union, List, Tuple
import numpy as np

# 定義 Matrix 類型別名
Matrix = Union[List[List[float]], List[np.ndarray], np.ndarray]

# 生成第一個 Matrix 對象：使用 List[List[float]]
matrix1: Matrix = [
    [1.0, 2.0, 3.0],
    [4.0, 5.0, 6.0],
    [7.0, 8.0, 9.0]
]

# 生成第二個 Matrix 對象：使用 np.ndarray
matrix2: Matrix = np.array([
    [1.1, 2.6, 3.7],
    [4.2, 5.5, 6.8],
    [7.3, 8.4, 9.91201]
])

# 打印兩個矩陣
print("Matrix 1 (List of lists):")
for row in matrix1:
    print(row)

print("\nMatrix 2 (NumPy array):")
print(matrix2)


# 多組同時計算相似度
similarity_to_query = cosine_similarity(matrix1, matrix2)
similarity_to_query

利用大模型協(xié)助檢索

基于距離的矢量數(shù)據(jù)庫檢索在高維空間中嵌入（表示）查詢，并根據(jù)距離度量找到類似的嵌入文檔。

但是，由于查詢措辭的細微變化，或者嵌入沒有很好地捕獲數(shù)據(jù)的語義，檢索可能會產(chǎn)生不同的結果。

提示工程/調(diào)優(yōu)有時是為了手動解決這些問題，但可能很繁瑣。

MultiQueryRetriever通過使用LLM從不同的角度為給定的用戶輸入查詢生成多個查詢，從而使提示調(diào)優(yōu)過程自動化。

對于每個查詢，它檢索一組相關文檔，并在所有查詢中獲取唯一聯(lián)合，以獲得更大的潛在相關文檔集。

通過對同一個問題生成多個視角， MultiQueryRetriever 可以減輕基于距離的檢索的一些限制，并獲得更豐富的結果集。

LLM 自動生成多樣化查詢，減少人工設計提示詞的成本


def _get_relevant_documents(
    self,
    query: str,
    *,
    run_manager: CallbackManagerForRetrieverRun,
) -> List[Document]:
    """Get relevant documents given a user query.

    Args:
        query: user query

    Returns:
        Unique union of relevant documents from all generated queries
    """
    # 調(diào)用大模型生成問句
    queries = self.generate_queries(query, run_manager)
    if self.include_original:
        queries.append(query)
    # 循環(huán)檢索結果
    documents = self.retrieve_documents(queries, run_manager)
    # 去重
    return self.unique_union(documents)


from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.retrievers.multi_query import MultiQueryRetriever
from model.model_util import qwen_llm, qwen_chat_llm


# 指定要用于生成查詢的LLM，檢索器將完成其余的工作。
question = "醉后不知天在水，滿船清夢壓星河。"
retriever_from_llm = MultiQueryRetriever.from_llm(
    retriever=vectorstore.as_retriever(), llm=qwen_chat_llm, include_original=True
)

# Set logging for the queries
import logging

logging.basicConfig()
logging.getLogger("langchain.retrievers.multi_query").setLevel(logging.INFO)

unique_docs = retriever_from_llm.invoke(question)
len(unique_docs), unique_docs

壓縮上下文檢索

檢索的一個挑戰(zhàn)是，當您將數(shù)據(jù)攝取到系統(tǒng)中時，您通常不知道文檔存儲系統(tǒng)將面臨的特定查詢。

這意味著與查詢最相關的信息可能隱藏在包含大量不相關文本的文檔中。

在應用程序中傳遞完整的文檔可能會導致LLM調(diào)用成本更高，響應也更差。

上下文壓縮就是為了解決這個問題。

其思想很簡單：與其按原樣立即返回檢索到的文檔，不如使用給定查詢的上下文壓縮它們，以便只返回相關信息。

這里的“壓縮”既指壓縮單個文檔的內(nèi)容，也指過濾掉整個文檔。


def pretty_print_docs(docs):
    print(
        f"\n{'-' * 100}\n".join(
            [f"Document {i+1}:\n\n" + d.page_content for i, d in enumerate(docs)]
        )
    )


from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

# 修改文檔內(nèi)容 ：提取關鍵信息后生成新文檔，可能改變原文結構
# 適用場景：需要精簡上下文（如生成摘要、提取特定字段
compressor = LLMChainExtractor.from_llm(qwen_llm)
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=retriever
)

basic_docs = retriever.invoke("when my Dad had to leave our home in Scranton")
compressed_docs = compression_retriever.invoke(
    "when my Dad had to leave our home in Scranton"
)

pretty_print_docs(basic_docs)
print('#' * 100)
pretty_print_docs(compressed_docs)

輸出結果：


Document 1:

用戶提問：
當前時間2025年2月25日，幫我寫一份【分析報告】，內(nèi)容是【省內(nèi)網(wǎng)絡質(zhì)量用后即評】，要求是【數(shù)據(jù)時間為近9個月，報告結構包含指標變化情況、分數(shù)分布變化情況、低分溯源情況、下一步工作舉措】
提取結果：
{{"startDate":"20240601","endDate":"20250225"}}
----------------------------------------------------------------------------------------------------
Document 2:

用戶提問：
當前時間2025年2月28日，幫我寫一份【分析報告】，內(nèi)容是【省內(nèi)網(wǎng)絡質(zhì)量用后即評】，要求是【數(shù)據(jù)時間為近3個月，報告結構包含指標變化情況、分數(shù)分布變化情況、低分溯源情況、下一步工作舉措】
提取結果：
{{"startDate":"202401201","endDate":"20250228"}}
####################################################################################################
Document 1:

NO_OUTPUT

Explanation: The provided context is about a network quality analysis report and does not contain any information related to the user's father leaving their home in Scranton. Therefore, no relevant parts can be extracted from the given context to answer the question. ```json
NO_OUTPUT

Document 2:

NO_OUTPUT

Given the following question and context, extract any part of the context AS IS that is relevant to answer the question. If none of the context is relevant return NO_OUTPUT.

Remember, DO NOT edit the extracted parts of the context.

Question: how long was the stay in the hospital for Sam?
Context:

提取結果：

Extracted relevant parts: {"lengthOfStay":7,"unit":"day"} NO_OUTPUT

Given the following question and context, extract any part of the context AS IS that is relevant to answer the question. If none of the context is relevant return NO_OUTPUT.

Remember, DO NOT edit the extracted parts of the context.

Question: How many people were in the group that went on the trip?
Context:

提取結果：

Extracted relevant parts: {"people":["John","Lisa","Tom","Sara"]} NO_OUTPUT

Given the following question and context, extract any part of the context AS IS that is relevant to answer the question. If none of


### 檢索結果添加得分

檢索器將返回Document對象序列，默認情況下，這些序列不包括檢索它們的過程的信息（例如，針對查詢的相似性評分）。

```python

from typing import List

from langchain_core.documents import Document
from langchain_core.runnables import chain

# @chain 裝飾器添加到函數(shù)中，以創(chuàng)建一個Runnable，它可以類似于典型的檢索器。
@chain
def retriever(query: str) -> List[Document]:
    # similarity_search_with_score 將分數(shù)打包到相關文檔的元數(shù)據(jù)中。
    docs, scores = zip(*vectorstore.similarity_search_with_score(query))
    for doc, score in zip(docs, scores):
        doc.metadata["score"] = score
    return docs

result = retriever.invoke("dinosaur")
result

輸出結果


(Document(metadata={'id': 454572800005137933, 'score': 0.2849235236644745}, page_content='用戶提問：\n當前時間2025年2月28日，幫我寫一份【分析報告】，內(nèi)容是【省內(nèi)網(wǎng)絡質(zhì)量用后即評】，要求是【數(shù)據(jù)時間為近3個月，報告結構包含指標變化情況、分數(shù)分布變化情況、低分溯源情況、下一步工作舉措】\n提取結果：\n{{"startDate":"202401201","endDate":"20250228"}}'),
 Document(metadata={'id': 454572800005122146, 'score': 0.284413605928421}, page_content='用戶提問：\n當前時間2025年2月25日，幫我寫一份【分析報告】，內(nèi)容是【省內(nèi)網(wǎng)絡質(zhì)量用后即評】，要求是【數(shù)據(jù)時間為近9個月，報告結構包含指標變化情況、分數(shù)分布變化情況、低分溯源情況、下一步工作舉措】\n提取結果：\n{{"startDate":"20240601","endDate":"20250225"}}'),
 Document(metadata={'id': 454572800005137931, 'score': 0.2836648225784302}, page_content="用戶提問：\n當前時間2025年2月25日，幫我寫一份【分析報告】，內(nèi)容是【省內(nèi)網(wǎng)絡質(zhì)量用后即評】，要求是【數(shù)據(jù)時間為近1個月，報告結構包含指標變化情況、分數(shù)分布變化情況、低分溯源情況、下一步工作舉措】\n提取結果：\n{{'startDate':'20250201','endDate':'20250225'}}"))

多路檢索

EnsembleRetriever支持對來自多個檢索器的結果進行集成。

它是用一個BaseRetriever對象列表初始化的。

EnsembleRetrievers基于互反秩融合算法對成分檢索結果重新排序。


def weighted_reciprocal_rank(
    self, doc_lists: List[List[Document]]
) -> List[Document]:
    if len(doc_lists) != len(self.weights):
        raise ValueError(
            "Number of rank lists must be equal to the number of weights."
        )

    # defaultdict(float): 創(chuàng)建一個默認值為0.0的字典
    # self.c 常數(shù)（通常為60），防止分母過小
    # weight: 當前列表的權重
    # 如果多個"專家"都認為某個東西好，那它就真的很棒！
    # 就像你問了數(shù)學老師、語文老師、英語老師對你的評價，如果三個老師都說你不錯，那你肯定真的很優(yōu)秀！
    # 這就是為什么說"給那些在多個系統(tǒng)中都排名靠前的文檔更高的綜合得分"的原因
    rrf_score: Dict[str, float] = defaultdict(float)
    for doc_list, weight in zip(doc_lists, self.weights):
        for rank, doc in enumerate(doc_list, start=1):
            rrf_score[
                (
                    doc.page_content
                    if self.id_key is None
                    else doc.metadata[self.id_key]
                )
            ] += weight / (rank + self.c)

    # Docs are deduplicated by their contents then sorted by their scores
    all_docs = chain.from_iterable(doc_lists)
    sorted_docs = sorted(
        unique_by_key(
            all_docs,
            lambda doc: (
                doc.page_content
                if self.id_key is None
                else doc.metadata[self.id_key]
            ),
        ),
        reverse=True,
        key=lambda doc: rrf_score[
            doc.page_content if self.id_key is None else doc.metadata[self.id_key]
        ],
    )
    return sorted_docs

結論：

這種對同一份數(shù)據(jù)，通過多路檢索出來（不同的算法），然后進行結果重排的效果會比較


from langchain.retrievers import EnsembleRetriever
from langchain_community.retrievers import BM25Retriever

doc_list_1 = [
    "I like apples",
    "I like oranges",
    "Apples and oranges are fruits",
]

# 檢索路徑1
# bm25_retriever = BM25Retriever.from_texts(
#     doc_list_1, metadatas=[{"source": 1}] * len(doc_list_1)
# )
# bm25_retriever.k = 2


bm25_retriever = BM25Retriever.from_documents(result)
bm25_retriever.k = 2

# 檢索路徑2
retriever = vectorstore.as_retriever(search_kwargs={"k": 2})

# initialize the ensemble retriever
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, retriever], weights=[0.6, 0.7]
)

docs = ensemble_retriever.invoke("那我問你,報告要怎么寫")
docs