SAP HANA Cloud Vector Engine

SAP HANA Cloud Vector Engine is a vector store fully integrated into the SAP HANA Cloud database.

You need to install the langchain-community package with pip install -qU langchain-community to use this integration.
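
For reference, the same command as a notebook cell:

%pip install -qU langchain-community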

Setup

Install the HANA database driver.

# Pip install necessary package
%pip install --upgrade --quiet hdbcli

For OpenAIEmbeddings, we use the OpenAI API key from the environment.

import os
# Use OPENAI_API_KEY env variable
# os.environ["OPENAI_API_KEY"] = "Your OpenAI API key"
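
If the variable is not set yet, one common way to provide it interactively is via the standard getpass module (a minimal sketch; the prompt text is illustrative and not part of the original flow):

import getpass

# Prompt for the key only if it is not already present in the environment
if "OPENAI_API_KEY" not in os.environ:
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")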

Create a database connection to a HANA Cloud instance.

from hdbcli import dbapi

# Use connection settings from the environment
connection = dbapi.connect(
    address=os.environ.get("HANA_DB_ADDRESS"),
    port=os.environ.get("HANA_DB_PORT"),
    user=os.environ.get("HANA_DB_USER"),
    password=os.environ.get("HANA_DB_PASSWORD"),
    autocommit=True,
    sslValidateCertificate=False,
)
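
As a quick sanity check, the hdbcli connection object exposes an isconnected() method (a minimal sketch; this check is an addition, not part of the original flow):

# Verify that the database connection was established
print(connection.isconnected())  # Should print True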

Example

Load the sample document "state_of_the_union.txt" and create chunks from it.

from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores.hanavector import HanaDB
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import CharacterTextSplitter

text_documents = TextLoader("../../how_to/state_of_the_union.txt").load()
text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=0)
text_chunks = text_splitter.split_documents(text_documents)
print(f"Number of document chunks: {len(text_chunks)}")

embeddings = OpenAIEmbeddings()

Create a LangChain VectorStore interface for the HANA database and specify the table (collection) to use for accessing the vector embeddings.

db = HanaDB(
    embedding=embeddings, connection=connection, table_name="STATE_OF_THE_UNION"
)

Add the loaded document chunks to the table. For this example, we delete any previous content that might already exist in the table.

# Delete already existing documents from the table
db.delete(filter={})

# Add the loaded document chunks
db.add_documents(text_chunks)

Perform a query to get the two best-matching document chunks that were added in the previous step. By default, "Cosine Similarity" is used for the search.

query = "What did the president say about Ketanji Brown Jackson"
docs = db.similarity_search(query, k=2)

for doc in docs:
    print("-" * 80)
    print(doc.page_content)
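
If you also want to inspect the similarity scores, the standard VectorStore interface provides similarity_search_with_score (a minimal sketch using the same query; the score semantics depend on the configured distance strategy):

# Retrieve documents together with their similarity scores
docs_and_scores = db.similarity_search_with_score(query, k=2)
for doc, score in docs_and_scores:
    print("-" * 80)
    print(f"Score: {score}")
    print(doc.page_content)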

Query the same content with "Euclidean Distance". The results should be the same as with "Cosine Similarity".

from langchain_community.vectorstores.utils import DistanceStrategy

db = HanaDB(
    embedding=embeddings,
    connection=connection,
    distance_strategy=DistanceStrategy.EUCLIDEAN_DISTANCE,
    table_name="STATE_OF_THE_UNION",
)

query = "What did the president say about Ketanji Brown Jackson"
docs = db.similarity_search(query, k=2)
for doc in docs:
    print("-" * 80)
    print(doc.page_content)

Maximal Marginal Relevance Search (MMR)

Maximal marginal relevance optimizes for similarity to the query and diversity among the selected documents. The first 20 (fetch_k) items are retrieved from the database. The MMR algorithm then finds the best 2 (k) matches.

docs = db.max_marginal_relevance_search(query, k=2, fetch_k=20)
for doc in docs:
    print("-" * 80)
    print(doc.page_content)

Basic Vectorstore Operations

db = HanaDB(
    connection=connection, embedding=embeddings, table_name="LANGCHAIN_DEMO_BASIC"
)

# Delete already existing documents from the table
db.delete(filter={})

We can add simple text documents to the existing table.

docs = [Document(page_content="Some text"), Document(page_content="Other docs")]
db.add_documents(docs)

Add documents with metadata.

docs = [
    Document(
        page_content="foo",
        metadata={"start": 100, "end": 150, "doc_name": "foo.txt", "quality": "bad"},
    ),
    Document(
        page_content="bar",
        metadata={"start": 200, "end": 250, "doc_name": "bar.txt", "quality": "good"},
    ),
]
db.add_documents(docs)

Query documents with specific metadata.

docs = db.similarity_search("foobar", k=2, filter={"quality": "bad"})
# With filtering on "quality"=="bad", only one document should be returned
for doc in docs:
    print("-" * 80)
    print(doc.page_content)
    print(doc.metadata)

Delete documents with specific metadata.

db.delete(filter={"quality": "bad"})

# Now the similarity search with the same filter will return no results
docs = db.similarity_search("foobar", k=2, filter={"quality": "bad"})
print(len(docs))

Advanced filtering

In addition to the basic value-based filtering capabilities, more advanced filtering can be used. The table below shows the available filter operators.

| Operator | Semantic |
|----------|----------|
| $eq | Equality (==) |
| $ne | Inequality (!=) |
| $lt | Less than (<) |
| $lte | Less than or equal (<=) |
| $gt | Greater than (>) |
| $gte | Greater than or equal (>=) |
| $in | Contained in a set of given values (in) |
| $nin | Not contained in a set of given values (not in) |
| $between | Between the range of two boundary values |
| $like | Text equality based on the "LIKE" semantics in SQL (using "%" as wildcard) |
| $and | Logical "and", supporting 2 or more operands |
| $or | Logical "or", supporting 2 or more operands |

# Prepare some test documents
docs = [
    Document(
        page_content="First",
        metadata={"name": "adam", "is_active": True, "id": 1, "height": 10.0},
    ),
    Document(
        page_content="Second",
        metadata={"name": "bob", "is_active": False, "id": 2, "height": 5.7},
    ),
    Document(
        page_content="Third",
        metadata={"name": "jane", "is_active": True, "id": 3, "height": 2.4},
    ),
]

db = HanaDB(
    connection=connection,
    embedding=embeddings,
    table_name="LANGCHAIN_DEMO_ADVANCED_FILTER",
)

# Delete already existing documents from the table
db.delete(filter={})
db.add_documents(docs)


# Helper function for printing filter results
def print_filter_result(result):
    if len(result) == 0:
        print("<empty result>")
    for doc in result:
        print(doc.metadata)

Filtering with $ne, $gt, $gte, $lt, $lte

advanced_filter = {"id": {"$ne": 1}}
print(f"Filter: {advanced_filter}")
print_filter_result(db.similarity_search("just testing", k=5, filter=advanced_filter))

advanced_filter = {"id": {"$gt": 1}}
print(f"Filter: {advanced_filter}")
print_filter_result(db.similarity_search("just testing", k=5, filter=advanced_filter))

advanced_filter = {"id": {"$gte": 1}}
print(f"Filter: {advanced_filter}")
print_filter_result(db.similarity_search("just testing", k=5, filter=advanced_filter))

advanced_filter = {"id": {"$lt": 1}}
print(f"Filter: {advanced_filter}")
print_filter_result(db.similarity_search("just testing", k=5, filter=advanced_filter))

advanced_filter = {"id": {"$lte": 1}}
print(f"Filter: {advanced_filter}")
print_filter_result(db.similarity_search("just testing", k=5, filter=advanced_filter))

Filtering with $between, $in, $nin

advanced_filter = {"id": {"$between": (1, 2)}}
print(f"Filter: {advanced_filter}")
print_filter_result(db.similarity_search("just testing", k=5, filter=advanced_filter))

advanced_filter = {"name": {"$in": ["adam", "bob"]}}
print(f"Filter: {advanced_filter}")
print_filter_result(db.similarity_search("just testing", k=5, filter=advanced_filter))

advanced_filter = {"name": {"$nin": ["adam", "bob"]}}
print(f"Filter: {advanced_filter}")
print_filter_result(db.similarity_search("just testing", k=5, filter=advanced_filter))

Text filtering with $like

advanced_filter = {"name": {"$like": "a%"}}
print(f"Filter: {advanced_filter}")
print_filter_result(db.similarity_search("just testing", k=5, filter=advanced_filter))

advanced_filter = {"name": {"$like": "%a%"}}
print(f"Filter: {advanced_filter}")
print_filter_result(db.similarity_search("just testing", k=5, filter=advanced_filter))

Combined filtering with $and and $or

advanced_filter = {"$or": [{"id": 1}, {"name": "bob"}]}
print(f"Filter: {advanced_filter}")
print_filter_result(db.similarity_search("just testing", k=5, filter=advanced_filter))

advanced_filter = {"$and": [{"id": 1}, {"id": 2}]}
print(f"Filter: {advanced_filter}")
print_filter_result(db.similarity_search("just testing", k=5, filter=advanced_filter))

advanced_filter = {"$or": [{"id": 1}, {"id": 2}, {"id": 3}]}
print(f"Filter: {advanced_filter}")
print_filter_result(db.similarity_search("just testing", k=5, filter=advanced_filter))
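
The operands of $and and $or can themselves use comparison operators from the table above. A sketch combining $and with $gte on the test documents (assuming nested operators behave as the operator table suggests):

# Active documents with id >= 2: should match only "jane"
advanced_filter = {"$and": [{"is_active": True}, {"id": {"$gte": 2}}]}
print(f"Filter: {advanced_filter}")
print_filter_result(db.similarity_search("just testing", k=5, filter=advanced_filter))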

Using a VectorStore as a retriever in chains for retrieval augmented generation (RAG)

from langchain.memory import ConversationBufferMemory
from langchain_openai import ChatOpenAI

# Access the vector DB with a new table
db = HanaDB(
    connection=connection,
    embedding=embeddings,
    table_name="LANGCHAIN_DEMO_RETRIEVAL_CHAIN",
)

# Delete already existing entries from the table
db.delete(filter={})

# Add the loaded document chunks from the "State Of The Union" file
db.add_documents(text_chunks)

# Create a retriever instance of the vector store
retriever = db.as_retriever()
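
The retriever can also be invoked on its own; in recent LangChain versions retrievers implement the standard invoke interface (a minimal sketch, not part of the original flow):

# Fetch the most similar chunks directly through the retriever
retrieved_docs = retriever.invoke("What did the president say about Ketanji Brown Jackson")
print(f"Number of retrieved chunks: {len(retrieved_docs)}")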

Define the prompt.

from langchain_core.prompts import PromptTemplate

prompt_template = """
You are an expert in state of the union topics. You are provided multiple context items that are related to the prompt you have to answer.
Use the following pieces of context to answer the question at the end.

'''
{context}
'''

Question: {question}
"""

PROMPT = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)
chain_type_kwargs = {"prompt": PROMPT}

Create the ConversationalRetrievalChain, which handles the chat history and the retrieval of similar document chunks to be added to the prompt.

from langchain.chains import ConversationalRetrievalChain

llm = ChatOpenAI(model="gpt-3.5-turbo")
memory = ConversationBufferMemory(
    memory_key="chat_history", output_key="answer", return_messages=True
)
qa_chain = ConversationalRetrievalChain.from_llm(
    llm,
    db.as_retriever(search_kwargs={"k": 5}),
    return_source_documents=True,
    memory=memory,
    verbose=False,
    combine_docs_chain_kwargs={"prompt": PROMPT},
)

Ask the first question (and verify how many text chunks have been used).

question = "What about Mexico and Guatemala?"

result = qa_chain.invoke({"question": question})
print("Answer from LLM:")
print("================")
print(result["answer"])

source_docs = result["source_documents"]
print("================")
print(f"Number of used source document chunks: {len(source_docs)}")

Examine the chunks used by the chain in detail. Check whether the best-ranked chunk contains information about "Mexico and Guatemala" as mentioned in the question.

for doc in source_docs:
    print("-" * 80)
    print(doc.page_content)
    print(doc.metadata)

Ask another question on the same conversational chain. The answer should relate to the previous answer given.

question = "What about other countries?"

result = qa_chain.invoke({"question": question})
print("Answer from LLM:")
print("================")
print(result["answer"])

Standard tables vs. "custom" tables with vector data

By default, the table for the embeddings is created with three columns:

  • A column VEC_TEXT, which contains the text of the Document
  • A column VEC_META, which contains the metadata of the Document
  • A column VEC_VECTOR, which contains the embedding vector of the Document's text
# Access the vector DB with a new table
db = HanaDB(
    connection=connection, embedding=embeddings, table_name="LANGCHAIN_DEMO_NEW_TABLE"
)

# Delete already existing entries from the table
db.delete(filter={})

# Add a simple document with some metadata
docs = [
    Document(
        page_content="A simple document",
        metadata={"start": 100, "end": 150, "doc_name": "simple.txt"},
    )
]
db.add_documents(docs)

Show the columns in table "LANGCHAIN_DEMO_NEW_TABLE".

cur = connection.cursor()
cur.execute(
    "SELECT COLUMN_NAME, DATA_TYPE_NAME FROM SYS.TABLE_COLUMNS WHERE SCHEMA_NAME = CURRENT_SCHEMA AND TABLE_NAME = 'LANGCHAIN_DEMO_NEW_TABLE'"
)
rows = cur.fetchall()
for row in rows:
    print(row)
cur.close()

Show the values of the inserted document in the three columns.

cur = connection.cursor()
cur.execute(
    "SELECT VEC_TEXT, VEC_META, TO_NVARCHAR(VEC_VECTOR) FROM LANGCHAIN_DEMO_NEW_TABLE LIMIT 1"
)
rows = cur.fetchall()
print(rows[0][0])  # The text
print(rows[0][1])  # The metadata
print(rows[0][2])  # The vector
cur.close()

Custom tables must contain at least three columns that match the semantics of a standard table:

  • A column of type NCLOB or NVARCHAR for the text/context of the embeddings
  • A column of type NCLOB or NVARCHAR for the metadata
  • A column of type REAL_VECTOR for the embedding vector

The table can contain additional columns. When new Documents are inserted into the table, these additional columns must allow NULL values.

# Create a new table "MY_OWN_TABLE" with three "standard" columns and one additional column
my_own_table_name = "MY_OWN_TABLE"
cur = connection.cursor()
cur.execute(
    (
        f"CREATE TABLE {my_own_table_name} ("
        "SOME_OTHER_COLUMN NVARCHAR(42), "
        "MY_TEXT NVARCHAR(2048), "
        "MY_METADATA NVARCHAR(1024), "
        "MY_VECTOR REAL_VECTOR )"
    )
)

# Create a HanaDB instance with the own table
db = HanaDB(
    connection=connection,
    embedding=embeddings,
    table_name=my_own_table_name,
    content_column="MY_TEXT",
    metadata_column="MY_METADATA",
    vector_column="MY_VECTOR",
)

# Add a simple document with some metadata
docs = [
    Document(
        page_content="Some other text",
        metadata={"start": 400, "end": 450, "doc_name": "other.txt"},
    )
]
db.add_documents(docs)

# Check if data has been inserted into our own table
cur.execute(f"SELECT * FROM {my_own_table_name} LIMIT 1")
rows = cur.fetchall()
print(rows[0][0])  # Value of column "SOME_OTHER_COLUMN". Should be NULL/None
print(rows[0][1])  # The text
print(rows[0][2])  # The metadata
print(rows[0][3])  # The vector

cur.close()

Add another document and perform a similarity search on the custom table.

docs = [
    Document(
        page_content="Some more text",
        metadata={"start": 800, "end": 950, "doc_name": "more.txt"},
    )
]
db.add_documents(docs)

query = "What's up?"
docs = db.similarity_search(query, k=2)
for doc in docs:
    print("-" * 80)
    print(doc.page_content)

Filter performance optimization with custom columns

To allow flexible metadata values, all metadata is stored as JSON in the metadata column by default. If some of the used metadata keys and their value types are known, they can instead be stored in additional columns: create the target table with column names matching the metadata keys and pass those keys to the HanaDB constructor via the specific_metadata_columns list. During insert, metadata values matching these keys are copied into the dedicated columns. For keys in the specific_metadata_columns list, filters use the dedicated columns instead of the metadata JSON column.

# Create a new table "PERFORMANT_CUSTOMTEXT_FILTER" with three "standard" columns and one additional column
my_own_table_name = "PERFORMANT_CUSTOMTEXT_FILTER"
cur = connection.cursor()
cur.execute(
    (
        f"CREATE TABLE {my_own_table_name} ("
        "CUSTOMTEXT NVARCHAR(500), "
        "MY_TEXT NVARCHAR(2048), "
        "MY_METADATA NVARCHAR(1024), "
        "MY_VECTOR REAL_VECTOR )"
    )
)

# Create a HanaDB instance with the own table
db = HanaDB(
    connection=connection,
    embedding=embeddings,
    table_name=my_own_table_name,
    content_column="MY_TEXT",
    metadata_column="MY_METADATA",
    vector_column="MY_VECTOR",
    specific_metadata_columns=["CUSTOMTEXT"],
)

# Add a simple document with some metadata
docs = [
    Document(
        page_content="Some other text",
        metadata={
            "start": 400,
            "end": 450,
            "doc_name": "other.txt",
            "CUSTOMTEXT": "Filters on this value are very performant",
        },
    )
]
db.add_documents(docs)

# Check if data has been inserted into our own table
cur.execute(f"SELECT * FROM {my_own_table_name} LIMIT 1")
rows = cur.fetchall()
print(
    rows[0][0]
)  # Value of column "CUSTOMTEXT". Should be "Filters on this value are very performant"
print(rows[0][1])  # The text
print(
    rows[0][2]
)  # The metadata without the "CUSTOMTEXT" data, as this is extracted into a separate column
print(rows[0][3])  # The vector

cur.close()

The dedicated columns are completely transparent to the rest of the LangChain interface. Everything works just as it did before, only more performant.

docs = [
    Document(
        page_content="Some more text",
        metadata={
            "start": 800,
            "end": 950,
            "doc_name": "more.txt",
            "CUSTOMTEXT": "Another customtext value",
        },
    )
]
db.add_documents(docs)

advanced_filter = {"CUSTOMTEXT": {"$like": "%value%"}}
query = "What's up?"
docs = db.similarity_search(query, k=2, filter=advanced_filter)
for doc in docs:
    print("-" * 80)
    print(doc.page_content)
