
前面介绍了向量化的过程。当然在RAG调用中,不会直接使用上面的方法进行向量化,而是把第一步定义的向量化模型包装起来给后面的LLM使用。同时会把向量化后的结果存储到向量数据库里,提问的时候使用向量化查询来匹配,下面看看这个过程的例子:
// Build the OpenAI client; the same client is used for embeddings and for the QA chain below.
llm, err := openai.New(
	openai.WithEmbeddingModel("text-embedding-ada-002"),
	openai.WithBaseURL(Openai.Host),
	openai.WithToken(Openai.Key),
)
if err != nil {
	log.Fatal(err)
}
// Create the embedder that wraps the LLM client.
openAiEmbedder, err := embeddings.NewEmbedder(llm)
if err != nil {
	log.Fatal(err)
}
// Use Redis as the vector store.
redisStore, err := redisvector.New(ctx,
	redisvector.WithConnectionURL(Redis.Url),
	redisvector.WithIndexName("test_vector_idx", true),
	redisvector.WithEmbedder(openAiEmbedder),
)
if err != nil {
	log.Fatalln(err)
}
// Insert the test data.
data := []schema.Document{
	{PageContent: "狸花猫", Metadata: nil},
	{PageContent: "金渐层猫", Metadata: nil},
	{PageContent: "松狮犬", Metadata: nil},
}
_, err = redisStore.AddDocuments(ctx, data)
if err != nil {
	log.Fatalln(err)
}
// Query the store directly; check the error before using the result.
docs, err := redisStore.SimilaritySearch(ctx, "猫", 3,
	vectorstores.WithScoreThreshold(0.5),
)
if err != nil {
	log.Fatalln(err)
}
fmt.Println(docs)
// Plug the vector retriever into a retrieval QA chain.
result, err := chains.Run(
	ctx,
	chains.NewRetrievalQAFromLLM(
		llm,
		vectorstores.ToRetriever(redisStore, 3, vectorstores.WithScoreThreshold(0.8)),
	),
	"有哪些猫?",
)
if err != nil {
	log.Fatalln(err)
}
fmt.Println(result)
封装为embedder的过程如下:
func NewEmbedder(client EmbedderClient, opts ...Option) (*EmbedderImpl, error) {
e := &EmbedderImpl{
client: client,
StripNewLines: defaultStripNewLines,
BatchSize: defaultBatchSize,
}
for _, opt := range opts {
opt(e)
}
return e, nil
}type EmbedderImpl struct {
client EmbedderClient
StripNewLines bool
BatchSize int
}这里的向量数据库选用了redis的向量数据库插件,当然也可以使用其他向量数据库
/ New creates a new Store with options.
func New(ctx context.Context, opts ...Option) (*Store, error) {
var s *Store
var err error
s, err = applyClientOptions(opts...)
if err != nil {
return nil, err
}
client, err := NewRueidisClient(s.redisURL)
if err != nil {
return nil, err
}
s.client = client
if !s.client.CheckIndexExists(ctx, s.indexName) {
if !s.createIndexIfNotExists {
return nil, ErrNotExistedIndex
} else if s.indexSchema != nil {
// create index with input schema
if err := s.client.CreateIndexIfNotExists(ctx, s.indexName, s.indexSchema); err != nil {
return nil, err
}
}
}
return s, nil
}先初始化了一个redis客户端使用的是包"github.com/redis/rueidis",先判断索引是否存在
func (c RueidisClient) CheckIndexExists(ctx context.Context, index string) bool {
if index == "" {
return false
}
return c.client.Do(ctx, c.client.B().FtInfo().Index(index).Build()).Error() == nil
}不存在就进行创建
func (c RueidisClient) CreateIndexIfNotExists(ctx context.Context, index string, schema *IndexSchema) error {
if index == "" {
return ErrEmptyIndexName
}
if c.CheckIndexExists(ctx, index) {
return nil
}
redisIndex := NewIndex(index, []string{getPrefix(index)}, HASHIndexType, *schema)
createIndexCmd, err := redisIndex.AsCommand()
if err != nil {
return err
}
return c.client.Do(ctx, c.client.B().Arbitrary(createIndexCmd[0]).Keys(createIndexCmd[1]).Args(createIndexCmd[2:]...).Build()).Error()
}创建完向量后,我们把文案向量化后存入向量数据库
func (s *Store) AddDocuments(ctx context.Context, docs []schema.Document, _ ...vectorstores.Option) ([]string, error) {
err := s.appendDocumentsWithVectors(ctx, docs)
if err != nil {
return nil, err
}
indexSchema, err := generateSchemaWithMetadata(docs[0].Metadata)
if err != nil {
return nil, err
}
if s.indexSchema == nil {
s.indexSchema = indexSchema
}
if s.createIndexIfNotExists && !s.client.CheckIndexExists(ctx, s.indexName) {
if err := s.client.CreateIndexIfNotExists(ctx, s.indexName, indexSchema); err != nil {
return nil, err
}
}
docIDs, err := s.client.AddDocsWithHash(ctx, getPrefix(s.indexName), docs)
if err != nil {
return nil, err
}
return docIDs, nil
}先对输入的文本列表进行向量化
func (s Store) appendDocumentsWithVectors(ctx context.Context, docs []schema.Document) error {
if len(docs) == 0 {
return nil
}
texts := make([]string, 0, len(docs))
for _, doc := range docs {
texts = append(texts, doc.PageContent)
}
vectors, err := s.embedder.EmbedDocuments(ctx, texts)
// ...
然后转化为向量数据库的schema
func generateSchemaWithMetadata(data map[string]any) (*IndexSchema, error) {
defaultVectorField := VectorField{
Name: defaultContentVectorFieldKey,
Algorithm: FlatVectorAlgorithm,
Dims: 1536,
Datatype: FLOAT32VectorDataType,
DistanceMetric: CosineDistanceMetric,
}
schema := IndexSchema{}
for key, value := range data {
// nolint:nestif
// content_vector
if key == defaultContentVectorFieldKey {
field := defaultVectorField
if _value, ok := value.([]float32); ok {
field.Dims = len(_value)
schema.Vector = append(schema.Vector, field)
} else if _value, ok := value.([]float64); ok {
// ...
最后调用redis客户端的命令DoMulti来进行存储
func (c RueidisClient) AddDocsWithHash(ctx context.Context, prefix string, docs []schema.Document) ([]string, error) {
cmds := make([]rueidis.Completed, 0, len(docs))
docIDs := make([]string, 0, len(docs))
errs := make([]error, 0, len(docs))
for _, doc := range docs {
docID, cmd := c.generateHSetCMD(prefix, doc)
cmds = append(cmds, cmd)
docIDs = append(docIDs, docID)
}
result := c.client.DoMulti(ctx, cmds...)
for _, res := range result {
if res.Error() != nil {
errs = append(errs, res.Error())
}
}
return docIDs, errors.Join(errs...)
}存储完毕后,我们看看向量化查询的过程
func (s *Store) SimilaritySearch(ctx context.Context, query string, numDocuments int, options ...vectorstores.Option) ([]schema.Document, error) {
opts := s.getOptions(options...)
scoreThreshold, err := s.getScoreThreshold(opts)
if err != nil {
return nil, err
}
filter, err := s.getFilters(opts)
if err != nil {
return nil, err
}
embedder := s.embedder
if opts.Embedder != nil {
embedder = opts.Embedder
}
embedderData, err := embedder.EmbedQuery(ctx, query)
if err != nil {
return nil, err
}
searchOpts := []SearchOption{WithScoreThreshold(scoreThreshold), WithOffsetLimit(0, numDocuments), WithPreFilters(filter)}
if s.indexSchema != nil {
searchOpts = append(searchOpts, WithReturns(maps.Keys(s.indexSchema.MetadataKeys())))
}
search, err := NewIndexVectorSearch(
s.indexName,
embedderData,
searchOpts...,
)
if err != nil {
return nil, err
}
_, docs, err := s.client.Search(ctx, *search)
if err != nil {
return nil, err
}
return docs, nil
}先对输入的文本进行向量化,然后组装查询,调用接口Search来进行匹配
func (c RueidisClient) Search(ctx context.Context, search IndexVectorSearch) (int64, []schema.Document, error) {
cmds := search.AsCommand()
// fmt.Println(strings.Join(cmds, " "))
total, docs, err := c.client.Do(ctx, c.client.B().Arbitrary(cmds[0]).Keys(cmds[1]).Args(cmds[2:]...).Build()).AsFtSearch()
if err != nil {
return 0, nil, err
}
return total, convertFTSearchResIntoDocSchema(docs), nil
}本文分享自 golang算法架构leetcode技术php 微信公众号,前往查看
如有侵权,请联系 cloudcommunity@tencent.com 删除。
本文参与 腾讯云自媒体同步曝光计划 ,欢迎热爱写作的你一起参与!