Ver código fonte

Feat: Add pg_bigm for keyword search in pgvector (#13876)

Signed-off-by: Yuichiro Utsumi <utsumi.yuichiro@fujitsu.com>
Yuichiro Utsumi 1 mês atrás
pai
commit
5f9d236d22

+ 5 - 0
api/configs/middleware/vdb/pgvector_config.py

@@ -43,3 +43,8 @@ class PGVectorConfig(BaseSettings):
         description="Max connection of the PostgreSQL database",
         default=5,
     )
+
+    PGVECTOR_PG_BIGM: bool = Field(
+        description="Whether to use pg_bigm module for full text search",
+        default=False,
+    )

+ 32 - 9
api/core/rag/datasource/vdb/pgvector/pgvector.py

@@ -25,6 +25,7 @@ class PGVectorConfig(BaseModel):
     database: str
     min_connection: int
     max_connection: int
+    pg_bigm: bool = False
 
     @model_validator(mode="before")
     @classmethod
@@ -62,12 +63,18 @@ CREATE INDEX IF NOT EXISTS embedding_cosine_v1_idx ON {table_name}
 USING hnsw (embedding vector_cosine_ops) WITH (m = 16, ef_construction = 64);
 """
 
+# Index names are unique per schema in PostgreSQL, so embed the table name to get one index per table.
+SQL_CREATE_INDEX_PG_BIGM = """
+CREATE INDEX IF NOT EXISTS bigm_idx_{table_name} ON {table_name} USING gin (text gin_bigm_ops);
+"""
+
 
 class PGVector(BaseVector):
     def __init__(self, collection_name: str, config: PGVectorConfig):
         super().__init__(collection_name)
         self.pool = self._create_connection_pool(config)
         self.table_name = f"embedding_{collection_name}"
+        self.pg_bigm = config.pg_bigm
 
     def get_type(self) -> str:
         return VectorType.PGVECTOR
@@ -176,15 +183,27 @@ class PGVector(BaseVector):
         top_k = kwargs.get("top_k", 5)
 
         with self._get_cursor() as cur:
-            cur.execute(
-                f"""SELECT meta, text, ts_rank(to_tsvector(coalesce(text, '')), plainto_tsquery(%s)) AS score
-                FROM {self.table_name}
-                WHERE to_tsvector(text) @@ plainto_tsquery(%s)
-                ORDER BY score DESC
-                LIMIT {top_k}""",
-                # f"'{query}'" is required in order to account for whitespace in query
-                (f"'{query}'", f"'{query}'"),
-            )
+            if self.pg_bigm:
+                cur.execute("SET pg_bigm.similarity_limit TO 0.000001")
+                cur.execute(
+                    f"""SELECT meta, text, bigm_similarity(unistr(%s), coalesce(text, '')) AS score
+                    FROM {self.table_name}
+                    WHERE text =%% unistr(%s)
+                    ORDER BY score DESC
+                    LIMIT {top_k}""",
+                    # f"'{query}'" is required in order to account for whitespace in query
+                    (f"'{query}'", f"'{query}'"),
+                )
+            else:
+                cur.execute(
+                    f"""SELECT meta, text, ts_rank(to_tsvector(coalesce(text, '')), plainto_tsquery(%s)) AS score
+                    FROM {self.table_name}
+                    WHERE to_tsvector(text) @@ plainto_tsquery(%s)
+                    ORDER BY score DESC
+                    LIMIT {top_k}""",
+                    # f"'{query}'" is required in order to account for whitespace in query
+                    (f"'{query}'", f"'{query}'"),
+                )
 
             docs = []
 
@@ -214,6 +233,9 @@ class PGVector(BaseVector):
                 # ref: https://github.com/pgvector/pgvector?tab=readme-ov-file#indexing
                 if dimension <= 2000:
                     cur.execute(SQL_CREATE_INDEX.format(table_name=self.table_name))
+                if self.pg_bigm:
+                    cur.execute("CREATE EXTENSION IF NOT EXISTS pg_bigm")
+                    cur.execute(SQL_CREATE_INDEX_PG_BIGM.format(table_name=self.table_name))
             redis_client.set(collection_exist_cache_key, 1, ex=3600)
 
 
@@ -237,5 +259,6 @@ class PGVectorFactory(AbstractVectorFactory):
                 database=dify_config.PGVECTOR_DATABASE or "postgres",
                 min_connection=dify_config.PGVECTOR_MIN_CONNECTION,
                 max_connection=dify_config.PGVECTOR_MAX_CONNECTION,
+                pg_bigm=dify_config.PGVECTOR_PG_BIGM,
             ),
         )

+ 2 - 0
docker/.env.example

@@ -431,6 +431,8 @@ PGVECTOR_PASSWORD=difyai123456
 PGVECTOR_DATABASE=dify
 PGVECTOR_MIN_CONNECTION=1
 PGVECTOR_MAX_CONNECTION=5
+PGVECTOR_PG_BIGM=false
+PGVECTOR_PG_BIGM_VERSION=1.2-20240606
 
 # pgvecto-rs configurations, only available when VECTOR_STORE is `pgvecto-rs`
 PGVECTO_RS_HOST=pgvecto-rs

+ 5 - 0
docker/docker-compose-template.yaml

@@ -322,8 +322,13 @@ services:
       POSTGRES_DB: ${PGVECTOR_POSTGRES_DB:-dify}
       # postgres data directory
       PGDATA: ${PGVECTOR_PGDATA:-/var/lib/postgresql/data/pgdata}
+      # pg_bigm module for full text search
+      PG_BIGM: ${PGVECTOR_PG_BIGM:-false}
+      PG_BIGM_VERSION: ${PGVECTOR_PG_BIGM_VERSION:-1.2-20240606}
     volumes:
       - ./volumes/pgvector/data:/var/lib/postgresql/data
+      - ./pgvector/docker-entrypoint.sh:/docker-entrypoint.sh
+    entrypoint: [ '/docker-entrypoint.sh' ]
     healthcheck:
       test: [ 'CMD', 'pg_isready' ]
       interval: 1s

+ 7 - 0
docker/docker-compose.yaml

@@ -157,6 +157,8 @@ x-shared-env: &shared-api-worker-env
   PGVECTOR_DATABASE: ${PGVECTOR_DATABASE:-dify}
   PGVECTOR_MIN_CONNECTION: ${PGVECTOR_MIN_CONNECTION:-1}
   PGVECTOR_MAX_CONNECTION: ${PGVECTOR_MAX_CONNECTION:-5}
+  PGVECTOR_PG_BIGM: ${PGVECTOR_PG_BIGM:-false}
+  PGVECTOR_PG_BIGM_VERSION: ${PGVECTOR_PG_BIGM_VERSION:-1.2-20240606}
   PGVECTO_RS_HOST: ${PGVECTO_RS_HOST:-pgvecto-rs}
   PGVECTO_RS_PORT: ${PGVECTO_RS_PORT:-5432}
   PGVECTO_RS_USER: ${PGVECTO_RS_USER:-postgres}
@@ -741,8 +743,13 @@ services:
       POSTGRES_DB: ${PGVECTOR_POSTGRES_DB:-dify}
       # postgres data directory
       PGDATA: ${PGVECTOR_PGDATA:-/var/lib/postgresql/data/pgdata}
+      # pg_bigm module for full text search
+      PG_BIGM: ${PGVECTOR_PG_BIGM:-false}
+      PG_BIGM_VERSION: ${PGVECTOR_PG_BIGM_VERSION:-1.2-20240606}
     volumes:
       - ./volumes/pgvector/data:/var/lib/postgresql/data
+      - ./pgvector/docker-entrypoint.sh:/docker-entrypoint.sh
+    entrypoint: [ '/docker-entrypoint.sh' ]
     healthcheck:
       test: [ 'CMD', 'pg_isready' ]
       interval: 1s

+ 50 - 0
docker/pgvector/docker-entrypoint.sh

@@ -0,0 +1,50 @@
+#!/bin/bash
+#
+# Wrapper entrypoint for the pgvector container.
+#
+# With PG_BIGM=true it builds and installs the pg_bigm extension (release
+# PG_BIGM_VERSION), asks the server to preload it, and then hands over to the
+# image's stock entrypoint. Otherwise it only chains to the stock entrypoint.
+
+# Stop at the first failing step so a broken download or build is reported
+# here rather than later as a missing extension.
+set -e
+
+# Honour PG_MAJOR from the environment when it is set; default to 16.
+PG_MAJOR="${PG_MAJOR:-16}"
+
+# Start the server by default, but keep any command given to the container.
+if [ "$#" -eq 0 ]; then
+  set -- postgres
+fi
+
+if [ "${PG_BIGM}" = "true" ]; then
+  : "${PG_BIGM_VERSION:?PG_BIGM_VERSION must be set when PG_BIGM=true}"
+
+  # install pg_bigm
+  apt-get update
+  apt-get install -y curl make gcc "postgresql-server-dev-${PG_MAJOR}"
+
+  # Build in a throwaway directory; the subshell leaves the working directory
+  # of this script untouched.
+  build_dir="$(mktemp -d)"
+  (
+    cd "${build_dir}"
+    # -f: fail on HTTP errors instead of saving the error page as the tarball
+    curl -fLO "https://github.com/pgbigm/pg_bigm/archive/refs/tags/v${PG_BIGM_VERSION}.tar.gz"
+    tar xf "v${PG_BIGM_VERSION}.tar.gz"
+    cd "pg_bigm-${PG_BIGM_VERSION}"
+    make USE_PGXS=1 PG_CONFIG=/usr/bin/pg_config
+    make USE_PGXS=1 PG_CONFIG=/usr/bin/pg_config install
+  )
+  rm -rf "${build_dir}"
+
+  # enable pg_bigm: pass the setting on the server command line so it also
+  # applies on first start (before postgresql.conf exists) and with a custom PGDATA
+  case "$1" in
+    postgres | -*) set -- "$@" -c shared_preload_libraries=pg_bigm ;;
+  esac
+fi
+
+# Run the original entrypoint script
+exec /usr/local/bin/docker-entrypoint.sh "$@"