Kaynağa Gözat

Feat elasticsearch japanese (#12194)

Hiroshi Fujita 3 ay önce
ebeveyn
işleme
d2586278d6

+ 2 - 0
api/controllers/console/datasets/datasets.py

@@ -640,6 +640,7 @@ class DatasetRetrievalSettingApi(Resource):
                 | VectorType.MYSCALE
                 | VectorType.ORACLE
                 | VectorType.ELASTICSEARCH
+                | VectorType.ELASTICSEARCH_JA
                 | VectorType.PGVECTOR
                 | VectorType.TIDB_ON_QDRANT
                 | VectorType.LINDORM
@@ -683,6 +684,7 @@ class DatasetRetrievalSettingMockApi(Resource):
                 | VectorType.MYSCALE
                 | VectorType.ORACLE
                 | VectorType.ELASTICSEARCH
+                | VectorType.ELASTICSEARCH_JA
                 | VectorType.COUCHBASE
                 | VectorType.PGVECTOR
                 | VectorType.LINDORM

+ 104 - 0
api/core/rag/datasource/vdb/elasticsearch/elasticsearch_ja_vector.py

@@ -0,0 +1,104 @@
+import json
+import logging
+from typing import Any, Optional
+
+from flask import current_app
+
+from core.rag.datasource.vdb.elasticsearch.elasticsearch_vector import (
+    ElasticSearchConfig,
+    ElasticSearchVector,
+    ElasticSearchVectorFactory,
+)
+from core.rag.datasource.vdb.field import Field
+from core.rag.datasource.vdb.vector_type import VectorType
+from core.rag.embedding.embedding_base import Embeddings
+from extensions.ext_redis import redis_client
+from models.dataset import Dataset
+
+logger = logging.getLogger(__name__)
+
+
+class ElasticSearchJaVector(ElasticSearchVector):
+    """Elasticsearch vector store with Japanese full-text analysis.
+
+    Behaves exactly like ElasticSearchVector except that the text field of a
+    newly created index uses a custom ``ja_analyzer`` built from the ICU and
+    kuromoji analysis plugins. Both plugins must be installed on the
+    Elasticsearch node before the index is created (see the companion
+    docker-entrypoint.sh added in this change).
+    """
+
+    def create_collection(
+        self,
+        embeddings: list[list[float]],
+        metadatas: Optional[list[dict[Any, Any]]] = None,
+        index_params: Optional[dict] = None,
+    ):
+        # Create the backing index once, guarded by a Redis lock so that
+        # concurrent indexing workers don't race on index creation.
+        # `metadatas` and `index_params` are accepted for interface
+        # compatibility with the base class but are not used here.
+        lock_name = f"vector_indexing_lock_{self._collection_name}"
+        with redis_client.lock(lock_name, timeout=20):
+            # Fast path: a cache entry means another worker already created
+            # (or verified) this index within the last hour.
+            collection_exist_cache_key = f"vector_indexing_{self._collection_name}"
+            if redis_client.get(collection_exist_cache_key):
+                logger.info(f"Collection {self._collection_name} already exists.")
+                return
+
+            if not self._client.indices.exists(index=self._collection_name):
+                # Vector dimension is taken from the first embedding; callers
+                # must pass at least one embedding when the index is new.
+                dim = len(embeddings[0])
+                settings = {
+                    "analysis": {
+                        "analyzer": {
+                            # Custom Japanese analyzer: ICU normalization and
+                            # iteration-mark expansion before kuromoji
+                            # tokenization, then standard kuromoji filters
+                            # (base form, POS filtering, stop words, number
+                            # normalization, long-vowel stemming).
+                            "ja_analyzer": {
+                                "type": "custom",
+                                "char_filter": [
+                                    "icu_normalizer",
+                                    "kuromoji_iteration_mark",
+                                ],
+                                "tokenizer": "kuromoji_tokenizer",
+                                "filter": [
+                                    "kuromoji_baseform",
+                                    "kuromoji_part_of_speech",
+                                    "ja_stop",
+                                    "kuromoji_number",
+                                    "kuromoji_stemmer",
+                                ],
+                            }
+                        }
+                    }
+                }
+                mappings = {
+                    "properties": {
+                        # Apply the Japanese analyzer at both index and
+                        # search time so queries tokenize the same way.
+                        Field.CONTENT_KEY.value: {
+                            "type": "text",
+                            "analyzer": "ja_analyzer",
+                            "search_analyzer": "ja_analyzer",
+                        },
+                        Field.VECTOR.value: {  # Make sure the dimension is correct here
+                            "type": "dense_vector",
+                            "dims": dim,
+                            "index": True,
+                            "similarity": "cosine",
+                        },
+                        Field.METADATA_KEY.value: {
+                            "type": "object",
+                            "properties": {
+                                "doc_id": {"type": "keyword"}  # Map doc_id to keyword type
+                            },
+                        },
+                    }
+                }
+                self._client.indices.create(index=self._collection_name, settings=settings, mappings=mappings)
+
+            # Remember the index exists for one hour to skip the existence
+            # check on subsequent calls.
+            redis_client.set(collection_exist_cache_key, 1, ex=3600)
+
+
+class ElasticSearchJaVectorFactory(ElasticSearchVectorFactory):
+    def init_vector(self, dataset: Dataset, attributes: list, embeddings: Embeddings) -> ElasticSearchJaVector:
+        if dataset.index_struct_dict:
+            class_prefix: str = dataset.index_struct_dict["vector_store"]["class_prefix"]
+            collection_name = class_prefix
+        else:
+            dataset_id = dataset.id
+            collection_name = Dataset.gen_collection_name_by_id(dataset_id)
+            dataset.index_struct = json.dumps(self.gen_index_struct_dict(VectorType.ELASTICSEARCH, collection_name))
+
+        config = current_app.config
+        return ElasticSearchJaVector(
+            index_name=collection_name,
+            config=ElasticSearchConfig(
+                host=config.get("ELASTICSEARCH_HOST", "localhost"),
+                port=config.get("ELASTICSEARCH_PORT", 9200),
+                username=config.get("ELASTICSEARCH_USERNAME", ""),
+                password=config.get("ELASTICSEARCH_PASSWORD", ""),
+            ),
+            attributes=[],
+        )

+ 6 - 0
api/core/rag/datasource/vdb/vector_factory.py

@@ -90,6 +90,12 @@ class Vector:
                 from core.rag.datasource.vdb.elasticsearch.elasticsearch_vector import ElasticSearchVectorFactory
 
                 return ElasticSearchVectorFactory
+            case VectorType.ELASTICSEARCH_JA:
+                from core.rag.datasource.vdb.elasticsearch.elasticsearch_ja_vector import (
+                    ElasticSearchJaVectorFactory,
+                )
+
+                return ElasticSearchJaVectorFactory
             case VectorType.TIDB_VECTOR:
                 from core.rag.datasource.vdb.tidb_vector.tidb_vector import TiDBVectorFactory
 

+ 1 - 0
api/core/rag/datasource/vdb/vector_type.py

@@ -16,6 +16,7 @@ class VectorType(StrEnum):
     TENCENT = "tencent"
     ORACLE = "oracle"
     ELASTICSEARCH = "elasticsearch"
+    ELASTICSEARCH_JA = "elasticsearch-ja"
     LINDORM = "lindorm"
     COUCHBASE = "couchbase"
     BAIDU = "baidu"

+ 2 - 2
docker/.env.example

@@ -383,7 +383,7 @@ SUPABASE_URL=your-server-url
 # ------------------------------
 
 # The type of vector store to use.
-# Supported values are `weaviate`, `qdrant`, `milvus`, `myscale`, `relyt`, `pgvector`, `pgvecto-rs`, `chroma`, `opensearch`, `tidb_vector`, `oracle`, `tencent`, `elasticsearch`, `analyticdb`, `couchbase`, `vikingdb`, `oceanbase`.
+# Supported values are `weaviate`, `qdrant`, `milvus`, `myscale`, `relyt`, `pgvector`, `pgvecto-rs`, `chroma`, `opensearch`, `tidb_vector`, `oracle`, `tencent`, `elasticsearch`, `elasticsearch-ja`, `analyticdb`, `couchbase`, `vikingdb`, `oceanbase`.
 VECTOR_STORE=weaviate
 
 # The Weaviate endpoint URL. Only available when VECTOR_STORE is `weaviate`.
@@ -512,7 +512,7 @@ TENCENT_VECTOR_DB_SHARD=1
 TENCENT_VECTOR_DB_REPLICAS=2
 
 # ElasticSearch configuration, only available when VECTOR_STORE is `elasticsearch` or `elasticsearch-ja`
-ELASTICSEARCH_HOST=0.0.0.0
+ELASTICSEARCH_HOST=elasticsearch
 ELASTICSEARCH_PORT=9200
 ELASTICSEARCH_USERNAME=elastic
 ELASTICSEARCH_PASSWORD=elastic

+ 9 - 1
docker/docker-compose.yaml

@@ -883,20 +883,28 @@ services:
     container_name: elasticsearch
     profiles:
       - elasticsearch
+      - elasticsearch-ja
     restart: always
     volumes:
+      - ./elasticsearch/docker-entrypoint.sh:/docker-entrypoint-mount.sh
       - dify_es01_data:/usr/share/elasticsearch/data
     environment:
       ELASTIC_PASSWORD: ${ELASTICSEARCH_PASSWORD:-elastic}
+      VECTOR_STORE: ${VECTOR_STORE:-}
       cluster.name: dify-es-cluster
       node.name: dify-es0
       discovery.type: single-node
-      xpack.license.self_generated.type: trial
+      xpack.license.self_generated.type: basic
       xpack.security.enabled: 'true'
       xpack.security.enrollment.enabled: 'false'
       xpack.security.http.ssl.enabled: 'false'
     ports:
       - ${ELASTICSEARCH_PORT:-9200}:9200
+    deploy:
+      resources:
+        limits:
+          memory: 2g
+    entrypoint: [ 'sh', '-c', "sh /docker-entrypoint-mount.sh" ]
     healthcheck:
       test: [ 'CMD', 'curl', '-s', 'http://localhost:9200/_cluster/health?pretty' ]
       interval: 30s

+ 25 - 0
docker/elasticsearch/docker-entrypoint.sh

@@ -0,0 +1,25 @@
+#!/bin/bash
+# Wrapper entrypoint for the Elasticsearch container: when the stack is
+# configured for Japanese search (VECTOR_STORE=elasticsearch-ja), install the
+# analysis plugins required by the ja_analyzer before starting Elasticsearch.
+# Plugins must be present before the node starts, hence this runs first.
+
+set -e
+
+if [ "${VECTOR_STORE}" = "elasticsearch-ja" ]; then
+    # Check if the ICU tokenizer plugin is installed
+    # (provides the icu_normalizer char filter used by ja_analyzer).
+    if ! /usr/share/elasticsearch/bin/elasticsearch-plugin list | grep -q analysis-icu; then
+        printf '%s\n' "Installing the ICU tokenizer plugin"
+        if ! /usr/share/elasticsearch/bin/elasticsearch-plugin install analysis-icu; then
+            printf '%s\n' "Failed to install the ICU tokenizer plugin"
+            exit 1
+        fi
+    fi
+    # Check if the Japanese language analyzer plugin is installed
+    # (provides the kuromoji tokenizer and token filters).
+    if ! /usr/share/elasticsearch/bin/elasticsearch-plugin list | grep -q analysis-kuromoji; then
+        printf '%s\n' "Installing the Japanese language analyzer plugin"
+        if ! /usr/share/elasticsearch/bin/elasticsearch-plugin install analysis-kuromoji; then
+            printf '%s\n' "Failed to install the Japanese language analyzer plugin"
+            exit 1
+        fi
+    fi
+fi
+
+# Run the original entrypoint script
+# (tini reaps zombies and forwards signals to the ES process).
+exec /bin/tini -- /usr/local/bin/docker-entrypoint.sh