1 year ago · 9eaae770a6
--- a/api/core/indexing_runner.py
+++ b/api/core/indexing_runner.py
@@ -1,4 +1,3 @@
 
				-import asyncio
			
 
				 import concurrent
			
 
				 import datetime
			
 
				 import json
			
@@ -8,25 +7,17 @@ import threading
 
				 import time
			
 
				 import uuid
			
 
				 from concurrent.futures import ThreadPoolExecutor
			
 
				-from multiprocessing import Process
			
 
				 from typing import Optional, List, cast
			
 
				 
			
 
				-import openai
			
 
				-from billiard.pool import Pool
			
 
				-from flask import current_app, Flask
			
 
				 from flask_login import current_user
			
 
				-from langchain.embeddings import OpenAIEmbeddings
			
 
				 from langchain.schema import Document
			
 
				 from langchain.text_splitter import RecursiveCharacterTextSplitter, TextSplitter
			
 
				 
			
 
				 from core.data_loader.file_extractor import FileExtractor
			
 
				 from core.data_loader.loader.notion import NotionLoader
			
 
				 from core.docstore.dataset_docstore import DatesetDocumentStore
			
 
				-from core.embedding.cached_embedding import CacheEmbedding
			
 
				 from core.generator.llm_generator import LLMGenerator
			
 
				 from core.index.index import IndexBuilder
			
 
				-from core.index.keyword_table_index.keyword_table_index import KeywordTableIndex, KeywordTableConfig
			
 
				-from core.index.vector_index.vector_index import VectorIndex
			
 
				 from core.llm.error import ProviderTokenNotInitError
			
 
				 from core.llm.llm_builder import LLMBuilder
			
 
				 from core.llm.streamable_open_ai import StreamableOpenAI
			
@@ -516,20 +507,23 @@ class IndexingRunner:
 
				                 model_name='gpt-3.5-turbo',
			
 
				                 max_tokens=2000
			
 
				             )
			
 
				-            threads = []
			
 
				-            for doc in documents:
			
 
				-                document_format_thread = threading.Thread(target=self.format_document, kwargs={
			
 
				-                    'llm': llm, 'document_node': doc, 'split_documents': split_documents, 'document_form': document_form})
			
 
				-                threads.append(document_format_thread)
			
 
				-                document_format_thread.start()
			
 
				-            for thread in threads:
			
 
				-                thread.join()
			
 
				+            for i in range(0, len(documents), 10):
			
 
				+                threads = []
			
 
				+                sub_documents = documents[i:i + 10]
			
 
				+                for doc in sub_documents:
			
 
				+                    document_format_thread = threading.Thread(target=self.format_document, kwargs={
			
 
				+                        'llm': llm, 'document_node': doc, 'split_documents': split_documents,
			
 
				+                        'document_form': document_form})
			
 
				+                    threads.append(document_format_thread)
			
 
				+                    document_format_thread.start()
			
 
				+                for thread in threads:
			
 
				+                    thread.join()
			
 
				+
			
 
				             all_documents.extend(split_documents)
			
 
				 
			
 
				         return all_documents
			
 
				 
			
 
				-    def format_document(self, llm: StreamableOpenAI, document_node, split_documents: List, document_form: str):
			
 
				-        print(document_node.page_content)
			
 
				+    def format_document(self, llm: StreamableOpenAI, document_node, split_documents, document_form: str):
			
 
				         format_documents = []
			
 
				         if document_node.page_content is None or not document_node.page_content.strip():
			
 
				             return format_documents