@@ -760,166 +760,168 @@ class DocumentService:
                     )
                 db.session.add(dataset_process_rule)
                 db.session.commit()
-            position = DocumentService.get_documents_position(dataset.id)
-            document_ids = []
-            duplicate_document_ids = []
-            if document_data["data_source"]["type"] == "upload_file":
-                upload_file_list = document_data["data_source"]["info_list"]["file_info_list"]["file_ids"]
-                for file_id in upload_file_list:
-                    file = (
-                        db.session.query(UploadFile)
-                        .filter(UploadFile.tenant_id == dataset.tenant_id, UploadFile.id == file_id)
-                        .first()
-                    )
+            lock_name = "add_document_lock_dataset_id_{}".format(dataset.id)
+            with redis_client.lock(lock_name, timeout=600):
+                position = DocumentService.get_documents_position(dataset.id)
+                document_ids = []
+                duplicate_document_ids = []
+                if document_data["data_source"]["type"] == "upload_file":
+                    upload_file_list = document_data["data_source"]["info_list"]["file_info_list"]["file_ids"]
+                    for file_id in upload_file_list:
+                        file = (
+                            db.session.query(UploadFile)
+                            .filter(UploadFile.tenant_id == dataset.tenant_id, UploadFile.id == file_id)
+                            .first()
+                        )
 
-                # raise error if file not found
-                if not file:
-                    raise FileNotExistsError()
+                    # raise error if file not found
+                    if not file:
+                        raise FileNotExistsError()
 
-                file_name = file.name
-                data_source_info = {
-                    "upload_file_id": file_id,
-                }
-                # check duplicate
-                if document_data.get("duplicate", False):
-                    document = Document.query.filter_by(
-                        dataset_id=dataset.id,
-                        tenant_id=current_user.current_tenant_id,
-                        data_source_type="upload_file",
-                        enabled=True,
-                        name=file_name,
-                    ).first()
-                    if document:
-                        document.dataset_process_rule_id = dataset_process_rule.id
-                        document.updated_at = datetime.datetime.utcnow()
-                        document.created_from = created_from
-                        document.doc_form = document_data["doc_form"]
-                        document.doc_language = document_data["doc_language"]
-                        document.data_source_info = json.dumps(data_source_info)
-                        document.batch = batch
-                        document.indexing_status = "waiting"
-                        db.session.add(document)
-                        documents.append(document)
-                        duplicate_document_ids.append(document.id)
-                        continue
-                document = DocumentService.build_document(
-                    dataset,
-                    dataset_process_rule.id,
-                    document_data["data_source"]["type"],
-                    document_data["doc_form"],
-                    document_data["doc_language"],
-                    data_source_info,
-                    created_from,
-                    position,
-                    account,
-                    file_name,
-                    batch,
-                )
-                db.session.add(document)
-                db.session.flush()
-                document_ids.append(document.id)
-                documents.append(document)
-                position += 1
-            elif document_data["data_source"]["type"] == "notion_import":
-                notion_info_list = document_data["data_source"]["info_list"]["notion_info_list"]
-                exist_page_ids = []
-                exist_document = {}
-                documents = Document.query.filter_by(
-                    dataset_id=dataset.id,
-                    tenant_id=current_user.current_tenant_id,
-                    data_source_type="notion_import",
-                    enabled=True,
-                ).all()
-                if documents:
-                    for document in documents:
-                        data_source_info = json.loads(document.data_source_info)
-                        exist_page_ids.append(data_source_info["notion_page_id"])
-                        exist_document[data_source_info["notion_page_id"]] = document.id
-                for notion_info in notion_info_list:
-                    workspace_id = notion_info["workspace_id"]
-                    data_source_binding = DataSourceOauthBinding.query.filter(
-                        db.and_(
-                            DataSourceOauthBinding.tenant_id == current_user.current_tenant_id,
-                            DataSourceOauthBinding.provider == "notion",
-                            DataSourceOauthBinding.disabled == False,
-                            DataSourceOauthBinding.source_info["workspace_id"] == f'"{workspace_id}"',
+                    file_name = file.name
+                    data_source_info = {
+                        "upload_file_id": file_id,
+                    }
+                    # check duplicate
+                    if document_data.get("duplicate", False):
+                        document = Document.query.filter_by(
+                            dataset_id=dataset.id,
+                            tenant_id=current_user.current_tenant_id,
+                            data_source_type="upload_file",
+                            enabled=True,
+                            name=file_name,
+                        ).first()
+                        if document:
+                            document.dataset_process_rule_id = dataset_process_rule.id
+                            document.updated_at = datetime.datetime.utcnow()
+                            document.created_from = created_from
+                            document.doc_form = document_data["doc_form"]
+                            document.doc_language = document_data["doc_language"]
+                            document.data_source_info = json.dumps(data_source_info)
+                            document.batch = batch
+                            document.indexing_status = "waiting"
+                            db.session.add(document)
+                            documents.append(document)
+                            duplicate_document_ids.append(document.id)
+                            continue
+                    document = DocumentService.build_document(
+                        dataset,
+                        dataset_process_rule.id,
+                        document_data["data_source"]["type"],
+                        document_data["doc_form"],
+                        document_data["doc_language"],
+                        data_source_info,
+                        created_from,
+                        position,
+                        account,
+                        file_name,
+                        batch,
+                    )
-                    ).first()
-                    if not data_source_binding:
-                        raise ValueError("Data source binding not found.")
-                    for page in notion_info["pages"]:
-                        if page["page_id"] not in exist_page_ids:
-                            data_source_info = {
-                                "notion_workspace_id": workspace_id,
-                                "notion_page_id": page["page_id"],
-                                "notion_page_icon": page["page_icon"],
-                                "type": page["type"],
-                            }
-                            document = DocumentService.build_document(
-                                dataset,
-                                dataset_process_rule.id,
-                                document_data["data_source"]["type"],
-                                document_data["doc_form"],
-                                document_data["doc_language"],
-                                data_source_info,
-                                created_from,
-                                position,
-                                account,
-                                page["page_name"],
-                                batch,
+                    db.session.add(document)
+                    db.session.flush()
+                    document_ids.append(document.id)
+                    documents.append(document)
+                    position += 1
+                elif document_data["data_source"]["type"] == "notion_import":
+                    notion_info_list = document_data["data_source"]["info_list"]["notion_info_list"]
+                    exist_page_ids = []
+                    exist_document = {}
+                    documents = Document.query.filter_by(
+                        dataset_id=dataset.id,
+                        tenant_id=current_user.current_tenant_id,
+                        data_source_type="notion_import",
+                        enabled=True,
+                    ).all()
+                    if documents:
+                        for document in documents:
+                            data_source_info = json.loads(document.data_source_info)
+                            exist_page_ids.append(data_source_info["notion_page_id"])
+                            exist_document[data_source_info["notion_page_id"]] = document.id
+                    for notion_info in notion_info_list:
+                        workspace_id = notion_info["workspace_id"]
+                        data_source_binding = DataSourceOauthBinding.query.filter(
+                            db.and_(
+                                DataSourceOauthBinding.tenant_id == current_user.current_tenant_id,
+                                DataSourceOauthBinding.provider == "notion",
+                                DataSourceOauthBinding.disabled == False,
+                                DataSourceOauthBinding.source_info["workspace_id"] == f'"{workspace_id}"',
                             )
-                            db.session.add(document)
-                            db.session.flush()
-                            document_ids.append(document.id)
-                            documents.append(document)
-                            position += 1
+                        ).first()
+                        if not data_source_binding:
+                            raise ValueError("Data source binding not found.")
+                        for page in notion_info["pages"]:
+                            if page["page_id"] not in exist_page_ids:
+                                data_source_info = {
+                                    "notion_workspace_id": workspace_id,
+                                    "notion_page_id": page["page_id"],
+                                    "notion_page_icon": page["page_icon"],
+                                    "type": page["type"],
+                                }
+                                document = DocumentService.build_document(
+                                    dataset,
+                                    dataset_process_rule.id,
+                                    document_data["data_source"]["type"],
+                                    document_data["doc_form"],
+                                    document_data["doc_language"],
+                                    data_source_info,
+                                    created_from,
+                                    position,
+                                    account,
+                                    page["page_name"],
+                                    batch,
+                                )
+                                db.session.add(document)
+                                db.session.flush()
+                                document_ids.append(document.id)
+                                documents.append(document)
+                                position += 1
+                            else:
+                                exist_document.pop(page["page_id"])
+                    # delete not selected documents
+                    if len(exist_document) > 0:
+                        clean_notion_document_task.delay(list(exist_document.values()), dataset.id)
+                elif document_data["data_source"]["type"] == "website_crawl":
+                    website_info = document_data["data_source"]["info_list"]["website_info_list"]
+                    urls = website_info["urls"]
+                    for url in urls:
+                        data_source_info = {
+                            "url": url,
+                            "provider": website_info["provider"],
+                            "job_id": website_info["job_id"],
+                            "only_main_content": website_info.get("only_main_content", False),
+                            "mode": "crawl",
+                        }
+                        if len(url) > 255:
+                            document_name = url[:200] + "..."
                         else:
-                        exist_document.pop(page["page_id"])
-            # delete not selected documents
-            if len(exist_document) > 0:
-                clean_notion_document_task.delay(list(exist_document.values()), dataset.id)
-        elif document_data["data_source"]["type"] == "website_crawl":
-            website_info = document_data["data_source"]["info_list"]["website_info_list"]
-            urls = website_info["urls"]
-            for url in urls:
-                data_source_info = {
-                    "url": url,
-                    "provider": website_info["provider"],
-                    "job_id": website_info["job_id"],
-                    "only_main_content": website_info.get("only_main_content", False),
-                    "mode": "crawl",
-                }
-                if len(url) > 255:
-                    document_name = url[:200] + "..."
-                else:
-                    document_name = url
-                document = DocumentService.build_document(
-                    dataset,
-                    dataset_process_rule.id,
-                    document_data["data_source"]["type"],
-                    document_data["doc_form"],
-                    document_data["doc_language"],
-                    data_source_info,
-                    created_from,
-                    position,
-                    account,
-                    document_name,
-                    batch,
-                )
-                db.session.add(document)
-                db.session.flush()
-                document_ids.append(document.id)
-                documents.append(document)
-                position += 1
-        db.session.commit()
+                            document_name = url
+                        document = DocumentService.build_document(
+                            dataset,
+                            dataset_process_rule.id,
+                            document_data["data_source"]["type"],
+                            document_data["doc_form"],
+                            document_data["doc_language"],
+                            data_source_info,
+                            created_from,
+                            position,
+                            account,
+                            document_name,
+                            batch,
+                        )
+                        db.session.add(document)
+                        db.session.flush()
+                        document_ids.append(document.id)
+                        documents.append(document)
+                        position += 1
+                db.session.commit()
 
-            # trigger async task
-            if document_ids:
-                document_indexing_task.delay(dataset.id, document_ids)
-            if duplicate_document_ids:
-                duplicate_document_indexing_task.delay(dataset.id, duplicate_document_ids)
+                # trigger async task
+                if document_ids:
+                    document_indexing_task.delay(dataset.id, document_ids)
+                if duplicate_document_ids:
+                    duplicate_document_indexing_task.delay(dataset.id, duplicate_document_ids)
 
-            return documents, batch
+                return documents, batch
 
     @staticmethod
     def check_documents_upload_quota(count: int, features: FeatureModel):