Преглед на файлове

feat(website-crawl): add jina reader as additional alternative for website crawling (#8761)

Zhaofeng Miao преди 6 месеца
родител
ревизия
369e1e6f58
променени са 38 файла, в които са добавени 927 реда и са изтрити 75 реда
  1. + 4 - 2
      api/controllers/console/datasets/website.py
  2. + 10 - 0
      api/core/rag/extractor/extract_processor.py
  3. + 35 - 0
      api/core/rag/extractor/jina_reader_extractor.py
  4. + 3 - 0
      api/services/auth/api_key_auth_factory.py
  5. + 44 - 0
      api/services/auth/jina.py
  6. + 100 - 0
      api/services/website_service.py
  7. BIN
      web/app/components/datasets/create/assets/jina.png
  8. + 8 - 4
      web/app/components/datasets/create/index.tsx
  9. + 7 - 4
      web/app/components/datasets/create/step-one/index.tsx
  10. + 7 - 4
      web/app/components/datasets/create/step-two/index.tsx
  11. + 11 - 0
      web/app/components/datasets/create/website/base/checkbox-with-label.tsx
  12. + 0 - 0
      web/app/components/datasets/create/website/base/crawled-result-item.tsx
  13. + 1 - 1
      web/app/components/datasets/create/website/base/crawled-result.tsx
  14. + 0 - 0
      web/app/components/datasets/create/website/base/crawling.tsx
  15. + 0 - 0
      web/app/components/datasets/create/website/base/error-message.tsx
  16. + 0 - 0
      web/app/components/datasets/create/website/base/field.tsx
  17. + 0 - 0
      web/app/components/datasets/create/website/base/input.tsx
  18. + 0 - 0
      web/app/components/datasets/create/website/base/mock-crawl-result.ts
  19. + 0 - 0
      web/app/components/datasets/create/website/base/options-wrap.tsx
  20. + 0 - 0
      web/app/components/datasets/create/website/base/url-input.tsx
  21. + 5 - 5
      web/app/components/datasets/create/website/firecrawl/index.tsx
  22. + 2 - 2
      web/app/components/datasets/create/website/firecrawl/options.tsx
  23. + 6 - 0
      web/app/components/datasets/create/website/index.module.css
  24. + 83 - 17
      web/app/components/datasets/create/website/index.tsx
  25. + 42 - 0
      web/app/components/datasets/create/website/jina-reader/header.tsx
  26. + 232 - 0
      web/app/components/datasets/create/website/jina-reader/index.tsx
  27. + 59 - 0
      web/app/components/datasets/create/website/jina-reader/options.tsx
  28. + 33 - 12
      web/app/components/datasets/create/website/no-data.tsx
  29. + 1 - 1
      web/app/components/header/account-setting/data-source-page/data-source-website/config-firecrawl-modal.tsx
  30. + 140 - 0
      web/app/components/header/account-setting/data-source-page/data-source-website/config-jina-reader-modal.tsx
  31. + 35 - 16
      web/app/components/header/account-setting/data-source-page/data-source-website/index.tsx
  32. + 3 - 1
      web/app/components/header/account-setting/data-source-page/index.tsx
  33. + 4 - 1
      web/app/components/header/account-setting/data-source-page/panel/index.tsx
  34. + 14 - 1
      web/i18n/en-US/dataset-creation.ts
  35. + 14 - 1
      web/i18n/zh-Hans/dataset-creation.ts
  36. + 1 - 0
      web/models/common.ts
  37. + 1 - 0
      web/models/datasets.ts
  38. + 22 - 3
      web/service/datasets.ts

+ 4 - 2
api/controllers/console/datasets/website.py

@@ -14,7 +14,9 @@ class WebsiteCrawlApi(Resource):
     @account_initialization_required
     def post(self):
         parser = reqparse.RequestParser()
-        parser.add_argument("provider", type=str, choices=["firecrawl"], required=True, nullable=True, location="json")
+        parser.add_argument(
+            "provider", type=str, choices=["firecrawl", "jinareader"], required=True, nullable=True, location="json"
+        )
         parser.add_argument("url", type=str, required=True, nullable=True, location="json")
         parser.add_argument("options", type=dict, required=True, nullable=True, location="json")
         args = parser.parse_args()
@@ -33,7 +35,7 @@ class WebsiteCrawlStatusApi(Resource):
     @account_initialization_required
     def get(self, job_id: str):
         parser = reqparse.RequestParser()
-        parser.add_argument("provider", type=str, choices=["firecrawl"], required=True, location="args")
+        parser.add_argument("provider", type=str, choices=["firecrawl", "jinareader"], required=True, location="args")
         args = parser.parse_args()
         # get crawl status
         try:

+ 10 - 0
api/core/rag/extractor/extract_processor.py

@@ -12,6 +12,7 @@ from core.rag.extractor.entity.extract_setting import ExtractSetting
 from core.rag.extractor.excel_extractor import ExcelExtractor
 from core.rag.extractor.firecrawl.firecrawl_web_extractor import FirecrawlWebExtractor
 from core.rag.extractor.html_extractor import HtmlExtractor
+from core.rag.extractor.jina_reader_extractor import JinaReaderWebExtractor
 from core.rag.extractor.markdown_extractor import MarkdownExtractor
 from core.rag.extractor.notion_extractor import NotionExtractor
 from core.rag.extractor.pdf_extractor import PdfExtractor
@@ -171,6 +172,15 @@ class ExtractProcessor:
                     only_main_content=extract_setting.website_info.only_main_content,
                 )
                 return extractor.extract()
+            elif extract_setting.website_info.provider == "jinareader":
+                extractor = JinaReaderWebExtractor(
+                    url=extract_setting.website_info.url,
+                    job_id=extract_setting.website_info.job_id,
+                    tenant_id=extract_setting.website_info.tenant_id,
+                    mode=extract_setting.website_info.mode,
+                    only_main_content=extract_setting.website_info.only_main_content,
+                )
+                return extractor.extract()
             else:
                 raise ValueError(f"Unsupported website provider: {extract_setting.website_info.provider}")
         else:

+ 35 - 0
api/core/rag/extractor/jina_reader_extractor.py

@@ -0,0 +1,35 @@
+from core.rag.extractor.extractor_base import BaseExtractor
+from core.rag.models.document import Document
+from services.website_service import WebsiteService
+
+
+class JinaReaderWebExtractor(BaseExtractor):
+    """
+    Crawl and scrape websites and return content in clean llm-ready markdown.
+    """
+
+    def __init__(self, url: str, job_id: str, tenant_id: str, mode: str = "crawl", only_main_content: bool = False):
+        """Initialize with url, api_key, base_url and mode."""
+        self._url = url
+        self.job_id = job_id
+        self.tenant_id = tenant_id
+        self.mode = mode
+        self.only_main_content = only_main_content
+
+    def extract(self) -> list[Document]:
+        """Extract content from the URL."""
+        documents = []
+        if self.mode == "crawl":
+            crawl_data = WebsiteService.get_crawl_url_data(self.job_id, "jinareader", self._url, self.tenant_id)
+            if crawl_data is None:
+                return []
+            document = Document(
+                page_content=crawl_data.get("content", ""),
+                metadata={
+                    "source_url": crawl_data.get("url"),
+                    "description": crawl_data.get("description"),
+                    "title": crawl_data.get("title"),
+                },
+            )
+            documents.append(document)
+        return documents

+ 3 - 0
api/services/auth/api_key_auth_factory.py

@@ -1,10 +1,13 @@
 from services.auth.firecrawl import FirecrawlAuth
+from services.auth.jina import JinaAuth
 
 
 class ApiKeyAuthFactory:
     def __init__(self, provider: str, credentials: dict):
         if provider == "firecrawl":
             self.auth = FirecrawlAuth(credentials)
+        elif provider == "jinareader":
+            self.auth = JinaAuth(credentials)
         else:
             raise ValueError("Invalid provider")
 

+ 44 - 0
api/services/auth/jina.py

@@ -0,0 +1,44 @@
+import json
+
+import requests
+
+from services.auth.api_key_auth_base import ApiKeyAuthBase
+
+
+class JinaAuth(ApiKeyAuthBase):
+    def __init__(self, credentials: dict):
+        super().__init__(credentials)
+        auth_type = credentials.get("auth_type")
+        if auth_type != "bearer":
+            raise ValueError("Invalid auth type, Jina Reader auth type must be Bearer")
+        self.api_key = credentials.get("config").get("api_key", None)
+
+        if not self.api_key:
+            raise ValueError("No API key provided")
+
+    def validate_credentials(self):
+        headers = self._prepare_headers()
+        options = {
+            "url": "https://example.com",
+        }
+        response = self._post_request("https://r.jina.ai", options, headers)
+        if response.status_code == 200:
+            return True
+        else:
+            self._handle_error(response)
+
+    def _prepare_headers(self):
+        return {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_key}"}
+
+    def _post_request(self, url, data, headers):
+        return requests.post(url, headers=headers, json=data)
+
+    def _handle_error(self, response):
+        if response.status_code in {402, 409, 500}:
+            error_message = response.json().get("error", "Unknown error occurred")
+            raise Exception(f"Failed to authorize. Status code: {response.status_code}. Error: {error_message}")
+        else:
+            if response.text:
+                error_message = json.loads(response.text).get("error", "Unknown error occurred")
+                raise Exception(f"Failed to authorize. Status code: {response.status_code}. Error: {error_message}")
+            raise Exception(f"Unexpected error occurred while trying to authorize. Status code: {response.status_code}")

+ 100 - 0
api/services/website_service.py

@@ -1,6 +1,7 @@
 import datetime
 import json
 
+import requests
 from flask_login import current_user
 
 from core.helper import encrypter
@@ -65,6 +66,35 @@ class WebsiteService:
             time = str(datetime.datetime.now().timestamp())
             redis_client.setex(website_crawl_time_cache_key, 3600, time)
             return {"status": "active", "job_id": job_id}
+        elif provider == "jinareader":
+            api_key = encrypter.decrypt_token(
+                tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key")
+            )
+            crawl_sub_pages = options.get("crawl_sub_pages", False)
+            if not crawl_sub_pages:
+                response = requests.get(
+                    f"https://r.jina.ai/{url}",
+                    headers={"Accept": "application/json", "Authorization": f"Bearer {api_key}"},
+                )
+                if response.json().get("code") != 200:
+                    raise ValueError("Failed to crawl")
+                return {"status": "active", "data": response.json().get("data")}
+            else:
+                response = requests.post(
+                    "https://adaptivecrawl-kir3wx7b3a-uc.a.run.app",
+                    json={
+                        "url": url,
+                        "maxPages": options.get("limit", 1),
+                        "useSitemap": options.get("use_sitemap", True),
+                    },
+                    headers={
+                        "Content-Type": "application/json",
+                        "Authorization": f"Bearer {api_key}",
+                    },
+                )
+                if response.json().get("code") != 200:
+                    raise ValueError("Failed to crawl")
+                return {"status": "active", "job_id": response.json().get("data", {}).get("taskId")}
         else:
             raise ValueError("Invalid provider")
 
@@ -93,6 +123,42 @@ class WebsiteService:
                     time_consuming = abs(end_time - float(start_time))
                     crawl_status_data["time_consuming"] = f"{time_consuming:.2f}"
                     redis_client.delete(website_crawl_time_cache_key)
+        elif provider == "jinareader":
+            api_key = encrypter.decrypt_token(
+                tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key")
+            )
+            response = requests.post(
+                "https://adaptivecrawlstatus-kir3wx7b3a-uc.a.run.app",
+                headers={"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"},
+                json={"taskId": job_id},
+            )
+            data = response.json().get("data", {})
+            crawl_status_data = {
+                "status": data.get("status", "active"),
+                "job_id": job_id,
+                "total": len(data.get("urls", [])),
+                "current": len(data.get("processed", [])) + len(data.get("failed", [])),
+                "data": [],
+                "time_consuming": data.get("duration", 0) / 1000,
+            }
+
+            if crawl_status_data["status"] == "completed":
+                response = requests.post(
+                    "https://adaptivecrawlstatus-kir3wx7b3a-uc.a.run.app",
+                    headers={"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"},
+                    json={"taskId": job_id, "urls": list(data.get("processed", {}).keys())},
+                )
+                data = response.json().get("data", {})
+                formatted_data = [
+                    {
+                        "title": item.get("data", {}).get("title"),
+                        "source_url": item.get("data", {}).get("url"),
+                        "description": item.get("data", {}).get("description"),
+                        "markdown": item.get("data", {}).get("content"),
+                    }
+                    for item in data.get("processed", {}).values()
+                ]
+                crawl_status_data["data"] = formatted_data
         else:
             raise ValueError("Invalid provider")
         return crawl_status_data
@@ -119,6 +185,40 @@ class WebsiteService:
                     if item.get("source_url") == url:
                         return item
             return None
+        elif provider == "jinareader":
+            file_key = "website_files/" + job_id + ".txt"
+            if storage.exists(file_key):
+                data = storage.load_once(file_key)
+                if data:
+                    data = json.loads(data.decode("utf-8"))
+            elif not job_id:
+                response = requests.get(
+                    f"https://r.jina.ai/{url}",
+                    headers={"Accept": "application/json", "Authorization": f"Bearer {api_key}"},
+                )
+                if response.json().get("code") != 200:
+                    raise ValueError("Failed to crawl")
+                return response.json().get("data")
+            else:
+                api_key = encrypter.decrypt_token(tenant_id=tenant_id, token=credentials.get("config").get("api_key"))
+                response = requests.post(
+                    "https://adaptivecrawlstatus-kir3wx7b3a-uc.a.run.app",
+                    headers={"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"},
+                    json={"taskId": job_id},
+                )
+                data = response.json().get("data", {})
+                if data.get("status") != "completed":
+                    raise ValueError("Crawl job is not completed")
+
+                response = requests.post(
+                    "https://adaptivecrawlstatus-kir3wx7b3a-uc.a.run.app",
+                    headers={"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"},
+                    json={"taskId": job_id, "urls": list(data.get("processed", {}).keys())},
+                )
+                data = response.json().get("data", {})
+                for item in data.get("processed", {}).values():
+                    if item.get("data", {}).get("url") == url:
+                        return item.get("data", {})
         else:
             raise ValueError("Invalid provider")
 

BIN
web/app/components/datasets/create/assets/jina.png


+ 8 - 4
web/app/components/datasets/create/index.tsx

@@ -11,7 +11,7 @@ import { DataSourceType } from '@/models/datasets'
 import type { CrawlOptions, CrawlResultItem, DataSet, FileItem, createDocumentResponse } from '@/models/datasets'
 import { fetchDataSource } from '@/service/common'
 import { fetchDatasetDetail } from '@/service/datasets'
-import type { NotionPage } from '@/models/common'
+import { DataSourceProvider, type NotionPage } from '@/models/common'
 import { useModalContext } from '@/context/modal-context'
 import { useDefaultModel } from '@/app/components/header/account-setting/model-provider-page/hooks'
 
@@ -26,6 +26,7 @@ const DEFAULT_CRAWL_OPTIONS: CrawlOptions = {
   excludes: '',
   limit: 10,
   max_depth: '',
+  use_sitemap: true,
 }
 
 const DatasetUpdateForm = ({ datasetId }: DatasetUpdateFormProps) => {
@@ -51,7 +52,8 @@ const DatasetUpdateForm = ({ datasetId }: DatasetUpdateFormProps) => {
   const updateFileList = (preparedFiles: FileItem[]) => {
     setFiles(preparedFiles)
   }
-  const [fireCrawlJobId, setFireCrawlJobId] = useState('')
+  const [websiteCrawlProvider, setWebsiteCrawlProvider] = useState<DataSourceProvider>(DataSourceProvider.fireCrawl)
+  const [websiteCrawlJobId, setWebsiteCrawlJobId] = useState('')
 
   const updateFile = (fileItem: FileItem, progress: number, list: FileItem[]) => {
     const targetIndex = list.findIndex(file => file.fileID === fileItem.fileID)
@@ -137,7 +139,8 @@ const DatasetUpdateForm = ({ datasetId }: DatasetUpdateFormProps) => {
             onStepChange={nextStep}
             websitePages={websitePages}
             updateWebsitePages={setWebsitePages}
-            onFireCrawlJobIdChange={setFireCrawlJobId}
+            onWebsiteCrawlProviderChange={setWebsiteCrawlProvider}
+            onWebsiteCrawlJobIdChange={setWebsiteCrawlJobId}
             crawlOptions={crawlOptions}
             onCrawlOptionsChange={setCrawlOptions}
           />
@@ -151,7 +154,8 @@ const DatasetUpdateForm = ({ datasetId }: DatasetUpdateFormProps) => {
           files={fileList.map(file => file.file)}
           notionPages={notionPages}
           websitePages={websitePages}
-          fireCrawlJobId={fireCrawlJobId}
+          websiteCrawlProvider={websiteCrawlProvider}
+          websiteCrawlJobId={websiteCrawlJobId}
           onStepChange={changeStep}
           updateIndexingTypeCache={updateIndexingTypeCache}
           updateResultCache={updateResultCache}

+ 7 - 4
web/app/components/datasets/create/step-one/index.tsx

@@ -10,7 +10,7 @@ import WebsitePreview from '../website/preview'
 import s from './index.module.css'
 import cn from '@/utils/classnames'
 import type { CrawlOptions, CrawlResultItem, FileItem } from '@/models/datasets'
-import type { NotionPage } from '@/models/common'
+import type { DataSourceProvider, NotionPage } from '@/models/common'
 import { DataSourceType } from '@/models/datasets'
 import Button from '@/app/components/base/button'
 import { NotionPageSelector } from '@/app/components/base/notion-page-selector'
@@ -33,7 +33,8 @@ type IStepOneProps = {
   changeType: (type: DataSourceType) => void
   websitePages?: CrawlResultItem[]
   updateWebsitePages: (value: CrawlResultItem[]) => void
-  onFireCrawlJobIdChange: (jobId: string) => void
+  onWebsiteCrawlProviderChange: (provider: DataSourceProvider) => void
+  onWebsiteCrawlJobIdChange: (jobId: string) => void
   crawlOptions: CrawlOptions
   onCrawlOptionsChange: (payload: CrawlOptions) => void
 }
@@ -69,7 +70,8 @@ const StepOne = ({
   updateNotionPages,
   websitePages = [],
   updateWebsitePages,
-  onFireCrawlJobIdChange,
+  onWebsiteCrawlProviderChange,
+  onWebsiteCrawlJobIdChange,
   crawlOptions,
   onCrawlOptionsChange,
 }: IStepOneProps) => {
@@ -229,7 +231,8 @@ const StepOne = ({
                   onPreview={setCurrentWebsite}
                   checkedCrawlResult={websitePages}
                   onCheckedCrawlResultChange={updateWebsitePages}
-                  onJobIdChange={onFireCrawlJobIdChange}
+                  onCrawlProviderChange={onWebsiteCrawlProviderChange}
+                  onJobIdChange={onWebsiteCrawlJobIdChange}
                   crawlOptions={crawlOptions}
                   onCrawlOptionsChange={onCrawlOptionsChange}
                 />

+ 7 - 4
web/app/components/datasets/create/step-two/index.tsx

@@ -33,6 +33,7 @@ import { ensureRerankModelSelected, isReRankModelSelected } from '@/app/componen
 import Toast from '@/app/components/base/toast'
 import { formatNumber } from '@/utils/format'
 import type { NotionPage } from '@/models/common'
+import { DataSourceProvider } from '@/models/common'
 import { DataSourceType, DocForm } from '@/models/datasets'
 import NotionIcon from '@/app/components/base/notion-icon'
 import Switch from '@/app/components/base/switch'
@@ -63,7 +64,8 @@ type StepTwoProps = {
   notionPages?: NotionPage[]
   websitePages?: CrawlResultItem[]
   crawlOptions?: CrawlOptions
-  fireCrawlJobId?: string
+  websiteCrawlProvider?: DataSourceProvider
+  websiteCrawlJobId?: string
   onStepChange?: (delta: number) => void
   updateIndexingTypeCache?: (type: string) => void
   updateResultCache?: (res: createDocumentResponse) => void
@@ -94,7 +96,8 @@ const StepTwo = ({
   notionPages = [],
   websitePages = [],
   crawlOptions,
-  fireCrawlJobId = '',
+  websiteCrawlProvider = DataSourceProvider.fireCrawl,
+  websiteCrawlJobId = '',
   onStepChange,
   updateIndexingTypeCache,
   updateResultCache,
@@ -260,8 +263,8 @@ const StepTwo = ({
 
   const getWebsiteInfo = () => {
     return {
-      provider: 'firecrawl',
-      job_id: fireCrawlJobId,
+      provider: websiteCrawlProvider,
+      job_id: websiteCrawlJobId,
       urls: websitePages.map(page => page.source_url),
       only_main_content: crawlOptions?.only_main_content,
     }

+ 11 - 0
web/app/components/datasets/create/website/firecrawl/base/checkbox-with-label.tsx → web/app/components/datasets/create/website/base/checkbox-with-label.tsx

@@ -3,6 +3,7 @@ import type { FC } from 'react'
 import React from 'react'
 import cn from '@/utils/classnames'
 import Checkbox from '@/app/components/base/checkbox'
+import Tooltip from '@/app/components/base/tooltip'
 
 type Props = {
   className?: string
@@ -10,6 +11,7 @@ type Props = {
   onChange: (isChecked: boolean) => void
   label: string
   labelClassName?: string
+  tooltip?: string
 }
 
 const CheckboxWithLabel: FC<Props> = ({
@@ -18,11 +20,20 @@ const CheckboxWithLabel: FC<Props> = ({
   onChange,
   label,
   labelClassName,
+  tooltip,
 }) => {
   return (
     <label className={cn(className, 'flex items-center h-7 space-x-2')}>
       <Checkbox checked={isChecked} onCheck={() => onChange(!isChecked)} />
       <div className={cn(labelClassName, 'text-sm font-normal text-gray-800')}>{label}</div>
+      {tooltip && (
+        <Tooltip
+          popupContent={
+            <div className='w-[200px]'>{tooltip}</div>
+          }
+          triggerClassName='ml-0.5 w-4 h-4'
+        />
+      )}
     </label>
   )
 }

+ 0 - 0
web/app/components/datasets/create/website/firecrawl/crawled-result-item.tsx → web/app/components/datasets/create/website/base/crawled-result-item.tsx


+ 1 - 1
web/app/components/datasets/create/website/firecrawl/crawled-result.tsx → web/app/components/datasets/create/website/base/crawled-result.tsx

@@ -2,7 +2,7 @@
 import type { FC } from 'react'
 import React, { useCallback } from 'react'
 import { useTranslation } from 'react-i18next'
-import CheckboxWithLabel from './base/checkbox-with-label'
+import CheckboxWithLabel from './checkbox-with-label'
 import CrawledResultItem from './crawled-result-item'
 import cn from '@/utils/classnames'
 import type { CrawlResultItem } from '@/models/datasets'

+ 0 - 0
web/app/components/datasets/create/website/firecrawl/crawling.tsx → web/app/components/datasets/create/website/base/crawling.tsx


+ 0 - 0
web/app/components/datasets/create/website/firecrawl/base/error-message.tsx → web/app/components/datasets/create/website/base/error-message.tsx


+ 0 - 0
web/app/components/datasets/create/website/firecrawl/base/field.tsx → web/app/components/datasets/create/website/base/field.tsx


+ 0 - 0
web/app/components/datasets/create/website/firecrawl/base/input.tsx → web/app/components/datasets/create/website/base/input.tsx


+ 0 - 0
web/app/components/datasets/create/website/firecrawl/mock-crawl-result.ts → web/app/components/datasets/create/website/base/mock-crawl-result.ts


+ 0 - 0
web/app/components/datasets/create/website/firecrawl/base/options-wrap.tsx → web/app/components/datasets/create/website/base/options-wrap.tsx


+ 0 - 0
web/app/components/datasets/create/website/firecrawl/base/url-input.tsx → web/app/components/datasets/create/website/base/url-input.tsx


+ 5 - 5
web/app/components/datasets/create/website/firecrawl/index.tsx

@@ -2,13 +2,13 @@
 import type { FC } from 'react'
 import React, { useCallback, useEffect, useState } from 'react'
 import { useTranslation } from 'react-i18next'
+import UrlInput from '../base/url-input'
+import OptionsWrap from '../base/options-wrap'
+import CrawledResult from '../base/crawled-result'
+import Crawling from '../base/crawling'
+import ErrorMessage from '../base/error-message'
 import Header from './header'
-import UrlInput from './base/url-input'
-import OptionsWrap from './base/options-wrap'
 import Options from './options'
-import CrawledResult from './crawled-result'
-import Crawling from './crawling'
-import ErrorMessage from './base/error-message'
 import cn from '@/utils/classnames'
 import { useModalContext } from '@/context/modal-context'
 import type { CrawlOptions, CrawlResultItem } from '@/models/datasets'

+ 2 - 2
web/app/components/datasets/create/website/firecrawl/options.tsx

@@ -2,8 +2,8 @@
 import type { FC } from 'react'
 import React, { useCallback } from 'react'
 import { useTranslation } from 'react-i18next'
-import CheckboxWithLabel from './base/checkbox-with-label'
-import Field from './base/field'
+import CheckboxWithLabel from '../base/checkbox-with-label'
+import Field from '../base/field'
 import cn from '@/utils/classnames'
 import type { CrawlOptions } from '@/models/datasets'
 

+ 6 - 0
web/app/components/datasets/create/website/index.module.css

@@ -0,0 +1,6 @@
+.jinaLogo {
+  @apply w-4 h-4 bg-center bg-no-repeat inline-block;
+  background-color: #F5FAFF;
+  background-image: url(../assets/jina.png);
+  background-size: 16px;
+}

+ 83 - 17
web/app/components/datasets/create/website/index.tsx

@@ -1,8 +1,12 @@
 'use client'
 import type { FC } from 'react'
 import React, { useCallback, useEffect, useState } from 'react'
+import { useTranslation } from 'react-i18next'
+import s from './index.module.css'
 import NoData from './no-data'
 import Firecrawl from './firecrawl'
+import JinaReader from './jina-reader'
+import cn from '@/utils/classnames'
 import { useModalContext } from '@/context/modal-context'
 import type { CrawlOptions, CrawlResultItem } from '@/models/datasets'
 import { fetchDataSources } from '@/service/datasets'
@@ -12,6 +16,7 @@ type Props = {
   onPreview: (payload: CrawlResultItem) => void
   checkedCrawlResult: CrawlResultItem[]
   onCheckedCrawlResultChange: (payload: CrawlResultItem[]) => void
+  onCrawlProviderChange: (provider: DataSourceProvider) => void
   onJobIdChange: (jobId: string) => void
   crawlOptions: CrawlOptions
   onCrawlOptionsChange: (payload: CrawlOptions) => void
@@ -21,17 +26,32 @@ const Website: FC<Props> = ({
   onPreview,
   checkedCrawlResult,
   onCheckedCrawlResultChange,
+  onCrawlProviderChange,
   onJobIdChange,
   crawlOptions,
   onCrawlOptionsChange,
 }) => {
+  const { t } = useTranslation()
   const { setShowAccountSettingModal } = useModalContext()
   const [isLoaded, setIsLoaded] = useState(false)
-  const [isSetFirecrawlApiKey, setIsSetFirecrawlApiKey] = useState(false)
+  const [selectedProvider, setSelectedProvider] = useState<DataSourceProvider>(DataSourceProvider.jinaReader)
+  const [sources, setSources] = useState<DataSourceItem[]>([])
+
+  useEffect(() => {
+    onCrawlProviderChange(selectedProvider)
+  }, [selectedProvider, onCrawlProviderChange])
+
   const checkSetApiKey = useCallback(async () => {
     const res = await fetchDataSources() as any
-    const isFirecrawlSet = res.sources.some((item: DataSourceItem) => item.provider === DataSourceProvider.fireCrawl)
-    setIsSetFirecrawlApiKey(isFirecrawlSet)
+    setSources(res.sources)
+
+    // If users have configured one of the providers, select it.
+    const availableProviders = res.sources.filter((item: DataSourceItem) =>
+      [DataSourceProvider.jinaReader, DataSourceProvider.fireCrawl].includes(item.provider),
+    )
+
+    if (availableProviders.length > 0)
+      setSelectedProvider(availableProviders[0].provider)
   }, [])
 
   useEffect(() => {
@@ -52,20 +72,66 @@ const Website: FC<Props> = ({
 
   return (
     <div>
-      {isSetFirecrawlApiKey
-        ? (
-          <Firecrawl
-            onPreview={onPreview}
-            checkedCrawlResult={checkedCrawlResult}
-            onCheckedCrawlResultChange={onCheckedCrawlResultChange}
-            onJobIdChange={onJobIdChange}
-            crawlOptions={crawlOptions}
-            onCrawlOptionsChange={onCrawlOptionsChange}
-          />
-        )
-        : (
-          <NoData onConfig={handleOnConfig} />
-        )}
+      <div className="mb-4">
+        <div className="font-medium text-gray-700 mb-2 h-6">
+          {t('datasetCreation.stepOne.website.chooseProvider')}
+        </div>
+        <div className="flex space-x-2">
+          <button
+            className={`px-4 py-2 text-sm font-medium rounded-md flex items-center justify-center ${
+              selectedProvider === DataSourceProvider.jinaReader
+                ? 'bg-primary-50 text-primary-600'
+                : 'bg-gray-100 text-gray-600 hover:bg-gray-200'
+            }`}
+            onClick={() => setSelectedProvider(DataSourceProvider.jinaReader)}
+          >
+            <span className={cn(s.jinaLogo, 'mr-2')} />
+            <span>Jina Reader</span>
+          </button>
+          <button
+            className={`px-4 py-2 text-sm font-medium rounded-md ${
+              selectedProvider === DataSourceProvider.fireCrawl
+                ? 'bg-primary-50 text-primary-600'
+                : 'bg-gray-100 text-gray-600 hover:bg-gray-200'
+            }`}
+            onClick={() => setSelectedProvider(DataSourceProvider.fireCrawl)}
+          >
+            🔥 Firecrawl
+          </button>
+        </div>
+      </div>
+
+      {
+        selectedProvider === DataSourceProvider.fireCrawl
+          ? sources.find(source => source.provider === DataSourceProvider.fireCrawl)
+            ? (
+              <Firecrawl
+                onPreview={onPreview}
+                checkedCrawlResult={checkedCrawlResult}
+                onCheckedCrawlResultChange={onCheckedCrawlResultChange}
+                onJobIdChange={onJobIdChange}
+                crawlOptions={crawlOptions}
+                onCrawlOptionsChange={onCrawlOptionsChange}
+              />
+            )
+            : (
+              <NoData onConfig={handleOnConfig} provider={selectedProvider} />
+            )
+          : sources.find(source => source.provider === DataSourceProvider.jinaReader)
+            ? (
+              <JinaReader
+                onPreview={onPreview}
+                checkedCrawlResult={checkedCrawlResult}
+                onCheckedCrawlResultChange={onCheckedCrawlResultChange}
+                onJobIdChange={onJobIdChange}
+                crawlOptions={crawlOptions}
+                onCrawlOptionsChange={onCrawlOptionsChange}
+              />
+            )
+            : (
+              <NoData onConfig={handleOnConfig} provider={selectedProvider} />
+            )
+      }
     </div>
   )
 }

+ 42 - 0
web/app/components/datasets/create/website/jina-reader/header.tsx

@@ -0,0 +1,42 @@
+'use client'
+import type { FC } from 'react'
+import React from 'react'
+import { useTranslation } from 'react-i18next'
+import { Settings01 } from '@/app/components/base/icons/src/vender/line/general'
+import { BookOpen01 } from '@/app/components/base/icons/src/vender/line/education'
+
+const I18N_PREFIX = 'datasetCreation.stepOne.website'
+
+type Props = {
+  onSetting: () => void
+}
+
+/**
+ * Title row of the Jina Reader crawl panel.
+ *
+ * Renders the translated panel title, a settings icon that invokes
+ * `onSetting` when clicked, and an external link to the Jina Reader site.
+ */
+const Header: FC<Props> = (props) => {
+  const { onSetting } = props
+  const { t } = useTranslation()
+
+  // Left-hand side: title, a thin vertical divider, then the settings trigger.
+  const titleArea = (
+    <div className='flex items-center'>
+      <div className='text-base font-medium text-gray-700'>{t(`${I18N_PREFIX}.jinaReaderTitle`)}</div>
+      <div className='ml-2 mr-1 w-px h-3.5 bg-gray-200'></div>
+      <div
+        className='p-1 rounded-md hover:bg-black/5 cursor-pointer'
+        onClick={onSetting}
+      >
+        <Settings01 className='w-3.5 h-3.5 text-gray-500' />
+      </div>
+    </div>
+  )
+
+  // Right-hand side: documentation link, opened in a new tab.
+  const docLink = (
+    <a
+      href='https://jina.ai/reader'
+      target='_blank'
+      rel='noopener noreferrer'
+      className='flex items-center text-xs text-primary-600'
+    >
+      <BookOpen01 className='mr-1 w-3.5 h-3.5 text-primary-600' />
+      {t(`${I18N_PREFIX}.jinaReaderDoc`)}
+    </a>
+  )
+
+  return (
+    <div className='flex h-6 items-center justify-between'>
+      {titleArea}
+      {docLink}
+    </div>
+  )
+}
+export default React.memo(Header)

+ 232 - 0
web/app/components/datasets/create/website/jina-reader/index.tsx

@@ -0,0 +1,232 @@
+'use client'
+import type { FC } from 'react'
+import React, { useCallback, useEffect, useState } from 'react'
+import { useTranslation } from 'react-i18next'
+import UrlInput from '../base/url-input'
+import OptionsWrap from '../base/options-wrap'
+import CrawledResult from '../base/crawled-result'
+import Crawling from '../base/crawling'
+import ErrorMessage from '../base/error-message'
+import Header from './header'
+import Options from './options'
+import cn from '@/utils/classnames'
+import { useModalContext } from '@/context/modal-context'
+import Toast from '@/app/components/base/toast'
+import { checkJinaReaderTaskStatus, createJinaReaderTask } from '@/service/datasets'
+import { sleep } from '@/utils'
+import type { CrawlOptions, CrawlResultItem } from '@/models/datasets'
+
+const ERROR_I18N_PREFIX = 'common.errorMsg'
+const I18N_PREFIX = 'datasetCreation.stepOne.website'
+
+// Props of the JinaReader crawl panel; all state they refer to is owned by the parent.
+type Props = {
+  // Forwarded to CrawledResult as its preview handler.
+  onPreview: (payload: CrawlResultItem) => void
+  // Currently selected crawl results; passed to CrawledResult as `checkedList`.
+  checkedCrawlResult: CrawlResultItem[]
+  // Called with the crawled items whenever results arrive (default: select all) and on user selection changes.
+  onCheckedCrawlResultChange: (payload: CrawlResultItem[]) => void
+  // Called with the `job_id` returned when a crawl task is created.
+  onJobIdChange: (jobId: string) => void
+  // Crawl options sent as `options` when creating the task; `limit` is also used for validation and progress totals.
+  crawlOptions: CrawlOptions
+  // Forwarded to the Options form as its change handler.
+  onCrawlOptionsChange: (payload: CrawlOptions) => void
+}
+
+// Lifecycle of a crawl run inside the JinaReader panel.
+enum Step {
+  // Nothing has been run yet; only the URL input and options are shown.
+  init = 'init',
+  // A crawl is in progress; the Crawling progress view is shown.
+  running = 'running',
+  // The run ended; either the error message or the crawled results are shown.
+  finished = 'finished',
+}
+
+/**
+ * Website-crawl panel backed by Jina Reader.
+ *
+ * Validates the entered URL and the `limit` option, creates a crawl task via
+ * `createJinaReaderTask`, and then either shows the single-page result returned
+ * directly in `res.data` or polls `checkJinaReaderTaskStatus` with the returned
+ * `job_id` until the task reports `completed` or fails. Results are selected by
+ * default through `onCheckedCrawlResultChange`.
+ */
+const JinaReader: FC<Props> = ({
+  onPreview,
+  checkedCrawlResult,
+  onCheckedCrawlResultChange,
+  onJobIdChange,
+  crawlOptions,
+  onCrawlOptionsChange,
+}) => {
+  const { t } = useTranslation()
+  const [step, setStep] = useState<Step>(Step.init)
+  // A changing timestamp handed to OptionsWrap as `controlFoldOptions` once a run has started.
+  const [controlFoldOptions, setControlFoldOptions] = useState<number>(0)
+  useEffect(() => {
+    if (step !== Step.init)
+      setControlFoldOptions(Date.now())
+  }, [step])
+  const { setShowAccountSettingModal } = useModalContext()
+  // Opens the account settings modal on its data-source page.
+  const handleSetting = useCallback(() => {
+    setShowAccountSettingModal({
+      payload: 'data-source',
+    })
+  }, [setShowAccountSettingModal])
+
+  // Returns the first validation error (empty URL, non-http(s) URL, missing limit), if any.
+  const checkValid = useCallback((url: string) => {
+    let errorMsg = ''
+    if (!url) {
+      errorMsg = t(`${ERROR_I18N_PREFIX}.fieldRequired`, {
+        field: 'url',
+      })
+    }
+
+    if (!errorMsg && !((url.startsWith('http://') || url.startsWith('https://'))))
+      errorMsg = t(`${ERROR_I18N_PREFIX}.urlError`)
+
+    if (!errorMsg && (crawlOptions.limit === null || crawlOptions.limit === undefined || crawlOptions.limit === '')) {
+      errorMsg = t(`${ERROR_I18N_PREFIX}.fieldRequired`, {
+        field: t(`${I18N_PREFIX}.limit`),
+      })
+    }
+
+    return {
+      isValid: !errorMsg,
+      errorMsg,
+    }
+  }, [crawlOptions, t])
+
+  const isInit = step === Step.init
+  const isCrawlFinished = step === Step.finished
+  const isRunning = step === Step.running
+  const [crawlResult, setCrawlResult] = useState<{
+    current: number
+    total: number
+    data: CrawlResultItem[]
+    time_consuming: number | string
+  } | undefined>(undefined)
+  const [crawlErrorMessage, setCrawlErrorMessage] = useState('')
+  const showError = isCrawlFinished && crawlErrorMessage
+
+  // Polls the task status every 2.5s until it completes or fails, publishing
+  // intermediate progress and selecting the partial results along the way.
+  const waitForCrawlFinished = useCallback(async (jobId: string) => {
+    try {
+      const res = await checkJinaReaderTaskStatus(jobId) as any
+      if (res.status === 'completed') {
+        return {
+          isError: false,
+          data: {
+            ...res,
+            // Never report more pages than the user-configured limit.
+            total: Math.min(res.total, parseFloat(crawlOptions.limit as string)),
+          },
+        }
+      }
+      if (res.status === 'failed' || !res.status) {
+        return {
+          isError: true,
+          errorMessage: res.message,
+          data: {
+            data: [],
+          },
+        }
+      }
+      // update the progress
+      setCrawlResult({
+        ...res,
+        total: Math.min(res.total, parseFloat(crawlOptions.limit as string)),
+      })
+      onCheckedCrawlResultChange(res.data || []) // default select the crawl result
+      await sleep(2500)
+      return await waitForCrawlFinished(jobId)
+    }
+    catch (e: any) {
+      // The rejection is expected to be Response-like (exposing json()); guard the call so
+      // that any other error still produces a structured failure instead of throwing here.
+      const errorBody = typeof e?.json === 'function' ? await e.json() : undefined
+      return {
+        isError: true,
+        errorMessage: errorBody?.message,
+        data: {
+          data: [],
+        },
+      }
+    }
+    // onCheckedCrawlResultChange is called above, so it must be a dependency to avoid a stale handler.
+  }, [crawlOptions.limit, onCheckedCrawlResultChange])
+
+  // Validates the input, starts the crawl and stores either the results or an error message.
+  const handleRun = useCallback(async (url: string) => {
+    const { isValid, errorMsg } = checkValid(url)
+    if (!isValid) {
+      Toast.notify({
+        message: errorMsg!,
+        type: 'error',
+      })
+      return
+    }
+    setStep(Step.running)
+    try {
+      const startTime = Date.now()
+      const res = await createJinaReaderTask({
+        url,
+        options: crawlOptions,
+      }) as any
+
+      if (res.data) {
+        // The page content came back directly: wrap it as a one-item result.
+        const data = {
+          current: 1,
+          total: 1,
+          data: [{
+            title: res.data.title,
+            markdown: res.data.content,
+            description: res.data.description,
+            source_url: res.data.url,
+          }],
+          time_consuming: (Date.now() - startTime) / 1000,
+        }
+        setCrawlResult(data)
+        onCheckedCrawlResultChange(data.data || [])
+        setCrawlErrorMessage('')
+      }
+      else if (res.job_id) {
+        // A background job was created: report its id and poll until it finishes.
+        const jobId = res.job_id
+        onJobIdChange(jobId)
+        const { isError, data, errorMessage } = await waitForCrawlFinished(jobId)
+        if (isError) {
+          setCrawlErrorMessage(errorMessage || t(`${I18N_PREFIX}.unknownError`))
+        }
+        else {
+          setCrawlResult(data)
+          onCheckedCrawlResultChange(data.data || []) // default select the crawl result
+          setCrawlErrorMessage('')
+        }
+      }
+    }
+    catch (e) {
+      setCrawlErrorMessage(t(`${I18N_PREFIX}.unknownError`)!)
+      console.log(e)
+    }
+    finally {
+      // Whatever happened, leave the running state so the result or the error is rendered.
+      setStep(Step.finished)
+    }
+  }, [checkValid, crawlOptions, onCheckedCrawlResultChange, onJobIdChange, t, waitForCrawlFinished])
+
+  return (
+    <div>
+      <Header onSetting={handleSetting} />
+      <div className={cn('mt-2 p-4 pb-0 rounded-xl border border-gray-200')}>
+        <UrlInput onRun={handleRun} isRunning={isRunning} />
+        <OptionsWrap
+          className={cn('mt-4')}
+          controlFoldOptions={controlFoldOptions}
+        >
+          <Options className='mt-2' payload={crawlOptions} onChange={onCrawlOptionsChange} />
+        </OptionsWrap>
+
+        {!isInit && (
+          <div className='mt-3 relative left-[-16px] w-[calc(100%_+_32px)] rounded-b-xl'>
+            {isRunning
+              && <Crawling
+                className='mt-2'
+                crawledNum={crawlResult?.current || 0}
+                totalNum={crawlResult?.total || parseFloat(crawlOptions.limit as string) || 0}
+              />}
+            {showError && (
+              <ErrorMessage className='rounded-b-xl' title={t(`${I18N_PREFIX}.exceptionErrorTitle`)} errorMsg={crawlErrorMessage} />
+            )}
+            {isCrawlFinished && !showError
+              && <CrawledResult
+                className='mb-2'
+                list={crawlResult?.data || []}
+                checkedList={checkedCrawlResult}
+                onSelectedChange={onCheckedCrawlResultChange}
+                onPreview={onPreview}
+                usedTime={parseFloat(crawlResult?.time_consuming as string) || 0}
+              />
+            }
+          </div>
+        )}
+      </div>
+    </div>
+  )
+}
+export default React.memo(JinaReader)

+ 59 - 0
web/app/components/datasets/create/website/jina-reader/options.tsx

@@ -0,0 +1,59 @@
+'use client'
+import type { FC } from 'react'
+import React, { useCallback } from 'react'
+import { useTranslation } from 'react-i18next'
+import CheckboxWithLabel from '../base/checkbox-with-label'
+import Field from '../base/field'
+import cn from '@/utils/classnames'
+import type { CrawlOptions } from '@/models/datasets'
+
+const I18N_PREFIX = 'datasetCreation.stepOne.website'
+
+type Props = {
+  className?: string
+  payload: CrawlOptions
+  onChange: (payload: CrawlOptions) => void
+}
+
+/**
+ * Option form for a Jina Reader crawl: two checkboxes (crawl sub-pages, use
+ * sitemap) and a required numeric page limit. Every edit is reported through
+ * `onChange` as a full copy of `payload` with the edited key replaced.
+ */
+const Options: FC<Props> = ({ className = '', payload, onChange }) => {
+  const { t } = useTranslation()
+
+  // Curried updater: pick the option key first, get back the value handler for that field.
+  const handleChange = useCallback(
+    (key: keyof CrawlOptions) => (value: any) => {
+      onChange({ ...payload, [key]: value })
+    },
+    [payload, onChange],
+  )
+
+  // Translated labels, resolved in the order the fields are rendered.
+  const crawlSubPageLabel = t(`${I18N_PREFIX}.crawlSubPage`)
+  const useSitemapLabel = t(`${I18N_PREFIX}.useSitemap`)
+  const useSitemapTooltip = t(`${I18N_PREFIX}.useSitemapTooltip`) as string
+  const limitLabel = t(`${I18N_PREFIX}.limit`)
+
+  return (
+    <div className={cn(className, ' space-y-2')}>
+      <CheckboxWithLabel
+        label={crawlSubPageLabel}
+        isChecked={payload.crawl_sub_pages}
+        onChange={handleChange('crawl_sub_pages')}
+      />
+      <CheckboxWithLabel
+        label={useSitemapLabel}
+        isChecked={payload.use_sitemap}
+        onChange={handleChange('use_sitemap')}
+        tooltip={useSitemapTooltip}
+      />
+      <div className='flex justify-between space-x-4'>
+        <Field
+          className='grow shrink-0'
+          label={limitLabel}
+          value={payload.limit}
+          onChange={handleChange('limit')}
+          isNumber
+          isRequired
+        />
+      </div>
+    </div>
+  )
+}
+export default React.memo(Options)

+ 33 - 12
web/app/components/datasets/create/website/no-data.tsx

@@ -2,35 +2,56 @@
 import type { FC } from 'react'
 import React from 'react'
 import { useTranslation } from 'react-i18next'
+import s from './index.module.css'
 import { Icon3Dots } from '@/app/components/base/icons/src/vender/line/others'
 import Button from '@/app/components/base/button'
+import { DataSourceProvider } from '@/models/common'
 
 const I18N_PREFIX = 'datasetCreation.stepOne.website'
 
 type Props = {
   onConfig: () => void
+  provider: DataSourceProvider
 }
 
 const NoData: FC<Props> = ({
   onConfig,
+  provider,
 }) => {
   const { t } = useTranslation()
 
+  const providerConfig = {
+    [DataSourceProvider.jinaReader]: {
+      emoji: <span className={s.jinaLogo} />,
+      title: t(`${I18N_PREFIX}.jinaReaderNotConfigured`),
+      description: t(`${I18N_PREFIX}.jinaReaderNotConfiguredDescription`),
+    },
+    [DataSourceProvider.fireCrawl]: {
+      emoji: '🔥',
+      title: t(`${I18N_PREFIX}.fireCrawlNotConfigured`),
+      description: t(`${I18N_PREFIX}.fireCrawlNotConfiguredDescription`),
+    },
+  }
+
+  const currentProvider = providerConfig[provider]
+
   return (
-    <div className='max-w-[640px] p-6 rounded-2xl bg-gray-50'>
-      <div className='flex w-11 h-11 items-center justify-center bg-gray-50 rounded-xl border-[0.5px] border-gray-100 shadow-lg'>
-        🔥
-      </div>
-      <div className='my-2'>
-        <span className='text-gray-700 font-semibold'>{t(`${I18N_PREFIX}.fireCrawlNotConfigured`)}<Icon3Dots className='inline relative -top-3 -left-1.5' /></span>
-        <div className='mt-1 pb-3 text-gray-500 text-[13px] font-normal'>
-          {t(`${I18N_PREFIX}.fireCrawlNotConfiguredDescription`)}
+    <>
+      <div className='max-w-[640px] p-6 rounded-2xl bg-gray-50 mt-4'>
+        <div className='flex w-11 h-11 items-center justify-center bg-gray-50 rounded-xl border-[0.5px] border-gray-100 shadow-lg'>
+          {currentProvider.emoji}
+        </div>
+        <div className='my-2'>
+          <span className='text-gray-700 font-semibold'>{currentProvider.title}<Icon3Dots className='inline relative -top-3 -left-1.5' /></span>
+          <div className='mt-1 pb-3 text-gray-500 text-[13px] font-normal'>
+            {currentProvider.description}
+          </div>
         </div>
+        <Button variant='primary' onClick={onConfig}>
+          {t(`${I18N_PREFIX}.configure`)}
+        </Button>
       </div>
-      <Button variant='primary' onClick={onConfig}>
-        {t(`${I18N_PREFIX}.configure`)}
-      </Button>
-    </div>
+    </>
   )
 }
 export default React.memo(NoData)

+ 1 - 1
web/app/components/header/account-setting/data-source-page/data-source-website/config-firecrawl-modal.tsx

@@ -9,7 +9,7 @@ import {
 import { Lock01 } from '@/app/components/base/icons/src/vender/solid/security'
 import Button from '@/app/components/base/button'
 import type { FirecrawlConfig } from '@/models/common'
-import Field from '@/app/components/datasets/create/website/firecrawl/base/field'
+import Field from '@/app/components/datasets/create/website/base/field'
 import Toast from '@/app/components/base/toast'
 import { createDataSourceApiKeyBinding } from '@/service/datasets'
 import { LinkExternal02 } from '@/app/components/base/icons/src/vender/line/general'

+ 140 - 0
web/app/components/header/account-setting/data-source-page/data-source-website/config-jina-reader-modal.tsx

@@ -0,0 +1,140 @@
+'use client'
+import type { FC } from 'react'
+import React, { useCallback, useState } from 'react'
+import { useTranslation } from 'react-i18next'
+import {
+  PortalToFollowElem,
+  PortalToFollowElemContent,
+} from '@/app/components/base/portal-to-follow-elem'
+import { Lock01 } from '@/app/components/base/icons/src/vender/solid/security'
+import Button from '@/app/components/base/button'
+import { DataSourceProvider } from '@/models/common'
+import Field from '@/app/components/datasets/create/website/base/field'
+import Toast from '@/app/components/base/toast'
+import { createDataSourceApiKeyBinding } from '@/service/datasets'
+import { LinkExternal02 } from '@/app/components/base/icons/src/vender/line/general'
+type Props = {
+  onCancel: () => void
+  onSaved: () => void
+}
+
+const I18N_PREFIX = 'datasetCreation.jinaReader'
+
+/**
+ * Modal for binding a Jina Reader API key as a website data source.
+ *
+ * Validates that an API key was entered, posts it through
+ * `createDataSourceApiKeyBinding` with bearer auth, shows a success toast and
+ * then calls `onSaved`. `onCancel` is wired to the cancel button.
+ */
+const ConfigJinaReaderModal: FC<Props> = ({
+  onCancel,
+  onSaved,
+}) => {
+  const { t } = useTranslation()
+  const [isSaving, setIsSaving] = useState(false)
+  const [apiKey, setApiKey] = useState('')
+
+  const handleSave = useCallback(async () => {
+    // Ignore repeated clicks while a save request is still in flight.
+    if (isSaving)
+      return
+
+    // The API key is the only field, and it is required.
+    if (!apiKey) {
+      Toast.notify({
+        type: 'error',
+        message: t('common.errorMsg.fieldRequired', {
+          field: 'API Key',
+        }),
+      })
+      return
+    }
+
+    const postData = {
+      category: 'website',
+      provider: DataSourceProvider.jinaReader,
+      credentials: {
+        auth_type: 'bearer',
+        config: {
+          api_key: apiKey,
+        },
+      },
+    }
+    try {
+      setIsSaving(true)
+      await createDataSourceApiKeyBinding(postData)
+      Toast.notify({
+        type: 'success',
+        message: t('common.api.success'),
+      })
+    }
+    finally {
+      setIsSaving(false)
+    }
+
+    // Only reached when the request succeeded: a rejection propagates past the finally block.
+    onSaved()
+  }, [apiKey, onSaved, t, isSaving])
+
+  return (
+    <PortalToFollowElem open>
+      <PortalToFollowElemContent className='w-full h-full z-[60]'>
+        <div className='fixed inset-0 flex items-center justify-center bg-black/[.25]'>
+          <div className='mx-2 w-[640px] max-h-[calc(100vh-120px)] bg-white shadow-xl rounded-2xl overflow-y-auto'>
+            <div className='px-8 pt-8'>
+              <div className='flex justify-between items-center mb-4'>
+                <div className='text-xl font-semibold text-gray-900'>{t(`${I18N_PREFIX}.configJinaReader`)}</div>
+              </div>
+
+              <div className='space-y-4'>
+                <Field
+                  label='API Key'
+                  labelClassName='!text-sm'
+                  isRequired
+                  value={apiKey}
+                  onChange={(value: string | number) => setApiKey(value as string)}
+                  placeholder={t(`${I18N_PREFIX}.apiKeyPlaceholder`)!}
+                />
+              </div>
+              <div className='my-8 flex justify-between items-center h-8'>
+                {/* External link opened in a new tab: rel prevents the new page from reaching window.opener. */}
+                <a
+                  className='flex items-center space-x-1 leading-[18px] text-xs font-normal text-[#155EEF]'
+                  target='_blank' rel='noopener noreferrer'
+                  href='https://jina.ai/reader/'
+                >
+                  <span>{t(`${I18N_PREFIX}.getApiKeyLinkText`)}</span>
+                  <LinkExternal02 className='w-3 h-3' />
+                </a>
+                <div className='flex'>
+                  <Button
+                    size='large'
+                    className='mr-2'
+                    onClick={onCancel}
+                  >
+                    {t('common.operation.cancel')}
+                  </Button>
+                  <Button
+                    variant='primary'
+                    size='large'
+                    onClick={handleSave}
+                    loading={isSaving}
+                  >
+                    {t('common.operation.save')}
+                  </Button>
+                </div>
+
+              </div>
+            </div>
+            <div className='border-t-[0.5px] border-t-black/5'>
+              <div className='flex justify-center items-center py-3 bg-gray-50 text-xs text-gray-500'>
+                <Lock01 className='mr-1 w-3 h-3 text-gray-500' />
+                {t('common.modelProvider.encrypted.front')}
+                <a
+                  className='text-primary-600 mx-1'
+                  target='_blank' rel='noopener noreferrer'
+                  href='https://pycryptodome.readthedocs.io/en/latest/src/cipher/oaep.html'
+                >
+                  PKCS1_OAEP
+                </a>
+                {t('common.modelProvider.encrypted.back')}
+              </div>
+            </div>
+          </div>
+        </div>
+      </PortalToFollowElemContent>
+    </PortalToFollowElem>
+  )
+}
+export default React.memo(ConfigJinaReaderModal)

+ 35 - 16
web/app/components/header/account-setting/data-source-page/data-source-website/index.tsx

@@ -2,11 +2,12 @@
 import type { FC } from 'react'
 import React, { useCallback, useEffect, useState } from 'react'
 import { useTranslation } from 'react-i18next'
-import { useBoolean } from 'ahooks'
 import Panel from '../panel'
 import { DataSourceType } from '../panel/types'
 import ConfigFirecrawlModal from './config-firecrawl-modal'
+import ConfigJinaReaderModal from './config-jina-reader-modal'
 import cn from '@/utils/classnames'
+import s from '@/app/components/datasets/create/website/index.module.css'
 import { fetchDataSources, removeDataSourceApiKeyBinding } from '@/service/datasets'
 
 import type {
@@ -19,9 +20,11 @@ import {
 } from '@/models/common'
 import Toast from '@/app/components/base/toast'
 
-type Props = {}
+type Props = {
+  provider: DataSourceProvider
+}
 
-const DataSourceWebsite: FC<Props> = () => {
+const DataSourceWebsite: FC<Props> = ({ provider }) => {
   const { t } = useTranslation()
   const { isCurrentWorkspaceManager } = useAppContext()
   const [sources, setSources] = useState<DataSourceItem[]>([])
@@ -36,22 +39,26 @@ const DataSourceWebsite: FC<Props> = () => {
     // eslint-disable-next-line react-hooks/exhaustive-deps
   }, [])
 
-  const [isShowConfig, {
-    setTrue: showConfig,
-    setFalse: hideConfig,
-  }] = useBoolean(false)
+  const [configTarget, setConfigTarget] = useState<DataSourceProvider | null>(null)
+  const showConfig = useCallback((provider: DataSourceProvider) => {
+    setConfigTarget(provider)
+  }, [setConfigTarget])
+
+  const hideConfig = useCallback(() => {
+    setConfigTarget(null)
+  }, [setConfigTarget])
 
   const handleAdded = useCallback(() => {
     checkSetApiKey()
     hideConfig()
   }, [checkSetApiKey, hideConfig])
 
-  const getIdByProvider = (provider: string): string | undefined => {
+  const getIdByProvider = (provider: DataSourceProvider): string | undefined => {
     const source = sources.find(item => item.provider === provider)
     return source?.id
   }
 
-  const handleRemove = useCallback((provider: string) => {
+  const handleRemove = useCallback((provider: DataSourceProvider) => {
     return async () => {
       const dataSourceId = getIdByProvider(provider)
       if (dataSourceId) {
@@ -69,22 +76,34 @@ const DataSourceWebsite: FC<Props> = () => {
     <>
       <Panel
         type={DataSourceType.website}
-        isConfigured={sources.length > 0}
-        onConfigure={showConfig}
+        provider={provider}
+        isConfigured={sources.find(item => item.provider === provider) !== undefined}
+        onConfigure={() => showConfig(provider)}
         readOnly={!isCurrentWorkspaceManager}
-        configuredList={sources.map(item => ({
+        configuredList={sources.filter(item => item.provider === provider).map(item => ({
           id: item.id,
           logo: ({ className }: { className: string }) => (
-            <div className={cn(className, 'flex items-center justify-center w-5 h-5 bg-white border border-gray-100 text-xs font-medium text-gray-500 rounded ml-3')}>🔥</div>
+            item.provider === DataSourceProvider.fireCrawl
+              ? (
+                <div className={cn(className, 'flex items-center justify-center w-5 h-5 bg-white border border-gray-100 text-xs font-medium text-gray-500 rounded ml-3')}>🔥</div>
+              )
+              : (
+                <div className={cn(className, 'flex items-center justify-center w-5 h-5 bg-white border border-gray-100 text-xs font-medium text-gray-500 rounded ml-3')}>
+                  <span className={s.jinaLogo} />
+                </div>
+              )
           ),
-          name: 'Firecrawl',
+          name: item.provider === DataSourceProvider.fireCrawl ? 'Firecrawl' : 'Jina Reader',
           isActive: true,
         }))}
-        onRemove={handleRemove(DataSourceProvider.fireCrawl)}
+        onRemove={handleRemove(provider)}
       />
-      {isShowConfig && (
+      {configTarget === DataSourceProvider.fireCrawl && (
         <ConfigFirecrawlModal onSaved={handleAdded} onCancel={hideConfig} />
       )}
+      {configTarget === DataSourceProvider.jinaReader && (
+        <ConfigJinaReaderModal onSaved={handleAdded} onCancel={hideConfig} />
+      )}
     </>
 
   )

+ 3 - 1
web/app/components/header/account-setting/data-source-page/index.tsx

@@ -3,6 +3,7 @@ import { useTranslation } from 'react-i18next'
 import DataSourceNotion from './data-source-notion'
 import DataSourceWebsite from './data-source-website'
 import { fetchDataSource } from '@/service/common'
+import { DataSourceProvider } from '@/models/common'
 
 export default function DataSourcePage() {
   const { t } = useTranslation()
@@ -13,7 +14,8 @@ export default function DataSourcePage() {
     <div className='mb-8'>
       <div className='mb-2 text-sm font-medium text-gray-900'>{t('common.dataSource.add')}</div>
       <DataSourceNotion workspaces={notionWorkspaces} />
-      <DataSourceWebsite />
+      <DataSourceWebsite provider={DataSourceProvider.jinaReader} />
+      <DataSourceWebsite provider={DataSourceProvider.fireCrawl} />
     </div>
   )
 }

+ 4 - 1
web/app/components/header/account-setting/data-source-page/panel/index.tsx

@@ -8,10 +8,12 @@ import ConfigItem from './config-item'
 
 import s from './style.module.css'
 import { DataSourceType } from './types'
+import { DataSourceProvider } from '@/models/common'
 import cn from '@/utils/classnames'
 
 type Props = {
   type: DataSourceType
+  provider: DataSourceProvider
   isConfigured: boolean
   onConfigure: () => void
   readOnly: boolean
@@ -25,6 +27,7 @@ type Props = {
 
 const Panel: FC<Props> = ({
   type,
+  provider,
   isConfigured,
   onConfigure,
   readOnly,
@@ -46,7 +49,7 @@ const Panel: FC<Props> = ({
             <div className='text-sm font-medium text-gray-800'>{t(`common.dataSource.${type}.title`)}</div>
             {isWebsite && (
               <div className='ml-1 leading-[18px] px-1.5 rounded-md bg-white border border-gray-100 text-xs font-medium text-gray-700'>
-                <span className='text-gray-500'>{t('common.dataSource.website.with')}</span> 🔥 Firecrawl
+                <span className='text-gray-500'>{t('common.dataSource.website.with')}</span> { provider === DataSourceProvider.fireCrawl ? '🔥 Firecrawl' : 'Jina Reader'}
               </div>
             )}
           </div>

+ 14 - 1
web/i18n/en-US/dataset-creation.ts

@@ -16,6 +16,11 @@ const translation = {
     apiKeyPlaceholder: 'API key from firecrawl.dev',
     getApiKeyLinkText: 'Get your API key from firecrawl.dev',
   },
+  jinaReader: {
+    configJinaReader: 'Configure Jina Reader',
+    apiKeyPlaceholder: 'API key from jina.ai',
+    getApiKeyLinkText: 'Get your free API key at jina.ai',
+  },
   stepOne: {
     filePreview: 'File Preview',
     pagePreview: 'Page Preview',
@@ -56,13 +61,21 @@ const translation = {
       failed: 'Creation failed',
     },
     website: {
+      chooseProvider: 'Select a provider',
       fireCrawlNotConfigured: 'Firecrawl is not configured',
       fireCrawlNotConfiguredDescription: 'Configure Firecrawl with API key to use it.',
+      jinaReaderNotConfigured: 'Jina Reader is not configured',
+      jinaReaderNotConfiguredDescription: 'Set up Jina Reader by entering your free API key for access.',
       configure: 'Configure',
       run: 'Run',
       firecrawlTitle: 'Extract web content with 🔥Firecrawl',
       firecrawlDoc: 'Firecrawl docs',
       firecrawlDocLink: 'https://docs.dify.ai/guides/knowledge-base/sync-from-website',
+      jinaReaderTitle: 'Convert the entire site to Markdown',
+      jinaReaderDoc: 'Learn more about Jina Reader',
+      jinaReaderDocLink: 'https://jina.ai/reader',
+      useSitemap: 'Use sitemap',
+      useSitemapTooltip: 'Follow the sitemap to crawl the site. If not, Jina Reader will crawl iteratively based on page relevance, yielding fewer but higher-quality pages.',
       options: 'Options',
       crawlSubPage: 'Crawl sub-pages',
       limit: 'Limit',
@@ -70,7 +83,7 @@ const translation = {
       excludePaths: 'Exclude paths',
       includeOnlyPaths: 'Include only paths',
       extractOnlyMainContent: 'Extract only main content (no headers, navs, footers, etc.)',
-      exceptionErrorTitle: 'An exception occurred while running Firecrawl job:',
+      exceptionErrorTitle: 'An exception occurred while running crawling job:',
       unknownError: 'Unknown error',
       totalPageScraped: 'Total pages scraped:',
       selectAll: 'Select All',

+ 14 - 1
web/i18n/zh-Hans/dataset-creation.ts

@@ -16,6 +16,11 @@ const translation = {
     apiKeyPlaceholder: '从 firecrawl.dev 获取 API Key',
     getApiKeyLinkText: '从 firecrawl.dev 获取您的 API Key',
   },
+  jinaReader: {
+    configJinaReader: '配置 Jina Reader',
+    apiKeyPlaceholder: '从 jina.ai 获取 API Key',
+    getApiKeyLinkText: '从 jina.ai 获取您的免费 API Key',
+  },
   stepOne: {
     filePreview: '文件预览',
     pagePreview: '页面预览',
@@ -56,13 +61,21 @@ const translation = {
       failed: '创建失败',
     },
     website: {
+      chooseProvider: '选择工具',
       fireCrawlNotConfigured: 'Firecrawl 未配置',
       fireCrawlNotConfiguredDescription: '请配置 Firecrawl 的 API 密钥以使用它。',
+      jinaReaderNotConfigured: 'Jina Reader 未配置',
+      jinaReaderNotConfiguredDescription: '请配置 Jina Reader 的免费 API 密钥以访问它。',
       configure: '配置',
       run: '运行',
       firecrawlTitle: '使用 🔥Firecrawl 提取网页内容',
       firecrawlDoc: 'Firecrawl 文档',
       firecrawlDocLink: 'https://docs.dify.ai/v/zh-hans/guides/knowledge-base/sync-from-website',
+      jinaReaderTitle: '将整个站点内容转换为 Markdown 格式',
+      jinaReaderDoc: '了解更多关于 Jina Reader',
+      jinaReaderDocLink: 'https://jina.ai/reader',
+      useSitemap: '使用 sitemap',
+      useSitemapTooltip: '根据 sitemap 爬取站点。否则,Jina Reader 将基于页面相关性迭代爬取,抓取较少的页面,但质量更高。',
       options: '选项',
       crawlSubPage: '爬取子页面',
       limit: '限制数量',
@@ -70,7 +83,7 @@ const translation = {
       excludePaths: '排除路径',
       includeOnlyPaths: '仅包含路径',
       extractOnlyMainContent: '仅提取主要内容(无标题、导航、页脚等)',
-      exceptionErrorTitle: '运行 Firecrawl 时发生异常:',
+      exceptionErrorTitle: '运行时发生异常:',
       unknownError: '未知错误',
       totalPageScraped: '抓取页面总数:',
       selectAll: '全选',

+ 1 - 0
web/models/common.ts

@@ -177,6 +177,7 @@ export enum DataSourceCategory {
 }
 export enum DataSourceProvider {
   fireCrawl = 'firecrawl',
+  jinaReader = 'jinareader',
 }
 
 export type FirecrawlConfig = {

+ 1 - 0
web/models/datasets.ts

@@ -49,6 +49,7 @@ export type CrawlOptions = {
   excludes: string
   limit: number | string
   max_depth: number | string
+  use_sitemap: boolean
 }
 
 export type CrawlResultItem = {

+ 22 - 3
web/service/datasets.ts

@@ -23,7 +23,7 @@ import type {
   SegmentsResponse,
   createDocumentResponse,
 } from '@/models/datasets'
-import type { CommonResponse, DataSourceNotionWorkspace } from '@/models/common'
+import { type CommonResponse, type DataSourceNotionWorkspace, DataSourceProvider } from '@/models/common'
 import type {
   ApiKeysListResponse,
   CreateApiKeyResponse,
@@ -253,7 +253,7 @@ export const createFirecrawlTask: Fetcher<CommonResponse, Record<string, any>> =
   return post<CommonResponse>('website/crawl', {
     body: {
       ...body,
-      provider: 'firecrawl',
+      provider: DataSourceProvider.fireCrawl,
     },
   })
 }
@@ -261,7 +261,26 @@ export const createFirecrawlTask: Fetcher<CommonResponse, Record<string, any>> =
 export const checkFirecrawlTaskStatus: Fetcher<CommonResponse, string> = (jobId: string) => {
   return get<CommonResponse>(`website/crawl/status/${jobId}`, {
     params: {
-      provider: 'firecrawl',
+      provider: DataSourceProvider.fireCrawl,
+    },
+  }, {
+    silent: true,
+  })
+}
+
+// Creates a website crawl task: posts the caller's body to `website/crawl`
+// with `provider` set to the Jina Reader provider id.
+export const createJinaReaderTask: Fetcher<CommonResponse, Record<string, any>> = (body) => {
+  // `provider` is spread last so it always overrides any value supplied by the caller.
+  const payload = { ...body, provider: DataSourceProvider.jinaReader }
+  return post<CommonResponse>('website/crawl', { body: payload })
+}
+
+export const checkJinaReaderTaskStatus: Fetcher<CommonResponse, string> = (jobId: string) => {
+  return get<CommonResponse>(`website/crawl/status/${jobId}`, {
+    params: {
+      provider: 'jinareader',
     },
   }, {
     silent: true,