Просмотр исходного кода

fix: document truncation and loss in notion document sync (#5631)

Co-authored-by: Aurelius Huang <cm.huang@aftership.com>
Aurelius Huang 9 месяцев назад
Родитель
Коммит
f546db5437
1 изменённый файл: 15 добавлено и 16 удалено
  1. 15 16
      api/core/rag/extractor/notion_extractor.py

+ 15 - 16
api/core/rag/extractor/notion_extractor.py

@@ -140,11 +140,10 @@ class NotionExtractor(BaseExtractor):
 
     def _get_notion_block_data(self, page_id: str) -> list[str]:
         result_lines_arr = []
-        cur_block_id = page_id
+        start_cursor = None
+        block_url = BLOCK_CHILD_URL_TMPL.format(block_id=page_id)
         while True:
-            block_url = BLOCK_CHILD_URL_TMPL.format(block_id=cur_block_id)
-            query_dict: dict[str, Any] = {}
-
+            query_dict: dict[str, Any] = {} if not start_cursor else {'start_cursor': start_cursor}
             res = requests.request(
                 "GET",
                 block_url,
@@ -153,7 +152,7 @@ class NotionExtractor(BaseExtractor):
                     "Content-Type": "application/json",
                     "Notion-Version": "2022-06-28",
                 },
-                json=query_dict
+                params=query_dict
             )
             data = res.json()
             for result in data["results"]:
@@ -191,16 +190,16 @@ class NotionExtractor(BaseExtractor):
             if data["next_cursor"] is None:
                 break
             else:
-                cur_block_id = data["next_cursor"]
+                start_cursor = data["next_cursor"]
         return result_lines_arr
 
     def _read_block(self, block_id: str, num_tabs: int = 0) -> str:
         """Read a block."""
         result_lines_arr = []
-        cur_block_id = block_id
+        start_cursor = None
+        block_url = BLOCK_CHILD_URL_TMPL.format(block_id=block_id)
         while True:
-            block_url = BLOCK_CHILD_URL_TMPL.format(block_id=cur_block_id)
-            query_dict: dict[str, Any] = {}
+            query_dict: dict[str, Any] = {} if not start_cursor else {'start_cursor': start_cursor}
 
             res = requests.request(
                 "GET",
@@ -210,7 +209,7 @@ class NotionExtractor(BaseExtractor):
                     "Content-Type": "application/json",
                     "Notion-Version": "2022-06-28",
                 },
-                json=query_dict
+                params=query_dict
             )
             data = res.json()
             if 'results' not in data or data["results"] is None:
@@ -249,7 +248,7 @@ class NotionExtractor(BaseExtractor):
             if data["next_cursor"] is None:
                 break
             else:
-                cur_block_id = data["next_cursor"]
+                start_cursor = data["next_cursor"]
 
         result_lines = "\n".join(result_lines_arr)
         return result_lines
@@ -258,10 +257,10 @@ class NotionExtractor(BaseExtractor):
         """Read table rows."""
         done = False
         result_lines_arr = []
-        cur_block_id = block_id
+        start_cursor = None
+        block_url = BLOCK_CHILD_URL_TMPL.format(block_id=block_id)
         while not done:
-            block_url = BLOCK_CHILD_URL_TMPL.format(block_id=cur_block_id)
-            query_dict: dict[str, Any] = {}
+            query_dict: dict[str, Any] = {} if not start_cursor else {'start_cursor': start_cursor}
 
             res = requests.request(
                 "GET",
@@ -271,7 +270,7 @@ class NotionExtractor(BaseExtractor):
                     "Content-Type": "application/json",
                     "Notion-Version": "2022-06-28",
                 },
-                json=query_dict
+                params=query_dict
             )
             data = res.json()
             # get table headers text
@@ -300,7 +299,7 @@ class NotionExtractor(BaseExtractor):
                 done = True
                 break
             else:
-                cur_block_id = data["next_cursor"]
+                start_cursor = data["next_cursor"]
 
         result_lines = "\n".join(result_lines_arr)
         return result_lines