瀏覽代碼

extract docx filter comment element (#7092)

Jyong 8 月之前
父節點
當前提交
12095f8cd6
共有 1 個文件被更改,包括 10 次插入和 9 次删除
  1. 10 9
      api/core/rag/extractor/word_extractor.py

+ 10 - 9
api/core/rag/extractor/word_extractor.py

@@ -228,7 +228,7 @@ class WordExtractor(BaseExtractor):
         def parse_paragraph(paragraph):
             paragraph_content = []
             for run in paragraph.runs:
-                if run.element.tag.endswith('r'):
+                if hasattr(run.element, 'tag') and isinstance(run.element.tag, str) and run.element.tag.endswith('r'):
                     drawing_elements = run.element.findall(
                         './/{http://schemas.openxmlformats.org/wordprocessingml/2006/main}drawing')
                     for drawing in drawing_elements:
@@ -248,13 +248,14 @@ class WordExtractor(BaseExtractor):
         paragraphs = doc.paragraphs.copy()
         tables = doc.tables.copy()
         for element in doc.element.body:
-            if element.tag.endswith('p'):  # paragraph
-                para = paragraphs.pop(0)
-                parsed_paragraph = parse_paragraph(para)
-                if parsed_paragraph:
-                    content.append(parsed_paragraph)
-            elif element.tag.endswith('tbl'):  # table
-                table = tables.pop(0)
-                content.append(self._table_to_markdown(table,image_map))
+            if hasattr(element, 'tag'):
+                if isinstance(element.tag, str) and element.tag.endswith('p'):  # paragraph
+                    para = paragraphs.pop(0)
+                    parsed_paragraph = parse_paragraph(para)
+                    if parsed_paragraph:
+                        content.append(parsed_paragraph)
+                elif isinstance(element.tag, str) and element.tag.endswith('tbl'):  # table
+                    table = tables.pop(0)
+                    content.append(self._table_to_markdown(table,image_map))
         return '\n'.join(content)