Prechádzať zdrojové kódy

fix #6902 .docx handles images within tables and handles cross-column tables (#6951)

yanghx 8 mesiacov pred
rodič
commit
c53875ce8c
1 zmenil súbory, kde vykonal 55 pridanie a 11 odobranie
  1. 55 11
      api/core/rag/extractor/word_extractor.py

+ 55 - 11
api/core/rag/extractor/word_extractor.py

@@ -117,19 +117,63 @@ class WordExtractor(BaseExtractor):
 
         return image_map
 
-    def _table_to_markdown(self, table):
-        markdown = ""
-        # deal with table headers
+    def _table_to_markdown(self, table, image_map):
+        markdown = []
+        # calculate the total number of columns
+        total_cols = max(len(row.cells) for row in table.rows)
+
         header_row = table.rows[0]
-        headers = [cell.text for cell in header_row.cells]
-        markdown += "| " + " | ".join(headers) + " |\n"
-        markdown += "| " + " | ".join(["---"] * len(headers)) + " |\n"
-        # deal with table rows
+        headers = self._parse_row(header_row, image_map, total_cols)
+        markdown.append("| " + " | ".join(headers) + " |")
+        markdown.append("| " + " | ".join(["---"] * total_cols) + " |")
+
         for row in table.rows[1:]:
-            row_cells = [cell.text for cell in row.cells]
-            markdown += "| " + " | ".join(row_cells) + " |\n"
+            row_cells = self._parse_row(row, image_map, total_cols)
+            markdown.append("| " + " | ".join(row_cells) + " |")
+        return "\n".join(markdown)
+
+    def _parse_row(self, row, image_map, total_cols):
+        # Initialize a row, all of which are empty by default
+        row_cells = [""] * total_cols
+        col_index = 0
+        for cell in row.cells:
+            # make sure the col_index is not out of range
+            while col_index < total_cols and row_cells[col_index] != "":
+                col_index += 1
+            # if col_index is out of range the loop is jumped
+            if col_index >= total_cols:
+                break
+            cell_content = self._parse_cell(cell, image_map).strip()
+            cell_colspan = cell.grid_span if cell.grid_span else 1
+            for i in range(cell_colspan):
+                if col_index + i < total_cols:
+                    row_cells[col_index + i] = cell_content if i == 0 else ""
+            col_index += cell_colspan
+        return row_cells
+
+    def _parse_cell(self, cell, image_map):
+        cell_content = []
+        for paragraph in cell.paragraphs:
+            parsed_paragraph = self._parse_cell_paragraph(paragraph, image_map)
+            if parsed_paragraph:
+                cell_content.append(parsed_paragraph)
+        unique_content = list(dict.fromkeys(cell_content))
+        return " ".join(unique_content)
+
+    def _parse_cell_paragraph(self, paragraph, image_map):
+        paragraph_content = []
+        for run in paragraph.runs:
+            if run.element.xpath('.//a:blip'):
+                for blip in run.element.xpath('.//a:blip'):
+                    image_id = blip.get("{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed")
+                    image_part = paragraph.part.rels[image_id].target_part
 
-        return markdown
+                    if image_part in image_map:
+                        image_link = image_map[image_part]
+                        paragraph_content.append(image_link)
+            else:
+                paragraph_content.append(run.text)
+        return "".join(paragraph_content).strip()
 
     def _parse_paragraph(self, paragraph, image_map):
         paragraph_content = []
@@ -183,6 +227,6 @@ class WordExtractor(BaseExtractor):
                     content.append(parsed_paragraph)
             elif element.tag.endswith('tbl'):  # table
                 table = tables.pop(0)
-                content.append(self._table_to_markdown(table))
+                content.append(self._table_to_markdown(table,image_map))
         return '\n'.join(content)