|
@@ -117,19 +117,63 @@ class WordExtractor(BaseExtractor):
|
|
|
|
|
|
return image_map
|
|
|
|
|
|
- def _table_to_markdown(self, table):
|
|
|
- markdown = ""
|
|
|
- # deal with table headers
|
|
|
+ def _table_to_markdown(self, table, image_map):
|
|
|
+ markdown = []
|
|
|
+ # calculate the total number of columns
|
|
|
+ total_cols = max(len(row.cells) for row in table.rows)
|
|
|
+
|
|
|
header_row = table.rows[0]
|
|
|
- headers = [cell.text for cell in header_row.cells]
|
|
|
- markdown += "| " + " | ".join(headers) + " |\n"
|
|
|
- markdown += "| " + " | ".join(["---"] * len(headers)) + " |\n"
|
|
|
- # deal with table rows
|
|
|
+ headers = self._parse_row(header_row, image_map, total_cols)
|
|
|
+ markdown.append("| " + " | ".join(headers) + " |")
|
|
|
+ markdown.append("| " + " | ".join(["---"] * total_cols) + " |")
|
|
|
+
|
|
|
for row in table.rows[1:]:
|
|
|
- row_cells = [cell.text for cell in row.cells]
|
|
|
- markdown += "| " + " | ".join(row_cells) + " |\n"
|
|
|
+ row_cells = self._parse_row(row, image_map, total_cols)
|
|
|
+ markdown.append("| " + " | ".join(row_cells) + " |")
|
|
|
+ return "\n".join(markdown)
|
|
|
+
|
|
|
+ def _parse_row(self, row, image_map, total_cols):
|
|
|
+ # Initialize a row, all of which are empty by default
|
|
|
+ row_cells = [""] * total_cols
|
|
|
+ col_index = 0
|
|
|
+ for cell in row.cells:
|
|
|
+ # make sure the col_index is not out of range
|
|
|
+ while col_index < total_cols and row_cells[col_index] != "":
|
|
|
+ col_index += 1
|
|
|
+ # if col_index is out of range the loop is jumped
|
|
|
+ if col_index >= total_cols:
|
|
|
+ break
|
|
|
+ cell_content = self._parse_cell(cell, image_map).strip()
|
|
|
+ cell_colspan = cell.grid_span if cell.grid_span else 1
|
|
|
+ for i in range(cell_colspan):
|
|
|
+ if col_index + i < total_cols:
|
|
|
+ row_cells[col_index + i] = cell_content if i == 0 else ""
|
|
|
+ col_index += cell_colspan
|
|
|
+ return row_cells
|
|
|
+
|
|
|
+ def _parse_cell(self, cell, image_map):
|
|
|
+ cell_content = []
|
|
|
+ for paragraph in cell.paragraphs:
|
|
|
+ parsed_paragraph = self._parse_cell_paragraph(paragraph, image_map)
|
|
|
+ if parsed_paragraph:
|
|
|
+ cell_content.append(parsed_paragraph)
|
|
|
+ unique_content = list(dict.fromkeys(cell_content))
|
|
|
+ return " ".join(unique_content)
|
|
|
+
|
|
|
+ def _parse_cell_paragraph(self, paragraph, image_map):
|
|
|
+ paragraph_content = []
|
|
|
+ for run in paragraph.runs:
|
|
|
+ if run.element.xpath('.//a:blip'):
|
|
|
+ for blip in run.element.xpath('.//a:blip'):
|
|
|
+ image_id = blip.get("{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed")
|
|
|
+ image_part = paragraph.part.rels[image_id].target_part
|
|
|
|
|
|
- return markdown
|
|
|
+ if image_part in image_map:
|
|
|
+ image_link = image_map[image_part]
|
|
|
+ paragraph_content.append(image_link)
|
|
|
+ else:
|
|
|
+ paragraph_content.append(run.text)
|
|
|
+ return "".join(paragraph_content).strip()
|
|
|
|
|
|
def _parse_paragraph(self, paragraph, image_map):
|
|
|
paragraph_content = []
|
|
@@ -183,6 +227,6 @@ class WordExtractor(BaseExtractor):
|
|
|
content.append(parsed_paragraph)
|
|
|
elif element.tag.endswith('tbl'): # table
|
|
|
table = tables.pop(0)
|
|
|
- content.append(self._table_to_markdown(table))
|
|
|
+ content.append(self._table_to_markdown(table,image_map))
|
|
|
return '\n'.join(content)
|
|
|
|