123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111 |
- """Markdown parser.
- Contains parser for md files.
- """
- import re
- from pathlib import Path
- from typing import Any, Dict, List, Optional, Tuple, Union, cast
- from llama_index.readers.file.base_parser import BaseParser
class MarkdownParser(BaseParser):
    """Markdown parser.

    Extract text from markdown files.
    The document is split into ``(header, text)`` tuples, where ``text`` is
    the content found between one header and the next.
    """

    def __init__(
        self,
        *args: Any,
        remove_hyperlinks: bool = True,
        remove_images: bool = True,
        **kwargs: Any,
    ) -> None:
        """Init params.

        Args:
            *args: Positional arguments forwarded to the base class.
            remove_hyperlinks: When true, ``parse_tups`` replaces
                ``[text](url)`` links with just ``text``.
            remove_images: When true, ``parse_tups`` strips ``![[...]]``
                image embeds.
            **kwargs: Keyword arguments forwarded to the base class.
        """
        super().__init__(*args, **kwargs)
        self._remove_hyperlinks = remove_hyperlinks
        self._remove_images = remove_images

    def markdown_to_tups(self, markdown_text: str) -> List[Tuple[Optional[str], str]]:
        """Split markdown text into ``(header, text)`` tuples.

        A line starting with one or more ``#`` followed by whitespace opens a
        new section, unless it sits inside a fenced code block (where ``#`` is
        usually a comment, not a heading).

        Args:
            markdown_text: Raw markdown content.

        Returns:
            One tuple per section, in document order. ``header`` is the
            heading text without its ``#`` markers, or ``None`` for text that
            precedes the first heading (or for a document with no headings).
            ``text`` is the section body with ``<...>`` tags removed; every
            source line is terminated by ``"\\n"``.
        """
        header_re = re.compile(r"^#+\s")
        fence_re = re.compile(r"^ {0,3}(`{3,}|~{3,})")

        sections: List[Tuple[Optional[str], str]] = []
        current_header: Optional[str] = None
        current_lines: List[str] = []
        # Marker (e.g. "```") of the fenced code block we are inside, if any.
        open_fence: Optional[str] = None

        for line in markdown_text.split("\n"):
            fence = fence_re.match(line)
            if fence:
                marker = fence.group(1)
                rest = line[fence.end():]
                if open_fence is None:
                    # A backtick fence may not carry backticks in its info
                    # string; that form is inline code such as ```x```.
                    if not (marker[0] == "`" and "`" in rest):
                        open_fence = marker
                elif (
                    marker[0] == open_fence[0]
                    and len(marker) >= len(open_fence)
                    and not rest.strip()
                ):
                    open_fence = None

            if open_fence is None and header_re.match(line):
                text = "".join(f"{item}\n" for item in current_lines)
                # Keep non-blank text that precedes the first header instead
                # of silently discarding it.
                if current_header is not None or text.strip():
                    sections.append((current_header, text))
                current_header = line
                current_lines = []
            else:
                current_lines.append(line)

        sections.append(
            (current_header, "".join(f"{item}\n" for item in current_lines))
        )

        # Strip only the heading markers (leading run and optional closing
        # run), so a "#" that belongs to the title, as in "C#", survives.
        return [
            (
                None
                if header is None
                else re.sub(r"^#+\s+|\s+#+\s*$", "", header).strip(),
                re.sub(r"<.*?>", "", text),
            )
            for header, text in sections
        ]

    def remove_images(self, content: str) -> str:
        """Remove ``![[...]]`` image embeds from the content.

        The match is non-greedy so that several embeds on one line are removed
        individually and the text between them is preserved.
        """
        pattern = r"!\[\[(.*?)\]\]"
        return re.sub(pattern, "", content)

    def remove_hyperlinks(self, content: str) -> str:
        """Replace ``[text](url)`` hyperlinks with their link text."""
        pattern = r"\[(.*?)\]\((.*?)\)"
        return re.sub(pattern, r"\1", content)

    def _init_parser(self) -> Dict:
        """Initialize the parser with the config."""
        return {}

    def parse_tups(
        self, filepath: Path, errors: str = "ignore"
    ) -> List[Tuple[Optional[str], str]]:
        """Parse file into tuples.

        Args:
            filepath: Path of the markdown file to read (decoded as UTF-8).
            errors: Decoding error handler passed to ``open``.

        Returns:
            The ``(header, text)`` tuples produced by ``markdown_to_tups``
            after the optional hyperlink and image removal.
        """
        # Forward ``errors`` so undecodable bytes follow the requested policy
        # rather than always raising under the default strict handler.
        with open(filepath, "r", encoding="utf-8", errors=errors) as f:
            content = f.read()
        if self._remove_hyperlinks:
            content = self.remove_hyperlinks(content)
        if self._remove_images:
            content = self.remove_images(content)
        return self.markdown_to_tups(content)

    def parse_file(
        self, filepath: Path, errors: str = "ignore"
    ) -> Union[str, List[str]]:
        """Parse file into a list of strings, one per section.

        Args:
            filepath: Path of the markdown file to read.
            errors: Decoding error handler, forwarded to ``parse_tups``.

        Returns:
            One string per section. Sections with a header are rendered as
            ``"\\n\\n{header}\\n{text}"``; header-less text is returned as is.
        """
        tups = self.parse_tups(filepath, errors=errors)
        results: List[str] = []
        # TODO: don't include headers right now
        # NOTE(review): headers ARE currently included in the output below;
        # confirm which behavior the TODO intends.
        for header, value in tups:
            if header is None:
                results.append(value)
            else:
                results.append(f"\n\n{header}\n{value}")
        return results
|