|
31 | 31 | ".doc": (UnstructuredWordDocumentLoader, {}),
|
32 | 32 | ".docx": (UnstructuredWordDocumentLoader, {}),
|
33 | 33 | ".enex": (EverNoteLoader, {}),
|
34 |
| - ".eml": (UnstructuredEmailLoader, {}), |
| 34 | + ".eml": (MyElmLoader, {}), |
35 | 35 | ".epub": (UnstructuredEPubLoader, {}),
|
36 | 36 | ".html": (UnstructuredHTMLLoader, {}),
|
37 | 37 | ".md": (UnstructuredMarkdownLoader, {}),
|
|
47 | 47 | load_dotenv()
|
48 | 48 |
|
49 | 49 |
|
class MyElmLoader(UnstructuredEmailLoader):
    """Email loader that retries as text/plain when no text/html part is found.

    The parent loader is tried first with its configured settings. If it
    fails with the specific "text/html content not found in email" error,
    the load is attempted once more using the plain-text content source.
    """

    def load(self) -> List[Document]:
        """Load the .eml file, falling back to its plain-text body.

        Returns:
            The documents produced by the parent ``load()``.

        Raises:
            ValueError: re-raised unchanged when the parent fails for any
                reason other than the missing text/html part, or when the
                plain-text retry fails as well.
        """
        try:
            # Delegate through super() so the call is bound to this instance;
            # calling UnstructuredEmailLoader.load() without ``self`` raises
            # TypeError, which the ValueError handler below never sees.
            return super().load()
        except ValueError as e:
            if 'text/html content not found in email' not in str(e):
                raise

        # Retry with the plain-text part. The override is stored on the
        # instance, so later load() calls on this loader keep using it.
        self.unstructured_kwargs["content_source"] = "text/plain"
        return super().load()
| 66 | + |
| 67 | + |
50 | 68 | def load_single_document(file_path: str) -> Document:
|
51 | 69 | ext = "." + file_path.rsplit(".", 1)[-1]
|
52 | 70 | if ext in LOADER_MAPPING:
|
|
0 commit comments