Made it possible to pass multiple files to pandoc (#259)

Nicklas Tegner · web-flow · commit 02f94233d3d3 · 2022-03-22T06:12:54.000+01:00
diff --git a/README.md b/README.md
@@ -139,6 +139,23 @@ output = pypandoc.convert_file('somefile.md', 'docx', outputfile="somefile.docx"
 assert output == ""
 ```
 
+
+It's also possible to specify multiple input files to pandoc, either as absolute paths, relative paths or file patterns.
+
+```python
+import pypandoc
+
+# convert all markdown files in a chapters/ subdirectory.
+pypandoc.convert_file('chapters/*.md', 'docx', outputfile="somefile.docx")
+
+# convert all markdown files in the book1 and book2 directories.
+pypandoc.convert_file(['book1/*.md', 'book2/*.md'], 'docx', outputfile="somefile.docx")
+
+# convert the front matter from another drive, and all markdown files in the book2 directory.
+pypandoc.convert_file(['D:/book_front.md', 'book2/*.md'], 'docx', outputfile="somefile.docx")
+```
+
+
 In addition to `format`, it is possible to pass `extra_args`.
 That makes it possible to access various pandoc options easily.
 
diff --git a/pypandoc/__init__.py b/pypandoc/__init__.py
@@ -10,7 +10,7 @@
 import sys
 import tempfile
 import textwrap
-from turtle import TurtleScreenBase
+import glob
 
 from .handler import _check_log_handler
 from .pandoc_download import DEFAULT_TARGET_FOLDER, download_pandoc
@@ -68,12 +68,12 @@ def convert_text(source:str, to:str, format:str, extra_args:Iterable=(), encodin
                           cworkdir=cworkdir)
 
 
-def convert_file(source_file:str, to:str, format:Union[str, None]=None, extra_args:Iterable=(), encoding:str='utf-8',
+def convert_file(source_file:Union[list, str], to:str, format:Union[str, None]=None, extra_args:Iterable=(), encoding:str='utf-8',
                  outputfile:Union[None, str]=None, filters:Union[Iterable, None]=None, verify_format:bool=True,
                  sandbox:bool=True, cworkdir:Union[str, None]=None) -> str:
     """Converts given `source` from `format` to `to`.
 
-    :param str source_file: file path (see encoding)
+    :param (str, list) source_file: Either a full file path, a relative file path, a file pattern (like dir/*.md), or a list of files or file patterns.
 
     :param str to: format into which the input should be converted; can be one of
             `pypandoc.get_pandoc_formats()[1]`
@@ -107,14 +107,39 @@ def convert_file(source_file:str, to:str, format:Union[str, None]=None, extra_ar
     """
     if not _identify_path(source_file):
         raise RuntimeError("source_file is not a valid path")
-    format = _identify_format_from_path(source_file, format)
-    return _convert_input(source_file, format, 'path', to, extra_args=extra_args,
+    if _is_network_path(source_file): # if the source_file is an url
+        format = _identify_format_from_path(source_file, format)
+        return _convert_input(source_file, format, 'path', to, extra_args=extra_args,
                           outputfile=outputfile, filters=filters,
                           verify_format=verify_format, sandbox=sandbox,
                           cworkdir=cworkdir)
 
+    discovered_source_files = []
+    if isinstance(source_file, str):
+        discovered_source_files += glob.glob(source_file)
+    if isinstance(source_file, list): # a list of possibly file or file patterns. Expand all with glob
+        for filepath in source_file:
+            discovered_source_files.extend(glob.glob(filepath))
+    if len(discovered_source_files) == 1: # behavior for a single file or a pattern
+        format = _identify_format_from_path(discovered_source_files[0], format)
+        return _convert_input(discovered_source_files[0], format, 'path', to, extra_args=extra_args,
+                          outputfile=outputfile, filters=filters,
+                          verify_format=verify_format, sandbox=sandbox,
+                          cworkdir=cworkdir)
+    else: # behavior for multiple  files or file patterns
+        format = _identify_format_from_path(discovered_source_files[0], format)
+        return _convert_input(discovered_source_files, format, 'path', to, extra_args=extra_args,
+                          outputfile=outputfile, filters=filters,
+                          verify_format=verify_format, sandbox=sandbox,
+                          cworkdir=cworkdir)
 
-def _identify_path(source:str) -> bool:
+
+def _identify_path(source) -> bool:
+    if isinstance(source, list):
+        for single_source in source:
+            if not _identify_path(single_source):
+                return False
+        return True
     is_path = False
     try:
         is_path = os.path.exists(source)
@@ -124,6 +149,15 @@ def _identify_path(source:str) -> bool:
         # still false
         pass
 
+    if not is_path:
+        try:
+            is_path = len(glob.glob(source)) >= 1
+        except UnicodeEncodeError:
+            is_path = len(glob.glob(source.encode('utf-8'))) >= 1
+        except:  # noqa
+            # still false
+            pass
+    
     if not is_path:
         try:
             # check if it's an URL
@@ -140,6 +174,21 @@ def _identify_path(source:str) -> bool:
 
     return is_path
 
+def _is_network_path(source):
+    try:
+        # check if it's a URL
+        result = urlparse(source)
+        if result.scheme in ["http", "https"]:
+            return True
+        elif result.scheme and result.netloc and result.path:
+            # complete uri including one with a network path
+            return True
+        elif result.scheme == "file" and result.path:
+            return os.path.exists(url2path(source))
+    except AttributeError:
+        pass
+    return False
+
 
 def _identify_format_from_path(sourcefile:str, format:str) -> str:
     return format or os.path.splitext(sourcefile)[1].strip('.')
@@ -242,7 +291,13 @@ def _convert_input(source, format, input_type, to, extra_args=(),
         to = normalize_format(to)
 
     string_input = input_type == 'string'
-    input_file = [source] if not string_input else []
+    if not string_input:
+        if isinstance(source, str):
+            input_file = [source]
+        else:
+            input_file = source
+    else:
+        input_file = []
     args = [__pandoc_path, '--from=' + format]
 
     args.append('--to=' + to)
@@ -294,11 +349,12 @@ def _convert_input(source, format, input_type, to, extra_args=(),
                                                                            p.stderr.read())
         )
 
-    try:
-        source = cast_bytes(source, encoding='utf-8')
-    except (UnicodeDecodeError, UnicodeEncodeError):
-        # assume that it is already a utf-8 encoded string
-        pass
+    if string_input:
+        try:
+            source = cast_bytes(source, encoding='utf-8')
+        except (UnicodeDecodeError, UnicodeEncodeError):
+            # assume that it is already a utf-8 encoded string
+            pass
     try:
         stdout, stderr = p.communicate(source if string_input else None)
     except OSError:
diff --git a/tests.py b/tests.py
@@ -193,6 +193,25 @@ def test_basic_conversion_from_file(self):
             received = pypandoc.convert_file(file_name, 'rst')
             self.assertEqualExceptForNewlineEnd(expected, received)
 
+    def test_basic_conversion_from_multiple_files(self):
+        with closed_tempfile('.md', text='some title') as file_name1:
+            with closed_tempfile('.md', text='some title') as file_name2:
+                expected = '<p>some title</p>\n<p>some title</p>'
+                received = pypandoc.convert_file([file_name1,file_name2], 'html')
+                self.assertEqualExceptForNewlineEnd(expected, received)
+
+    def test_basic_conversion_from_file_pattern(self):
+        received = pypandoc.convert_file("./*.md", 'html')
+        received = received.lower()
+        assert "making a release" in received
+        assert "pypandoc provides a thin wrapper" in received
+
+    def test_basic_conversion_from_file_pattern_with_input_list(self):
+        received = pypandoc.convert_file(["./*.md", "./*.md"], 'html')
+        received = received.lower()
+        assert "making a release" in received
+        assert "pypandoc provides a thin wrapper" in received
+
     @unittest.skipIf(sys.platform.startswith("win"), "File based urls do not work on windows: "
                                                      "https://github.com/jgm/pandoc/issues/4613")
     def test_basic_conversion_from_file_url(self):