Skip to content

Commit 02f9423

Browse files
author
Nicklas Tegner
authored
Made it possible to pass multiple files to pandoc (#259)
1 parent 2cfcc1e commit 02f9423

File tree

3 files changed

+104
-12
lines changed

3 files changed

+104
-12
lines changed

README.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,23 @@ output = pypandoc.convert_file('somefile.md', 'docx', outputfile="somefile.docx"
139139
assert output == ""
140140
```
141141

142+
143+
It's also possible to specify multiple input files to pandoc, either as absolute paths, relative paths or file patterns.
144+
145+
```python
146+
import pypandoc
147+
148+
# convert all markdown files in a chapters/ subdirectory.
149+
pypandoc.convert_file('chapters/*.md', 'docx', outputfile="somefile.docx")
150+
151+
# convert all markdown files in the book1 and book2 directories.
152+
pypandoc.convert_file(['book1/*.md', 'book2/*.md'], 'docx', outputfile="somefile.docx")
153+
154+
# convert the book front from another drive, and all markdown files in the book2 directory.
155+
pypandoc.convert_file(['D:/book_front.md', 'book2/*.md'], 'docx', outputfile="somefile.docx")
156+
```
157+
158+
142159
In addition to `format`, it is possible to pass `extra_args`.
143160
That makes it possible to access various pandoc options easily.
144161

pypandoc/__init__.py

Lines changed: 68 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
import sys
1111
import tempfile
1212
import textwrap
13-
from turtle import TurtleScreenBase
13+
import glob
1414

1515
from .handler import _check_log_handler
1616
from .pandoc_download import DEFAULT_TARGET_FOLDER, download_pandoc
@@ -68,12 +68,12 @@ def convert_text(source:str, to:str, format:str, extra_args:Iterable=(), encodin
6868
cworkdir=cworkdir)
6969

7070

71-
def convert_file(source_file:str, to:str, format:Union[str, None]=None, extra_args:Iterable=(), encoding:str='utf-8',
71+
def convert_file(source_file:Union[list, str], to:str, format:Union[str, None]=None, extra_args:Iterable=(), encoding:str='utf-8',
7272
outputfile:Union[None, str]=None, filters:Union[Iterable, None]=None, verify_format:bool=True,
7373
sandbox:bool=True, cworkdir:Union[str, None]=None) -> str:
7474
"""Converts given `source` from `format` to `to`.
7575
76-
:param str source_file: file path (see encoding)
76+
:param (str, list) source_file: Either a full file path, a relative file path, a file pattern (like dir/*.md), or a list of files or file patterns.
7777
7878
:param str to: format into which the input should be converted; can be one of
7979
`pypandoc.get_pandoc_formats()[1]`
@@ -107,14 +107,39 @@ def convert_file(source_file:str, to:str, format:Union[str, None]=None, extra_ar
107107
"""
108108
if not _identify_path(source_file):
109109
raise RuntimeError("source_file is not a valid path")
110-
format = _identify_format_from_path(source_file, format)
111-
return _convert_input(source_file, format, 'path', to, extra_args=extra_args,
110+
if _is_network_path(source_file): # if the source_file is an url
111+
format = _identify_format_from_path(source_file, format)
112+
return _convert_input(source_file, format, 'path', to, extra_args=extra_args,
112113
outputfile=outputfile, filters=filters,
113114
verify_format=verify_format, sandbox=sandbox,
114115
cworkdir=cworkdir)
115116

117+
discovered_source_files = []
118+
if isinstance(source_file, str):
119+
discovered_source_files += glob.glob(source_file)
120+
if isinstance(source_file, list): # a list of possibly file or file patterns. Expand all with glob
121+
for filepath in source_file:
122+
discovered_source_files.extend(glob.glob(filepath))
123+
if len(discovered_source_files) == 1: # behavior for a single file or a pattern
124+
format = _identify_format_from_path(discovered_source_files[0], format)
125+
return _convert_input(discovered_source_files[0], format, 'path', to, extra_args=extra_args,
126+
outputfile=outputfile, filters=filters,
127+
verify_format=verify_format, sandbox=sandbox,
128+
cworkdir=cworkdir)
129+
else: # behavior for multiple files or file patterns
130+
format = _identify_format_from_path(discovered_source_files[0], format)
131+
return _convert_input(discovered_source_files, format, 'path', to, extra_args=extra_args,
132+
outputfile=outputfile, filters=filters,
133+
verify_format=verify_format, sandbox=sandbox,
134+
cworkdir=cworkdir)
116135

117-
def _identify_path(source:str) -> bool:
136+
137+
def _identify_path(source) -> bool:
138+
if isinstance(source, list):
139+
for single_source in source:
140+
if not _identify_path(single_source):
141+
return False
142+
return True
118143
is_path = False
119144
try:
120145
is_path = os.path.exists(source)
@@ -124,6 +149,15 @@ def _identify_path(source:str) -> bool:
124149
# still false
125150
pass
126151

152+
if not is_path:
153+
try:
154+
is_path = len(glob.glob(source)) >= 1
155+
except UnicodeEncodeError:
156+
is_path = len(glob.glob(source.encode('utf-8'))) >= 1
157+
except: # noqa
158+
# still false
159+
pass
160+
127161
if not is_path:
128162
try:
129163
# check if it's an URL
@@ -140,6 +174,21 @@ def _identify_path(source:str) -> bool:
140174

141175
return is_path
142176

177+
def _is_network_path(source):
178+
try:
179+
# check if it's an URL
180+
result = urlparse(source)
181+
if result.scheme in ["http", "https"]:
182+
return True
183+
elif result.scheme and result.netloc and result.path:
184+
# complete uri including one with a network path
185+
return True
186+
elif result.scheme == "file" and result.path:
187+
return os.path.exists(url2path(source))
188+
except AttributeError:
189+
pass
190+
return False
191+
143192

144193
def _identify_format_from_path(sourcefile:str, format:str) -> str:
145194
return format or os.path.splitext(sourcefile)[1].strip('.')
@@ -242,7 +291,13 @@ def _convert_input(source, format, input_type, to, extra_args=(),
242291
to = normalize_format(to)
243292

244293
string_input = input_type == 'string'
245-
input_file = [source] if not string_input else []
294+
if not string_input:
295+
if isinstance(source, str):
296+
input_file = [source]
297+
else:
298+
input_file = source
299+
else:
300+
input_file = []
246301
args = [__pandoc_path, '--from=' + format]
247302

248303
args.append('--to=' + to)
@@ -294,11 +349,12 @@ def _convert_input(source, format, input_type, to, extra_args=(),
294349
p.stderr.read())
295350
)
296351

297-
try:
298-
source = cast_bytes(source, encoding='utf-8')
299-
except (UnicodeDecodeError, UnicodeEncodeError):
300-
# assume that it is already a utf-8 encoded string
301-
pass
352+
if string_input:
353+
try:
354+
source = cast_bytes(source, encoding='utf-8')
355+
except (UnicodeDecodeError, UnicodeEncodeError):
356+
# assume that it is already a utf-8 encoded string
357+
pass
302358
try:
303359
stdout, stderr = p.communicate(source if string_input else None)
304360
except OSError:

tests.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,25 @@ def test_basic_conversion_from_file(self):
193193
received = pypandoc.convert_file(file_name, 'rst')
194194
self.assertEqualExceptForNewlineEnd(expected, received)
195195

196+
def test_basic_conversion_from_multiple_files(self):
    """A list of two markdown files is converted into one combined HTML result."""
    with closed_tempfile('.md', text='some title') as first_md, \
            closed_tempfile('.md', text='some title') as second_md:
        # Each input file contributes one paragraph to the output.
        html = pypandoc.convert_file([first_md, second_md], 'html')
        self.assertEqualExceptForNewlineEnd('<p>some title</p>\n<p>some title</p>', html)
202+
203+
def test_basic_conversion_from_file_pattern(self):
    """A single file pattern as ``source_file`` converts the matching markdown files."""
    # NOTE(review): "./*.md" is resolved against the current working directory,
    # so this presumably expects to run where the markdown files containing
    # the phrases below live - confirm against the test runner setup.
    received = pypandoc.convert_file("./*.md", 'html').lower()
    # unittest assertions are not stripped under ``python -O`` and report
    # the missing text on failure, unlike a bare ``assert``.
    self.assertIn("making a release", received)
    self.assertIn("pypandoc provides a thin wrapper", received)
208+
209+
def test_basic_conversion_from_file_pattern_with_input_list(self):
    """A list of file patterns as ``source_file`` converts the matching markdown files."""
    # NOTE(review): "./*.md" is resolved against the current working directory,
    # so this presumably expects to run where the markdown files containing
    # the phrases below live - confirm against the test runner setup.
    received = pypandoc.convert_file(["./*.md", "./*.md"], 'html').lower()
    # unittest assertions are not stripped under ``python -O`` and report
    # the missing text on failure, unlike a bare ``assert``.
    self.assertIn("making a release", received)
    self.assertIn("pypandoc provides a thin wrapper", received)
214+
196215
@unittest.skipIf(sys.platform.startswith("win"), "File based urls do not work on windows: "
197216
"https://github.com/jgm/pandoc/issues/4613")
198217
def test_basic_conversion_from_file_url(self):

0 commit comments

Comments
 (0)