Skip to content

Commit 150bf98

Browse files
committed
Support disabling external file access
1 parent 603f8b8 commit 150bf98

7 files changed

Lines changed: 73 additions & 14 deletions

File tree

NEWS

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44

55
* Fix conversion of unmerged table cells.
66

7+
* Support disabling external file access using the external_file_access argument.
8+
79
# 1.10.0
810

911
* Add "Heading" and "Body" styles, as found in documents created by Apple Pages,

README.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,16 @@ pass `include_default_style_map=False`:
178178
result = mammoth.convert_to_html(docx_file, style_map=style_map, include_default_style_map=False)
179179
```
180180

181+
#### External file access
182+
183+
A source document may reference files stored outside of the document itself, such as linked images.
184+
To disable access to any such external files during the conversion process,
185+
pass `external_file_access=False`:
186+
187+
```python
188+
result = mammoth.convert_to_html(docx_file, external_file_access=False)
189+
```
190+
181191
#### Custom image handlers
182192

183193
By default, images are converted to `<img>` elements with the source included inline in the `src` attribute.
@@ -415,6 +425,9 @@ For instance:
415425
and embed the HTML into your website,
416426
this may allow arbitrary files on the server to be read and exfiltrated.
417427

428+
To disable access to any such external files during the conversion process,
429+
pass `external_file_access=False`.
430+
418431
### Document transforms
419432

420433
**The API for document transforms should be considered unstable,

mammoth/__init__.py

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,23 +16,36 @@ def convert_to_markdown(*args, **kwargs):
1616
return convert(*args, output_format="markdown", **kwargs)
1717

1818

19-
def convert(fileobj, transform_document=None, id_prefix=None, include_embedded_style_map=_undefined, **kwargs):
19+
def convert(
20+
fileobj,
21+
transform_document=None,
22+
id_prefix=None,
23+
include_embedded_style_map=_undefined,
24+
external_file_access=_undefined,
25+
**kwargs
26+
):
2027
if include_embedded_style_map is _undefined:
2128
include_embedded_style_map = True
29+
2230
if transform_document is None:
2331
transform_document = lambda x: x
32+
2433
if include_embedded_style_map:
2534
kwargs["embedded_style_map"] = read_style_map(fileobj)
35+
36+
if external_file_access is _undefined:
37+
external_file_access = True
38+
2639
return options.read_options(kwargs).bind(lambda convert_options:
27-
docx.read(fileobj).map(transform_document).bind(lambda document:
40+
docx.read(fileobj, external_file_access=external_file_access).map(transform_document).bind(lambda document:
2841
conversion.convert_document_element_to_html(
2942
document,
3043
id_prefix=id_prefix,
3144
**convert_options
3245
)
3346
)
3447
)
35-
48+
3649

3750
def extract_raw_text(fileobj):
3851
return docx.read(fileobj).map(extract_raw_text_from_element)

mammoth/docx/__init__.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,13 +19,14 @@
1919
_empty_result = results.success([])
2020

2121

22-
def read(fileobj):
22+
def read(fileobj, external_file_access=False):
2323
zip_file = open_zip(fileobj, "r")
2424
part_paths = _find_part_paths(zip_file)
2525
read_part_with_body = _part_with_body_reader(
2626
getattr(fileobj, "name", None),
2727
zip_file,
2828
part_paths=part_paths,
29+
external_file_access=external_file_access,
2930
)
3031

3132
return results.combine([
@@ -134,7 +135,7 @@ def _read_document(zip_file, read_part_with_body, notes, comments, part_paths):
134135
)
135136

136137

137-
def _part_with_body_reader(document_path, zip_file, part_paths):
138+
def _part_with_body_reader(document_path, zip_file, part_paths, external_file_access):
138139
content_types = _try_read_entry_or_default(
139140
zip_file,
140141
"[Content_Types].xml",
@@ -156,7 +157,10 @@ def _part_with_body_reader(document_path, zip_file, part_paths):
156157
default=Numbering.EMPTY,
157158
)
158159

159-
files = Files(None if document_path is None else os.path.dirname(document_path))
160+
files = Files(
161+
None if document_path is None else os.path.dirname(document_path),
162+
external_file_access=external_file_access,
163+
)
160164

161165
def read_part(name, reader, default=_undefined):
162166
relationships = _read_relationships(zip_file, _find_relationships_path_for(name))

mammoth/docx/files.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,16 @@
1111

1212

1313
class Files(object):
14-
def __init__(self, base):
14+
def __init__(self, base, external_file_access):
1515
self._base = base
16-
16+
self._external_file_access = external_file_access
17+
1718
def open(self, uri):
19+
if not self._external_file_access:
20+
raise ExternalFileAccessIsDisabledError(
21+
"could not open external image '{0}', external file access is disabled".format(uri)
22+
)
23+
1824
try:
1925
if _is_absolute(uri):
2026
return contextlib.closing(urlopen(uri))
@@ -34,3 +40,7 @@ def _is_absolute(url):
3440

3541
class InvalidFileReferenceError(ValueError):
3642
pass
43+
44+
45+
class ExternalFileAccessIsDisabledError(InvalidFileReferenceError):
46+
pass

tests/docx/files_tests.py

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,19 @@
1-
from mammoth.docx.files import Files, InvalidFileReferenceError
1+
from mammoth.docx.files import ExternalFileAccessIsDisabledError, Files, InvalidFileReferenceError
22
from ..testing import generate_test_path, assert_equal, assert_raises
33

44

5+
def test_when_external_file_access_is_disabled_then_opening_file_raises_error():
6+
files = Files(None, external_file_access=False)
7+
error = assert_raises(ExternalFileAccessIsDisabledError, lambda: files.open("/tmp/image.png"))
8+
expected_message = (
9+
"could not open external image '/tmp/image.png', external file access is disabled"
10+
)
11+
assert_equal(expected_message, str(error))
12+
13+
514
def test_can_open_files_with_file_uri():
615
path = generate_test_path("tiny-picture.png")
7-
files = Files(None)
16+
files = Files(None, external_file_access=True)
817
with files.open("file:///" + path) as image_file:
918
contents = image_file.read()
1019
assert_equal(bytes, type(contents))
@@ -13,7 +22,7 @@ def test_can_open_files_with_file_uri():
1322

1423

1524
def test_can_open_files_with_relative_uri():
16-
files = Files(generate_test_path(""))
25+
files = Files(generate_test_path(""), external_file_access=True)
1726
with files.open("tiny-picture.png") as image_file:
1827
contents = image_file.read()
1928
assert_equal(bytes, type(contents))
@@ -22,7 +31,7 @@ def test_can_open_files_with_relative_uri():
2231

2332

2433
def test_given_base_is_not_set_when_opening_relative_uri_then_error_is_raised():
25-
files = Files(None)
34+
files = Files(None, external_file_access=True)
2635
error = assert_raises(InvalidFileReferenceError, lambda: files.open("not-a-real-file.png"))
2736
expected_message = (
2837
"could not find external image 'not-a-real-file.png', fileobj has no name"
@@ -31,7 +40,7 @@ def test_given_base_is_not_set_when_opening_relative_uri_then_error_is_raised():
3140

3241

3342
def test_error_is_raised_if_relative_uri_cannot_be_opened():
34-
files = Files("/tmp")
43+
files = Files("/tmp", external_file_access=True)
3544
error = assert_raises(InvalidFileReferenceError, lambda: files.open("not-a-real-file.png"))
3645
expected_message = (
3746
"could not open external image: 'not-a-real-file.png' (document directory: '/tmp')\n" +
@@ -41,7 +50,7 @@ def test_error_is_raised_if_relative_uri_cannot_be_opened():
4150

4251

4352
def test_error_is_raised_if_file_uri_cannot_be_opened():
44-
files = Files("/tmp")
53+
files = Files("/tmp", external_file_access=True)
4554
error = assert_raises(InvalidFileReferenceError, lambda: files.open("file:///not-a-real-file.png"))
4655
expected_message = "could not open external image: 'file:///not-a-real-file.png' (document directory: '/tmp')\n"
4756
assert str(error).startswith(expected_message)

tests/mammoth_tests.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,14 @@ def test_warn_if_images_stored_outside_of_document_are_specified_when_passing_fi
127127
assert_equal([results.warning("could not find external image 'tiny-picture.png', fileobj has no name")], result.messages)
128128

129129

130+
def test_warn_if_images_stored_outside_of_document_are_specified_when_external_file_access_is_disabled():
131+
with open(generate_test_path("external-picture.docx"), "rb") as fileobj:
132+
result = mammoth.convert_to_html(fileobj=fileobj, external_file_access=False)
133+
134+
assert_equal("", result.value)
135+
assert_equal([results.warning("could not open external image 'tiny-picture.png', external file access is disabled")], result.messages)
136+
137+
130138
def test_warn_if_images_stored_outside_of_document_are_not_found():
131139
with tempman.create_temp_dir() as temp_dir:
132140
document_path = os.path.join(temp_dir.path, "document.docx")

0 commit comments

Comments
 (0)