diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py index f342a614b..19d75c4b5 100644 --- a/packages/markitdown/src/markitdown/_markitdown.py +++ b/packages/markitdown/src/markitdown/_markitdown.py @@ -39,6 +39,8 @@ EpubConverter, DocumentIntelligenceConverter, CsvConverter, + ConfluenceConverter, + ConfluenceStorageConverter, ) from ._base_converter import DocumentConverter, DocumentConverterResult @@ -188,7 +190,9 @@ def enable_builtins(self, **kwargs) -> None: HtmlConverter(), priority=PRIORITY_GENERIC_FILE_FORMAT ) self.register_converter(RssConverter()) + self.register_converter(ConfluenceStorageConverter()) self.register_converter(WikipediaConverter()) + self.register_converter(ConfluenceConverter()) self.register_converter(YouTubeConverter()) self.register_converter(BingSerpConverter()) self.register_converter(DocxConverter()) diff --git a/packages/markitdown/src/markitdown/converters/__init__.py b/packages/markitdown/src/markitdown/converters/__init__.py index e4437a582..56bd5d5cb 100644 --- a/packages/markitdown/src/markitdown/converters/__init__.py +++ b/packages/markitdown/src/markitdown/converters/__init__.py @@ -23,6 +23,7 @@ ) from ._epub_converter import EpubConverter from ._csv_converter import CsvConverter +from ._confluence_converter import ConfluenceConverter, ConfluenceStorageConverter __all__ = [ "PlainTextConverter", @@ -45,4 +46,6 @@ "DocumentIntelligenceFileType", "EpubConverter", "CsvConverter", + "ConfluenceConverter", + "ConfluenceStorageConverter", ] diff --git a/packages/markitdown/src/markitdown/converters/_confluence_converter.py b/packages/markitdown/src/markitdown/converters/_confluence_converter.py new file mode 100644 index 000000000..b7832105e --- /dev/null +++ b/packages/markitdown/src/markitdown/converters/_confluence_converter.py @@ -0,0 +1,425 @@ +import re +import warnings +from typing import Any, BinaryIO + +import bs4 +from bs4 import BeautifulSoup, XMLParsedAsHTMLWarning + +from .._base_converter import DocumentConverter, DocumentConverterResult +from .._stream_info import StreamInfo +from ._markdownify import _CustomMarkdownify + +# URL patterns for Confluence Cloud and Server +_CONFLUENCE_CLOUD_RE = re.compile( + r"^https?://[^/]+\.atlassian\.net/wiki/", re.IGNORECASE +) +_CONFLUENCE_SERVER_RE = re.compile( + r"^https?://[^/]+/wiki/spaces/", re.IGNORECASE +) + +ACCEPTED_MIME_TYPE_PREFIXES = [ + "text/html", + "application/xhtml", +] + +ACCEPTED_FILE_EXTENSIONS = [ + ".html", + ".htm", +] + +# Noise elements to remove from the rendered Confluence page +_CONFLUENCE_NOISE_SELECTORS = [ + "#navigation", + "#breadcrumbs", + "#header", + "#footer", + ".confluence-navigation", + ".page-metadata", + ".page-metadata-modification-info", + ".page-metadata-secondary", + "#likes-and-labels-container", + "#children-section", + "#comments-section", + ".wiki-content .plugin_pagetree", + ".confluence-information-macro-icon", + "#sidebar", + ".ia-fixed-sidebar", + ".ia-splitter-left", +] + +# Confluence macro names that are purely navigational (no content value) +_MACRO_SKIP = { + "toc", "recently-updated", "children", "pagetree", "space-index", "anchor", +} + +# Macro names that are informational panels → blockquote with a label +_MACRO_PANEL = { + "info": "Info", + "note": "Note", + "warning": "Warning", + "tip": "Tip", + "panel": None, + "excerpt-include": None, +} + + +class ConfluenceConverter(DocumentConverter): + """Convert live Confluence Cloud / Server pages to Markdown. + + Triggered when the URL matches a known Confluence pattern. The converter + strips navigation chrome and focuses on the main content area, similar to + how WikipediaConverter handles Wikipedia pages. + """ + + def accepts( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, + ) -> bool: + url = stream_info.url or "" + mimetype = (stream_info.mimetype or "").lower() + extension = (stream_info.extension or "").lower() + + if not ( + _CONFLUENCE_CLOUD_RE.search(url) or _CONFLUENCE_SERVER_RE.search(url) + ): + return False + + if extension in ACCEPTED_FILE_EXTENSIONS: + return True + + for prefix in ACCEPTED_MIME_TYPE_PREFIXES: + if mimetype.startswith(prefix): + return True + + return False + + def convert( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, + ) -> DocumentConverterResult: + encoding = "utf-8" if stream_info.charset is None else stream_info.charset + soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding) + + # Remove scripts, styles, and Confluence UI chrome + for tag in soup(["script", "style"]): + tag.extract() + for selector in _CONFLUENCE_NOISE_SELECTORS: + for el in soup.select(selector): + el.extract() + + # Try to find the main content container + content = ( + soup.find("div", {"id": "main-content"}) # Confluence Cloud + or soup.find("div", {"id": "content"}) # Confluence Server + or soup.find("div", {"class": "wiki-content"}) # fallback + or soup.find("body") + or soup + ) + + # Extract title + title: str | None = None + title_el = soup.find("h1", {"id": "title-text"}) or soup.find( + "title" + ) + if title_el and isinstance(title_el, bs4.Tag): + title = title_el.get_text(strip=True) + + assert isinstance(content, bs4.PageElement) + markdown = _CustomMarkdownify(**kwargs).convert_soup(content).strip() + + return DocumentConverterResult(markdown=markdown, title=title) + + +class ConfluenceStorageConverter(DocumentConverter): + """Convert Confluence Storage Format XML files to Markdown. + + Confluence exports pages in an XHTML-based storage format that uses + ``ac:`` and ``ri:`` namespaced tags for macros and resource identifiers. + This converter pre-processes those tags into plain HTML before running + the standard markdownify pass. + """ + + def accepts( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, + ) -> bool: + extension = (stream_info.extension or "").lower() + mimetype = (stream_info.mimetype or "").lower() + + is_xml = extension == ".xml" or any( + mimetype.startswith(p) + for p in ("text/xml", "application/xml", "application/xhtml+xml") + ) + if not is_xml: + return False + + # Peek into the stream to confirm Confluence namespaces are present + cur_pos = file_stream.tell() + try: + sample = file_stream.read(2048) + if isinstance(sample, bytes): + sample = sample.decode("utf-8", errors="replace") + return " DocumentConverterResult: + encoding = "utf-8" if stream_info.charset is None else stream_info.charset + raw = file_stream.read() + if isinstance(raw, bytes): + raw = raw.decode(encoding, errors="replace") + + # Strip the XML processing instruction so it doesn't bleed into output + # when falling back to html.parser (lxml-xml handles it natively). + raw = re.sub(r"<\?xml[^?]*\?>", "", raw, count=1).lstrip() + + # Use lxml-xml parser when available for proper namespace handling; + # fall back to html.parser which is always present (suppress the + # XMLParsedAsHTMLWarning that bs4 emits in that case). + try: + soup = BeautifulSoup(raw, "lxml-xml") + except Exception: + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning) + soup = BeautifulSoup(raw, "html.parser") + + # Extract page title from the root element if present + title: str | None = None + title_el = soup.find("title") or soup.find("page") + if title_el and isinstance(title_el, bs4.Tag): + t = title_el.get("title") or title_el.find("title") + if isinstance(t, str): + title = t + elif isinstance(t, bs4.Tag): + title = t.get_text(strip=True) + + self._transform_macros(soup) + + markdown = _CustomMarkdownify(**kwargs).convert_soup(soup).strip() + return DocumentConverterResult(markdown=markdown, title=title) + + # ------------------------------------------------------------------ + # Internal helpers + # ------------------------------------------------------------------ + + def _transform_macros(self, soup: BeautifulSoup) -> None: + """Mutate *soup* in-place, translating Confluence-specific tags.""" + self._transform_structured_macros(soup) + self._transform_links(soup) + self._transform_images(soup) + self._transform_emoticons(soup) + self._transform_tasks(soup) + + def _transform_structured_macros(self, soup: BeautifulSoup) -> None: + """Translate ```` tags to plain HTML equivalents.""" + for macro in soup.find_all( + re.compile(r"^ac:structured-macro$", re.IGNORECASE) + ): + if not isinstance(macro, bs4.Tag): + continue + name = (macro.get("ac:name") or "").lower() + + if name in _MACRO_SKIP: + macro.decompose() + continue + + if name == "code": + self._transform_code_macro(macro) + elif name == "excerpt": + # Unwrap — keep the body content + body = macro.find(re.compile(r"^ac:rich-text-body$", re.IGNORECASE)) + if body and isinstance(body, bs4.Tag): + macro.replace_with(body) + else: + macro.decompose() + elif name in _MACRO_PANEL: + self._transform_panel_macro(macro, name) + elif name == "status": + self._transform_status_macro(macro) + elif name == "jira": + self._transform_jira_macro(macro) + else: + # Unknown macro: keep the rich-text-body content if present, + # otherwise discard the entire macro tag. + body = macro.find(re.compile(r"^ac:rich-text-body$", re.IGNORECASE)) + if body and isinstance(body, bs4.Tag): + macro.replace_with(body) + else: + macro.decompose() + + def _transform_code_macro(self, macro: bs4.Tag) -> None: + """Replace a ``code`` macro with a fenced ``
`` block."""
+        language = ""
+        for param in macro.find_all(
+            re.compile(r"^ac:parameter$", re.IGNORECASE)
+        ):
+            if isinstance(param, bs4.Tag) and (
+                param.get("ac:name") or ""
+            ).lower() == "language":
+                language = param.get_text(strip=True)
+                break
+
+        body = macro.find(
+            re.compile(r"^ac:plain-text-body$", re.IGNORECASE)
+        ) or macro.find(re.compile(r"^ac:rich-text-body$", re.IGNORECASE))
+
+        code_text = body.get_text() if body and isinstance(body, bs4.Tag) else ""
+
+        lang_attr = f' class="language-{language}"' if language else ""
+        replacement = BeautifulSoup(
+            f"
{code_text}
", "html.parser" + ) + macro.replace_with(replacement) + + def _transform_panel_macro(self, macro: bs4.Tag, name: str) -> None: + """Replace info/note/warning/tip/panel macros with a blockquote.""" + label = _MACRO_PANEL.get(name) + + body = macro.find(re.compile(r"^ac:rich-text-body$", re.IGNORECASE)) + if not body or not isinstance(body, bs4.Tag): + macro.decompose() + return + + # Build a simple
wrapper + inner_html = str(body.decode_contents()) + if label: + prefix = f"

{label}:

" + else: + prefix = "" + replacement = BeautifulSoup( + f"
{prefix}{inner_html}
", "html.parser" + ) + macro.replace_with(replacement) + + def _transform_status_macro(self, macro: bs4.Tag) -> None: + """Replace a ``status`` macro with a bold badge, e.g. **[DONE]**.""" + title = "" + for param in macro.find_all(re.compile(r"^ac:parameter$", re.IGNORECASE)): + if isinstance(param, bs4.Tag) and ( + param.get("ac:name") or "" + ).lower() == "title": + title = param.get_text(strip=True) + break + if title: + replacement = BeautifulSoup( + f"[{title}]", "html.parser" + ) + macro.replace_with(replacement) + else: + macro.decompose() + + def _transform_jira_macro(self, macro: bs4.Tag) -> None: + """Replace a ``jira`` macro with the issue key as plain text.""" + key = "" + for param in macro.find_all(re.compile(r"^ac:parameter$", re.IGNORECASE)): + if isinstance(param, bs4.Tag) and ( + param.get("ac:name") or "" + ).lower() == "key": + key = param.get_text(strip=True) + break + if key: + macro.replace_with(key) + else: + macro.decompose() + + def _transform_links(self, soup: BeautifulSoup) -> None: + """Translate ```` tags to plain text or anchor tags.""" + for link in soup.find_all(re.compile(r"^ac:link$", re.IGNORECASE)): + if not isinstance(link, bs4.Tag): + continue + + # Try to get a human-readable label from the link body + link_body = link.find(re.compile(r"^ac:link-body$", re.IGNORECASE)) + if link_body and isinstance(link_body, bs4.Tag): + link.replace_with(link_body.get_text()) + continue + + # Try to get the page title from ri:page + ri_page = link.find(re.compile(r"^ri:page$", re.IGNORECASE)) + if ri_page and isinstance(ri_page, bs4.Tag): + page_title = ri_page.get("ri:content-title") or ri_page.get( + "ri:space-key", "" + ) + link.replace_with(str(page_title)) + continue + + link.decompose() + + def _transform_images(self, soup: BeautifulSoup) -> None: + """Translate ```` tags to standard ```` tags.""" + for img in soup.find_all(re.compile(r"^ac:image$", re.IGNORECASE)): + if not isinstance(img, bs4.Tag): + continue + + # Prefer ri:url, then ri:attachment filename + ri_url = img.find(re.compile(r"^ri:url$", re.IGNORECASE)) + ri_att = img.find(re.compile(r"^ri:attachment$", re.IGNORECASE)) + + src = "" + alt = "" + if ri_url and isinstance(ri_url, bs4.Tag): + src = ri_url.get("ri:value") or "" + alt = src + elif ri_att and isinstance(ri_att, bs4.Tag): + src = ri_att.get("ri:filename") or "" + alt = src + + replacement = BeautifulSoup( + f'{alt}', "html.parser" + ) + img.replace_with(replacement) + + def _transform_emoticons(self, soup: BeautifulSoup) -> None: + """Remove ```` tags (no Markdown equivalent).""" + for el in soup.find_all(re.compile(r"^ac:emoticon$", re.IGNORECASE)): + el.decompose() + + def _transform_tasks(self, soup: BeautifulSoup) -> None: + """Translate Confluence task lists to GitHub-flavoured checkboxes.""" + for task_list in soup.find_all( + re.compile(r"^ac:task-list$", re.IGNORECASE) + ): + if not isinstance(task_list, bs4.Tag): + continue + ul = BeautifulSoup("
    ", "html.parser").find("ul") + assert ul is not None and isinstance(ul, bs4.Tag) + for task in task_list.find_all( + re.compile(r"^ac:task$", re.IGNORECASE) + ): + if not isinstance(task, bs4.Tag): + continue + status_el = task.find( + re.compile(r"^ac:task-status$", re.IGNORECASE) + ) + body_el = task.find( + re.compile(r"^ac:task-body$", re.IGNORECASE) + ) + status = ( + status_el.get_text(strip=True).lower() + if status_el and isinstance(status_el, bs4.Tag) + else "incomplete" + ) + body_html = ( + str(body_el.decode_contents()) + if body_el and isinstance(body_el, bs4.Tag) + else "" + ) + checked = "checked" if status == "complete" else "" + li = BeautifulSoup( + f"
  • {body_html}
  • ", + "html.parser", + ).find("li") + if li: + ul.append(li) + task_list.replace_with(ul) diff --git a/packages/markitdown/tests/test_confluence.py b/packages/markitdown/tests/test_confluence.py new file mode 100644 index 000000000..03cb70a64 --- /dev/null +++ b/packages/markitdown/tests/test_confluence.py @@ -0,0 +1,362 @@ +#!/usr/bin/env python3 -m pytest +"""Tests for the Confluence converters (URL-based and Storage Format XML).""" + +import io +import os + +from markitdown import MarkItDown, StreamInfo +from markitdown.converters import ConfluenceConverter, ConfluenceStorageConverter + +TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files") + +# --------------------------------------------------------------------------- +# Expected strings for the HTML fixture (Confluence Cloud page) +# --------------------------------------------------------------------------- +CLOUD_MUST_CONTAIN = [ + "Design Decisions", # title / h1 + "## Overview", # h2 preserved + "## Goals", + "## Decision Log", + "**architecture**", # bold preserved + "Simplicity", # list item + "Reliability", + "Extensibility", + "Use Python", # table content + "Use REST API", + "Accepted", +] + +CLOUD_MUST_NOT_CONTAIN = [ + "Header chrome", + "Footer chrome", + "Sidebar content", + "12 likes", + "A comment that should not appear.", + "Home", # breadcrumb/nav link text +] + +# --------------------------------------------------------------------------- +# Expected strings for the XML fixture (Confluence Storage Format) +# --------------------------------------------------------------------------- +STORAGE_MUST_CONTAIN = [ + "API Guidelines", # page title / h1 + "def get_user", # code block body + "user_id", + "Info:", # info panel label + "/v1/", # info panel body + "Note:", # note panel label + "Rate limiting", + "Warning:", # warning panel label + "Never expose internal IDs", + "Authentication Docs", # ac:link resolved to page title + "architecture-diagram.png", # ac:image resolved to filename + "Write API spec", # completed task + "Add pagination support", # incomplete task + "[x]", # checked checkbox + "[ ]", # unchecked checkbox +] + +STORAGE_MUST_NOT_CONTAIN = [ + "ac:structured-macro", # raw Confluence XML must not leak through + "ac:task-list", + "ac:emoticon", + "ri:attachment", + # TOC macro should be fully removed — no heading containing "toc" + ">toc<", +] + + +# --------------------------------------------------------------------------- +# Helper +# --------------------------------------------------------------------------- + +def _get_text(result) -> str: + return result.text_content.replace("\\", "") + + +# --------------------------------------------------------------------------- +# ConfluenceConverter (URL / HTML) +# --------------------------------------------------------------------------- + +class TestConfluenceConverter: + + def test_accepts_cloud_url_with_html_mimetype(self): + conv = ConfluenceConverter() + si = StreamInfo( + url="https://mycompany.atlassian.net/wiki/spaces/ENG/pages/1/Title", + mimetype="text/html", + ) + assert conv.accepts(io.BytesIO(b""), si) + + def test_accepts_server_url_with_html_mimetype(self): + conv = ConfluenceConverter() + si = StreamInfo( + url="https://confluence.example.com/wiki/spaces/PROJ/display/Page", + mimetype="text/html", + ) + assert conv.accepts(io.BytesIO(b""), si) + + def test_accepts_cloud_url_with_html_extension(self): + conv = ConfluenceConverter() + si = StreamInfo( + url="https://mycompany.atlassian.net/wiki/spaces/ENG/pages/1/Title", + extension=".html", + ) + assert conv.accepts(io.BytesIO(b""), si) + + def test_rejects_non_confluence_url(self): + conv = ConfluenceConverter() + for url in [ + "https://en.wikipedia.org/wiki/Python", + "https://www.google.com", + "https://mycompany.atlassian.net/jira/browse/TICKET-1", # Jira, not Confluence + ]: + si = StreamInfo(url=url, mimetype="text/html") + assert not conv.accepts(io.BytesIO(b""), si), f"should not accept: {url}" + + def test_rejects_confluence_url_without_html_content(self): + conv = ConfluenceConverter() + si = StreamInfo( + url="https://mycompany.atlassian.net/wiki/spaces/ENG/pages/1/file.pdf", + mimetype="application/pdf", + ) + assert not conv.accepts(io.BytesIO(b""), si) + + def test_convert_strips_noise_and_preserves_content(self): + conv = ConfluenceConverter() + fixture = os.path.join(TEST_FILES_DIR, "test_confluence_cloud.html") + si = StreamInfo( + url="https://mycompany.atlassian.net/wiki/spaces/ENG/pages/1/Design", + mimetype="text/html", + extension=".html", + ) + with open(fixture, "rb") as f: + result = conv.convert(f, si) + + text = _get_text(result) + for s in CLOUD_MUST_CONTAIN: + assert s in text, f"expected {s!r} in output" + for s in CLOUD_MUST_NOT_CONTAIN: + assert s not in text, f"expected {s!r} to be stripped from output" + + def test_convert_extracts_title(self): + conv = ConfluenceConverter() + fixture = os.path.join(TEST_FILES_DIR, "test_confluence_cloud.html") + si = StreamInfo( + url="https://mycompany.atlassian.net/wiki/spaces/ENG/pages/1/Design", + mimetype="text/html", + extension=".html", + ) + with open(fixture, "rb") as f: + result = conv.convert(f, si) + assert result.title is not None + assert "Design Decisions" in result.title + + def test_markitdown_routes_confluence_cloud_url(self): + """MarkItDown should pick ConfluenceConverter over HtmlConverter for Confluence URLs.""" + markitdown = MarkItDown() + fixture = os.path.join(TEST_FILES_DIR, "test_confluence_cloud.html") + si = StreamInfo( + url="https://mycompany.atlassian.net/wiki/spaces/ENG/pages/1/Design", + mimetype="text/html", + extension=".html", + ) + with open(fixture, "rb") as f: + result = markitdown.convert_stream(f, stream_info=si) + + text = _get_text(result) + for s in CLOUD_MUST_CONTAIN: + assert s in text, f"expected {s!r} in output" + for s in CLOUD_MUST_NOT_CONTAIN: + assert s not in text, f"expected {s!r} to be stripped" + + +# --------------------------------------------------------------------------- +# ConfluenceStorageConverter (XML / Storage Format) +# --------------------------------------------------------------------------- + +class TestConfluenceStorageConverter: + + def test_accepts_xml_with_ac_namespace(self): + conv = ConfluenceStorageConverter() + si = StreamInfo(extension=".xml", mimetype="text/xml") + xml = b"" + assert conv.accepts(io.BytesIO(xml), si) + + def test_accepts_xml_with_ri_namespace(self): + conv = ConfluenceStorageConverter() + si = StreamInfo(extension=".xml", mimetype="text/xml") + xml = b"" + assert conv.accepts(io.BytesIO(xml), si) + + def test_rejects_plain_xml(self): + conv = ConfluenceStorageConverter() + si = StreamInfo(extension=".xml", mimetype="text/xml") + xml = b"no confluence tags here" + assert not conv.accepts(io.BytesIO(xml), si) + + def test_rejects_non_xml_extension(self): + conv = ConfluenceStorageConverter() + si = StreamInfo(extension=".html", mimetype="text/html") + xml = b"" + assert not conv.accepts(io.BytesIO(xml), si) + + def test_accepts_does_not_advance_stream(self): + """Stream position must be reset after accepts() peeks.""" + conv = ConfluenceStorageConverter() + si = StreamInfo(extension=".xml", mimetype="text/xml") + xml = b"" + stream = io.BytesIO(xml) + conv.accepts(stream, si) + assert stream.tell() == 0, "accepts() must reset stream position" + + def test_code_macro_becomes_fenced_block(self): + conv = ConfluenceStorageConverter() + si = StreamInfo(extension=".xml", mimetype="text/xml") + xml = b""" + + + python + print("hello") + +""" + result = conv.convert(io.BytesIO(xml), si) + assert 'print("hello")' in result.markdown + assert "ac:structured-macro" not in result.markdown + + def test_code_macro_no_language(self): + conv = ConfluenceStorageConverter() + si = StreamInfo(extension=".xml", mimetype="text/xml") + xml = b""" + + + SELECT * FROM users; + +""" + result = conv.convert(io.BytesIO(xml), si) + assert "SELECT * FROM users;" in result.markdown + + def test_panel_macros_become_blockquotes(self): + conv = ConfluenceStorageConverter() + si = StreamInfo(extension=".xml", mimetype="text/xml") + for macro_name, expected_label in [ + ("info", "Info:"), + ("note", "Note:"), + ("warning", "Warning:"), + ("tip", "Tip:"), + ]: + xml = f""" + + +

    Panel content {macro_name}

    +
    +
    """.encode() + result = conv.convert(io.BytesIO(xml), si) + assert expected_label in result.markdown, f"label missing for {macro_name}" + assert f"Panel content {macro_name}" in result.markdown + + def test_toc_macro_is_removed(self): + conv = ConfluenceStorageConverter() + si = StreamInfo(extension=".xml", mimetype="text/xml") + xml = b""" + +

    Before TOC

    + +

    After TOC

    +
    """ + result = conv.convert(io.BytesIO(xml), si) + assert "Before TOC" in result.markdown + assert "After TOC" in result.markdown + assert "ac:structured-macro" not in result.markdown + + def test_ac_link_resolves_to_page_title(self): + conv = ConfluenceStorageConverter() + si = StreamInfo(extension=".xml", mimetype="text/xml") + xml = b""" + +

    See for details.

    +
    """ + result = conv.convert(io.BytesIO(xml), si) + assert "My Other Page" in result.markdown + assert "ac:link" not in result.markdown + + def test_ac_image_attachment_becomes_img(self): + conv = ConfluenceStorageConverter() + si = StreamInfo(extension=".xml", mimetype="text/xml") + xml = b""" + + +""" + result = conv.convert(io.BytesIO(xml), si) + assert "diagram.png" in result.markdown + assert "ac:image" not in result.markdown + + def test_emoticons_are_removed(self): + conv = ConfluenceStorageConverter() + si = StreamInfo(extension=".xml", mimetype="text/xml") + xml = b""" + +

    Status: Done

    +
    """ + result = conv.convert(io.BytesIO(xml), si) + assert "Done" in result.markdown + assert "ac:emoticon" not in result.markdown + + def test_task_list_becomes_checkboxes(self): + conv = ConfluenceStorageConverter() + si = StreamInfo(extension=".xml", mimetype="text/xml") + xml = b""" + + + + complete + Finished task + + + incomplete + Pending task + + +""" + result = conv.convert(io.BytesIO(xml), si) + assert "[x]" in result.markdown + assert "[ ]" in result.markdown + assert "Finished task" in result.markdown + assert "Pending task" in result.markdown + assert "ac:task-list" not in result.markdown + + def test_full_fixture_content_and_noise(self): + """End-to-end test against the full XML fixture file.""" + conv = ConfluenceStorageConverter() + fixture = os.path.join(TEST_FILES_DIR, "test_confluence_storage.xml") + si = StreamInfo(extension=".xml", mimetype="text/xml") + with open(fixture, "rb") as f: + result = conv.convert(f, si) + + text = _get_text(result) + for s in STORAGE_MUST_CONTAIN: + assert s in text, f"expected {s!r} in output" + for s in STORAGE_MUST_NOT_CONTAIN: + assert s not in text, f"expected {s!r} to be absent from output" + + def test_markitdown_routes_xml_to_storage_converter(self): + """MarkItDown should route .xml with ac: tags to ConfluenceStorageConverter.""" + markitdown = MarkItDown() + fixture = os.path.join(TEST_FILES_DIR, "test_confluence_storage.xml") + result = markitdown.convert(fixture) + + text = _get_text(result) + for s in STORAGE_MUST_CONTAIN: + assert s in text, f"expected {s!r} in output" + + def test_plain_xml_not_routed_to_storage_converter(self): + """A plain XML file without ac:/ri: tags must not be handled by ConfluenceStorageConverter.""" + conv = ConfluenceStorageConverter() + si = StreamInfo(extension=".xml", mimetype="text/xml") + plain = b"RSS item" + assert not conv.accepts(io.BytesIO(plain), si) + + +if __name__ == "__main__": + import pytest + pytest.main([__file__, "-v"]) diff --git a/packages/markitdown/tests/test_files/test_confluence_cloud.html b/packages/markitdown/tests/test_files/test_confluence_cloud.html new file mode 100644 index 000000000..130372118 --- /dev/null +++ b/packages/markitdown/tests/test_files/test_confluence_cloud.html @@ -0,0 +1,42 @@ + + + + Design Decisions - Confluence + + + + + + +
    Sidebar content
    +
    12 likes
    +
    +
    A comment that should not appear.
    +
    + + +
    +

    Design Decisions

    +
    +

    Overview

    +

    This page documents key architecture decisions.

    +

    Goals

    +
      +
    • Simplicity
    • +
    • Reliability
    • +
    • Extensibility
    • +
    +

    Decision Log

    + + + + +
    DecisionStatus
    Use PythonAccepted
    Use REST APIAccepted
    +
    +
    + + + + diff --git a/packages/markitdown/tests/test_files/test_confluence_storage.xml b/packages/markitdown/tests/test_files/test_confluence_storage.xml new file mode 100644 index 000000000..15f897cb9 --- /dev/null +++ b/packages/markitdown/tests/test_files/test_confluence_storage.xml @@ -0,0 +1,62 @@ + + + API Guidelines + +

    API Guidelines

    +

    Follow these guidelines when building internal APIs.

    + + + + python + def get_user(user_id: int) -> dict: + return db.query(User).filter_by(id=user_id).first() + + + + + +

    Always version your API endpoints with /v1/ prefix.

    +
    +
    + + + + +

    Rate limiting is required on all public endpoints.

    +
    +
    + + + + +

    Never expose internal IDs in public responses.

    +
    +
    + + + + + +

    See also:

    + + + + + + + +

    Status: Done

    + + + + + complete + Write API spec + + + incomplete + Add pagination support + + + +