Skip to content

convert_url renamed to convert_uri, and now handles data and file URIs #1153

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Mar 25, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 75 additions & 15 deletions packages/markitdown/src/markitdown/_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import codecs

from ._stream_info import StreamInfo
from ._uri_utils import parse_data_uri, file_uri_to_path

from .converters import (
PlainTextConverter,
Expand Down Expand Up @@ -242,9 +243,10 @@ def convert(
# Local path or url
if isinstance(source, str):
if (
source.startswith("http://")
or source.startswith("https://")
or source.startswith("file://")
source.startswith("http:")
or source.startswith("https:")
or source.startswith("file:")
or source.startswith("data:")
):
# Rename the url argument to mock_url
# (Deprecated -- use stream_info)
Expand All @@ -253,7 +255,7 @@ def convert(
_kwargs["mock_url"] = _kwargs["url"]
del _kwargs["url"]

return self.convert_url(source, stream_info=stream_info, **_kwargs)
return self.convert_uri(source, stream_info=stream_info, **_kwargs)
else:
return self.convert_local(source, stream_info=stream_info, **kwargs)
# Path object
Expand Down Expand Up @@ -363,23 +365,81 @@ def convert_url(
url: str,
*,
stream_info: Optional[StreamInfo] = None,
file_extension: Optional[str] = None, # Deprecated -- use stream_info
mock_url: Optional[
str
] = None, # Mock the request as if it came from a different URL
file_extension: Optional[str] = None,
mock_url: Optional[str] = None,
**kwargs: Any,
) -> DocumentConverterResult: # TODO: fix kwargs type
# Send a HTTP request to the URL
response = self._requests_session.get(url, stream=True)
response.raise_for_status()
return self.convert_response(
response,
) -> DocumentConverterResult:
"""Alias for convert_uri()"""
# convert_url will likely be deprecated in the future in favor of convert_uri
return self.convert_uri(
url,
stream_info=stream_info,
file_extension=file_extension,
url=mock_url,
mock_url=mock_url,
**kwargs,
)

def convert_uri(
    self,
    uri: str,
    *,
    stream_info: Optional[StreamInfo] = None,
    file_extension: Optional[str] = None,  # Deprecated -- use stream_info
    mock_url: Optional[str] = None,  # Mock the request as if it came from a different URL
    **kwargs: Any,
) -> DocumentConverterResult:
    """Convert the resource identified by a file:, data:, http: or https: URI.

    Surrounding whitespace is stripped from ``uri`` before the scheme is
    inspected. Dispatch is by (case-sensitive) scheme prefix:

    * ``file:``  -- resolved to a local path and passed to ``convert_local``.
      Only an empty host or ``localhost`` is accepted.
    * ``data:``  -- decoded in memory and passed to ``convert_stream``. The
      MIME type and ``charset`` attribute found in the URI seed a
      ``StreamInfo`` that is then combined with ``stream_info`` (when given)
      through ``copy_and_update``.
    * ``http:`` / ``https:`` -- fetched with the instance's requests session
      and passed to ``convert_response``.

    ``file_extension`` and ``mock_url`` (forwarded as ``url``) are passed
    through unchanged to whichever converter entry point is used, together
    with any extra ``kwargs``.

    Raises:
        ValueError: if a file URI names a host other than ``localhost``, or
            if the URI scheme is not one of the supported ones.
    """
    uri = uri.strip()

    # File URIs
    if uri.startswith("file:"):
        host, local_path = file_uri_to_path(uri)
        # A non-empty host other than localhost is rejected outright
        if host and host != "localhost":
            raise ValueError(
                f"Unsupported file URI: {uri}. Netloc must be empty or localhost."
            )
        return self.convert_local(
            local_path,
            stream_info=stream_info,
            file_extension=file_extension,
            url=mock_url,
            **kwargs,
        )

    # Data URIs
    if uri.startswith("data:"):
        mimetype, attributes, payload = parse_data_uri(uri)

        # Start from what the URI itself declares, then layer on the
        # caller-supplied hints (if any).
        guessed_info = StreamInfo(
            mimetype=mimetype,
            charset=attributes.get("charset"),
        )
        if stream_info is not None:
            guessed_info = guessed_info.copy_and_update(stream_info)

        return self.convert_stream(
            io.BytesIO(payload),
            stream_info=guessed_info,
            file_extension=file_extension,
            url=mock_url,
            **kwargs,
        )

    # HTTP/HTTPS URIs
    if uri.startswith(("http:", "https:")):
        response = self._requests_session.get(uri, stream=True)
        response.raise_for_status()
        return self.convert_response(
            response,
            stream_info=stream_info,
            file_extension=file_extension,
            url=mock_url,
            **kwargs,
        )

    # Anything else is not a scheme this method knows how to fetch
    raise ValueError(
        f"Unsupported URI scheme: {uri.split(':')[0]}. Supported schemes are: file:, data:, http:, https:"
    )

def convert_response(
self,
response: requests.Response,
Expand Down
52 changes: 52 additions & 0 deletions packages/markitdown/src/markitdown/_uri_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
import base64
import os
from typing import Tuple, Dict
from urllib.request import url2pathname
from urllib.parse import urlparse, unquote_to_bytes


def file_uri_to_path(file_uri: str) -> Tuple[str | None, str]:
"""Convert a file URI to a local file path"""
parsed = urlparse(file_uri)
if parsed.scheme != "file":
raise ValueError(f"Not a file URL: {file_uri}")

netloc = parsed.netloc if parsed.netloc else None
path = os.path.abspath(url2pathname(parsed.path))
return netloc, path


def parse_data_uri(uri: str) -> Tuple[str | None, Dict[str, str], bytes]:
if not uri.startswith("data:"):
raise ValueError("Not a data URI")

header, _, data = uri.partition(",")
if not _:
raise ValueError("Malformed data URI, missing ',' separator")

meta = header[5:] # Strip 'data:'
parts = meta.split(";")

is_base64 = False
# Ends with base64?
if parts[-1] == "base64":
parts.pop()
is_base64 = True

mime_type = None # Normally this would default to text/plain but we won't assume
if len(parts) and len(parts[0]) > 0:
# First part is the mime type
mime_type = parts.pop(0)

attributes: Dict[str, str] = {}
for part in parts:
# Handle key=value pairs in the middle
if "=" in part:
key, value = part.split("=", 1)
attributes[key] = value
elif len(part) > 0:
attributes[part] = ""

content = base64.b64decode(data) if is_base64 else unquote_to_bytes(data)

return mime_type, attributes, content
77 changes: 77 additions & 0 deletions packages/markitdown/tests/test_module_misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
import openai
import pytest

from markitdown._uri_utils import parse_data_uri, file_uri_to_path

from markitdown import (
MarkItDown,
UnsupportedFormatException,
Expand Down Expand Up @@ -176,6 +178,79 @@ def test_stream_info_operations() -> None:
assert updated_stream_info.url == "url.1"


def test_data_uris() -> None:
    """Check parse_data_uri against base64 and percent-encoded data URIs.

    Every case encodes the same payload; only the header (MIME type,
    attributes, base64 marker) varies.
    """
    expected_payload = b"Hello, World!"

    # (data URI, expected MIME type, expected attributes)
    cases = [
        # base64 payloads
        ("data:text/plain;base64,SGVsbG8sIFdvcmxkIQ==", "text/plain", {}),
        ("data:base64,SGVsbG8sIFdvcmxkIQ==", None, {}),
        (
            "data:text/plain;charset=utf-8;base64,SGVsbG8sIFdvcmxkIQ==",
            "text/plain",
            {"charset": "utf-8"},
        ),
        # percent-encoded payloads
        ("data:,Hello%2C%20World%21", None, {}),
        ("data:text/plain,Hello%2C%20World%21", "text/plain", {}),
        (
            "data:text/plain;charset=utf-8,Hello%2C%20World%21",
            "text/plain",
            {"charset": "utf-8"},
        ),
    ]

    for data_uri, expected_mime, expected_attributes in cases:
        mime_type, attributes, data = parse_data_uri(data_uri)
        assert mime_type == expected_mime
        assert attributes == expected_attributes
        assert data == expected_payload


def test_file_uris() -> None:
    """Check file_uri_to_path for the supported host/query/fragment forms.

    All cases point at the same path; only the reported netloc differs.
    """
    expected_path = "/path/to/file.txt"

    # (file URI, expected netloc)
    cases = [
        ("file:///path/to/file.txt", None),  # empty host
        ("file:/path/to/file.txt", None),  # no host
        ("file://localhost/path/to/file.txt", "localhost"),  # localhost
        ("file:///path/to/file.txt?param=value", None),  # query parameters
        ("file:///path/to/file.txt#fragment", None),  # fragment
    ]

    for file_uri, expected_netloc in cases:
        netloc, path = file_uri_to_path(file_uri)
        assert netloc == expected_netloc
        assert path == expected_path


def test_docx_comments() -> None:
markitdown = MarkItDown()

Expand Down Expand Up @@ -314,6 +389,8 @@ def test_markitdown_llm() -> None:
"""Runs this file's tests from the command line."""
for test in [
test_stream_info_operations,
test_data_uris,
test_file_uris,
test_docx_comments,
test_input_as_strings,
test_markitdown_remote,
Expand Down
54 changes: 47 additions & 7 deletions packages/markitdown/tests/test_module_vectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@
import time
import pytest
import codecs
import base64

from pathlib import Path

if __name__ == "__main__":
from _test_vectors import GENERAL_TEST_VECTORS, DATA_URI_TEST_VECTORS
Expand Down Expand Up @@ -108,8 +110,8 @@ def test_convert_stream_without_hints(test_vector):
reason="do not run tests that query external urls",
)
@pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS)
def test_convert_url(test_vector):
"""Test the conversion of a stream with no stream info."""
def test_convert_http_uri(test_vector):
"""Test the conversion of an HTTP:// or HTTPS:// URI."""
markitdown = MarkItDown()

time.sleep(1) # Ensure we don't hit rate limits
Expand All @@ -124,8 +126,44 @@ def test_convert_url(test_vector):
assert string not in result.markdown


@pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS)
def test_convert_file_uri(test_vector):
    """Convert each test file via its file:// URI and check the markdown."""
    converter = MarkItDown()

    # Build the file URI from the test file's location on disk
    source_path = Path(os.path.join(TEST_FILES_DIR, test_vector.filename))
    result = converter.convert(source_path.as_uri(), url=test_vector.url)

    for expected in test_vector.must_include:
        assert expected in result.markdown
    for forbidden in test_vector.must_not_include:
        assert forbidden not in result.markdown


@pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS)
def test_convert_data_uri(test_vector):
    """Convert each test file embedded as a base64 data URI and check the markdown."""
    converter = MarkItDown()

    # Embed the raw file bytes, base64-encoded, under the vector's MIME type
    source_path = os.path.join(TEST_FILES_DIR, test_vector.filename)
    with open(source_path, "rb") as handle:
        data = base64.b64encode(handle.read()).decode("utf-8")
    mimetype = test_vector.mimetype
    data_uri = f"data:{mimetype};base64,{data}"

    result = converter.convert(data_uri, url=test_vector.url)

    for expected in test_vector.must_include:
        assert expected in result.markdown
    for forbidden in test_vector.must_not_include:
        assert forbidden not in result.markdown


@pytest.mark.parametrize("test_vector", DATA_URI_TEST_VECTORS)
def test_convert_with_data_uris(test_vector):
def test_convert_keep_data_uris(test_vector):
"""Test API functionality when keep_data_uris is enabled"""
markitdown = MarkItDown()

Expand All @@ -143,7 +181,7 @@ def test_convert_with_data_uris(test_vector):


@pytest.mark.parametrize("test_vector", DATA_URI_TEST_VECTORS)
def test_convert_stream_with_data_uris(test_vector):
def test_convert_stream_keep_data_uris(test_vector):
"""Test the conversion of a stream with no stream info."""
markitdown = MarkItDown()

Expand Down Expand Up @@ -175,7 +213,9 @@ def test_convert_stream_with_data_uris(test_vector):
test_convert_local,
test_convert_stream_with_hints,
test_convert_stream_without_hints,
test_convert_url,
test_convert_http_uri,
test_convert_file_uri,
test_convert_data_uri,
]:
for test_vector in GENERAL_TEST_VECTORS:
print(
Expand All @@ -186,8 +226,8 @@ def test_convert_stream_with_data_uris(test_vector):

# Data URI tests
for test_function in [
test_convert_with_data_uris,
test_convert_stream_with_data_uris,
test_convert_keep_data_uris,
test_convert_stream_keep_data_uris,
]:
for test_vector in DATA_URI_TEST_VECTORS:
print(
Expand Down