Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions extract-core/extract_core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@
InputDoc,
MarkdownDoc,
OutputFormat,
PageIndexes,
Pages,
Ranges,
Result,
Status,
SupportedExt,
Expand Down Expand Up @@ -58,7 +59,8 @@
"MinerUConfig",
"MinerUPipelineConfig",
"OutputFormat",
"PageIndexes",
"Ranges",
"Pages",
"Pipeline",
"PipelineType",
"Result",
Expand Down
24 changes: 16 additions & 8 deletions extract-core/extract_core/objects.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import traceback
import uuid
from abc import ABC
from collections.abc import Sequence
from enum import StrEnum
from functools import cache
from io import BytesIO
Expand All @@ -16,7 +17,7 @@
no_enum_values_config,
safe_copy,
)
from pydantic import AfterValidator, RootModel, TypeAdapter
from pydantic import AfterValidator, Field, TypeAdapter
from pydantic import BaseModel as _BaseModel

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -203,18 +204,25 @@ def without_content(self) -> Self:
return safe_copy(self, update={"content": None})


class PageIndexes(RootModel[list[tuple[int, int]]]):
# Stores page end index
Ranges = list[tuple[int, int]]


class Pages(BaseModel):
total: int = 0
byte_ranges: Ranges = []

@classmethod
def from_page_end_indices(cls, lengths: list[int]) -> Self:
return [
((lengths[p - 1] if p > 0 else 0), lengths[p]) for p in range(len(lengths))
]
def from_pages_bytes_sizes(cls, sizes: Sequence[int]) -> Self:
    """Build an instance from the byte size of each page, in document order.

    Pages are laid out back to back: the first range starts at 0 and every
    following range starts where the previous one ends, so each entry is a
    half-open ``(start, end)`` pair.

    Args:
        sizes: Number of bytes occupied by each page, in page order.

    Returns:
        An instance whose ``total`` is ``len(sizes)`` and whose
        ``byte_ranges`` holds one ``(start, end)`` tuple per page.
    """
    ranges = []
    offset = 0
    for size in sizes:
        ranges.append((offset, offset + size))
        # The next page begins right after the current one.
        offset += size
    return cls(total=len(sizes), byte_ranges=ranges)


class ConversionOutput(BaseModel):
path: Path
pages: PageIndexes = []
pages: Pages = Field(default_factory=Pages)


class MarkdownDoc(ConversionOutput):
Expand Down
8 changes: 4 additions & 4 deletions extract-python/benches/compare.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

import markdown2
import pypdfium2
from extract_core import BaseModel, OutputFormat, PageIndexes
from extract_core import BaseModel, OutputFormat, Pages
from extract_python.utils import chdir
from html2image import Html2Image
from PIL import Image, ImageDraw
Expand Down Expand Up @@ -93,7 +93,7 @@ def side_by_side_md_page_comp(
if len(md_files) != 1:
msg = f"unexpected number of md files ({len(md_files)}) in {compared_path}"
raise ValueError(msg)
md_content = md_files[0].read_text()[page_ix[0] : page_ix[1]]
md_content = (md_files[0].read_bytes()[page_ix[0] : page_ix[1]]).decode()
# change the current dir so that the browser renders images properly
with chdir(compared_path):
md_page_im = _render_md(md_content, compared_path, html_size=ref_im.size)
Expand Down Expand Up @@ -140,9 +140,9 @@ def _scan_pages(
root: Path, comparison: ComparisonItem
) -> list[dict[str, tuple[int, int]]]:
all_pages = [
PageIndexes.model_validate_json(
Pages.model_validate_json(
(root / compared / "artifacts" / "pages.json").read_text()
).root
)
for compared in comparison.compared
]
all_pages = zip(*all_pages, strict=True)
Expand Down
2 changes: 1 addition & 1 deletion extract-python/extract_python/constants.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
ARTIFACTS = "artifacts"
DEFAULT_MD_PAGE_SEP = '<div style="page-break-after: always;"></div>'
DEFAULT_MD_PAGE_SEP = '\n<div style="page-break-after: always;"></div>\n'
56 changes: 27 additions & 29 deletions extract-python/extract_python/docling_.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@
InputDoc,
MarkdownDoc,
OutputFormat,
PageIndexes,
Pipeline,
PipelineType,
Result,
Expand All @@ -34,7 +33,7 @@
from pydantic_core.core_schema import SerializerFunctionWrapHandler

from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
from .utils import chdir, map_and_preserve, path_to_artifacts_dirname
from .utils import chdir, map_and_preserve, path_to_artifacts_dirname, write_pages

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -115,39 +114,38 @@ def _to_markdown_doc(
raise FileExistsError(f"directory {md_dir} already exists")
# Let's avoid the issue of duplicated input file names being flattened at the top level
md_filename = md_dir_name + OutputFormat.MARKDOWN
total_length = 0
n_pages = len(res.pages)

with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as td:
tmp_dir = Path(td)
page_path = Path("page.md")
# We do a chdir to bypass a Docling bug which only allows to maintain relative
# image ref when saving the markdown to a relative path
with (tmp_dir / md_filename).open("w") as f, chdir(tmp_dir):
end_indices = []
for page_i in range(n_pages):
res.document.save_as_markdown(
page_path,
page_no=page_i + 1,
image_mode=ImageRefMode.REFERENCED,
artifacts_dir=Path(ARTIFACTS),
**kwargs,
)
content = page_path.read_text()
if page_i > 0:
content += "\n"
if page_i < n_pages - 1:
content += page_sep
total_length += len(content)
end_indices.append(total_length)
f.write(content)
f.flush()
page_path.unlink()
md_path = tmp_dir / md_filename
current_page_path = tmp_dir / "page.md"
with chdir(tmp_dir):
# We chdir to work around a Docling bug: relative image references are
# only preserved when the markdown is saved to a relative path
pages = _docling_pages_it(res, current_page_path, **kwargs)
with md_path.open("wb") as f:
pages = write_pages(pages, page_sep, f)
# Clean up the tmp page file before moving everything to the final destination
current_page_path.unlink(missing_ok=True)
shutil.move(tmp_dir, md_dir)
pages = PageIndexes.from_page_end_indices(end_indices)
return MarkdownDoc(path=Path(md_dir_name), pages=pages)


def _docling_pages_it(
    res: ConversionResult, output_path: Path, **kwargs
) -> Iterable[str]:
    """Lazily yield the markdown of each page of a Docling conversion result.

    Each page is exported by Docling to ``output_path`` and then read back, so
    the file is overwritten on every iteration and is still on disk once the
    iterator is exhausted; removing it is left to the caller.

    Args:
        res: Conversion result whose ``document`` is exported page by page.
        output_path: Scratch file Docling writes every page to.
        **kwargs: Forwarded unchanged to ``save_as_markdown``.

    Yields:
        The markdown text of each page, in page order.
    """
    # ``page_no`` is 1-based, hence the shifted range.
    for page_no in range(1, len(res.pages) + 1):
        res.document.save_as_markdown(
            output_path,
            page_no=page_no,
            image_mode=ImageRefMode.REFERENCED,
            artifacts_dir=Path(ARTIFACTS),
            **kwargs,
        )
        # Decode explicitly instead of relying on the locale's default
        # encoding: the text is re-encoded as UTF-8 when written to the final
        # markdown file, so a locale-dependent decode could corrupt non-ASCII
        # content. NOTE(review): assumes Docling writes the export as UTF-8.
        yield output_path.read_text(encoding="utf-8")


class SerializableFormatOptions(DoclingFormatOption):
# Utility class to serialize Python format options into a JSON which can be
# correctly deserialized into a docling FormatOption
Expand Down
41 changes: 16 additions & 25 deletions extract-python/extract_python/marker_.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,14 @@
InputDoc,
MarkdownDoc,
OutputFormat,
PageIndexes,
Pipeline,
PipelineType,
Result,
Status,
)

from .constants import ARTIFACTS
from .utils import path_to_artifacts_dirname, report_recoverable_errors
from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
from .utils import path_to_artifacts_dirname, report_recoverable_errors, write_pages

if TYPE_CHECKING:
from marker.converters.pdf import PdfConverter
Expand Down Expand Up @@ -63,15 +62,22 @@ async def _process_doc(
content, _, images = text_from_rendered(rendered)
match output_format:
case OutputFormat.MARKDOWN:
output = _to_markdown_doc(doc, content, images, output_path)
output = _to_markdown_doc(
doc, content, images, output_path, page_sep=DEFAULT_MD_PAGE_SEP
)
case _:
raise NotImplementedError(f"unsupported output format {output_format}")
input_doc = doc.without_content()
return Result(input=input_doc, status=Status.SUCCESS, output=output)


def _to_markdown_doc(
input_doc: InputDoc, content: str, images: dict[str, "Image"], output_path: Path
input_doc: InputDoc,
content: str,
images: dict[str, "Image"],
output_path: Path,
*,
page_sep: str = DEFAULT_MD_PAGE_SEP,
) -> MarkdownDoc:
from marker.renderers.markdown import MarkdownRenderer # noqa: PLC0415

Expand All @@ -85,24 +91,9 @@ def _to_markdown_doc(
im.save(artifacts_dir / im_name)
del images
gc.collect()
page_sep = MarkdownRenderer.page_separator
content = content.split(page_sep)
n_pages = len(content)
md_path = (output_path / md_dir_name / md_dir_name).with_suffix(
OutputFormat.MARKDOWN.value
)
total_length = 0
end_indices = []
with md_path.open("w", encoding="utf-8") as f:
for page_i, page_content in enumerate(content):
content = page_content
if page_i > 0:
content += "\n"
if page_i < n_pages - 1:
content += page_sep
total_length += len(content)
end_indices.append(total_length)
f.write(content)
f.flush()
pages = PageIndexes.from_page_end_indices(end_indices)
pages = content.split(MarkdownRenderer.page_separator)
md_path = output_path / md_dir_name / md_dir_name
md_path = md_path.with_suffix(OutputFormat.MARKDOWN.value)
with md_path.open("wb") as f:
pages = write_pages(pages, page_sep, f)
return MarkdownDoc(path=Path(md_dir_name), pages=pages)
23 changes: 5 additions & 18 deletions extract-python/extract_python/miner_u.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,14 @@
MinerUBackend,
MinerUPipelineConfig,
OutputFormat,
PageIndexes,
Pipeline,
PipelineType,
Result,
Status,
)

from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
from .utils import path_to_artifacts_dirname, reset_env
from .utils import path_to_artifacts_dirname, reset_env, write_pages

_MINER_U_CONVERSION_ERRORS = tuple()
MDMakeFunction = Callable[[list, str, str], str | None]
Expand Down Expand Up @@ -148,21 +147,9 @@ def _dump_md_content(

if md_make_mode is None:
md_make_mode = MakeMode.MM_MD
total_length = 0
end_indices = []
with md_path.open("w") as f:
n_pages = len(pdf_info)
for page_i, page in enumerate(pdf_info):
content = md_make_fn([page], md_make_mode, str(im_dir))
if page_i > 0:
content += "\n"
if page_i < n_pages - 1:
content += page_sep
total_length += len(content)
end_indices.append(total_length)
f.write(content)
f.flush()
end_indices = PageIndexes.from_page_end_indices(end_indices)
pages = (md_make_fn([p], md_make_mode, str(im_dir)) for p in pdf_info)
with md_path.open("wb") as f:
pages = write_pages(pages, page_sep, f)
output_path = md_path.parent.relative_to(output_path)
output = ConversionOutput(path=output_path, pages=end_indices)
output = ConversionOutput(path=output_path, pages=pages)
return output
17 changes: 15 additions & 2 deletions extract-python/extract_python/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@
from functools import wraps
from itertools import tee
from pathlib import Path, PurePath
from typing import Protocol, TypeVar
from typing import BinaryIO, Protocol, TypeVar

from extract_core import Error, InputDoc, Result, Status
from extract_core import Error, InputDoc, Pages, Result, Status

R = TypeVar("R")
In = TypeVar("In")
Expand Down Expand Up @@ -73,3 +73,16 @@ def reset_env() -> Generator[None, None, None]:
finally:
os.environ.clear()
os.environ.update(old_env)


def write_pages(pages: Iterable[str], page_sep: str, out: BinaryIO) -> Pages:
    """Write pages to ``out`` separated by ``page_sep`` and record their sizes.

    Every page except the last is written followed by ``page_sep``, and the
    separator bytes are counted as part of that page. Text is encoded as UTF-8
    before being written.

    Empty pages are kept (as a zero-length page, plus the separator when
    another page follows) so that the n-th recorded range always corresponds to
    the n-th input page. A ``None`` page is treated as an empty one.

    Args:
        pages: Text of each page, in document order; consumed lazily.
        page_sep: Separator written after every page but the last.
        out: Binary stream the encoded pages are written to.

    Returns:
        A ``Pages`` built from the number of bytes written for each page.
    """
    sep = page_sep.encode()
    pages_byte_sizes = []
    # Hold each page back by one iteration: whether a separator must follow it
    # is only known once the next page (if any) has been pulled.
    pending = None
    for page in pages:
        # Compare against None rather than truthiness, otherwise an empty page
        # would be dropped and shift every following page range.
        if pending is not None:
            pages_byte_sizes.append(out.write(pending + sep))
        pending = (page or "").encode()
    if pending is not None:
        pages_byte_sizes.append(out.write(pending))
    return Pages.from_pages_bytes_sizes(pages_byte_sizes)
2 changes: 1 addition & 1 deletion extract-python/tests/test_docling.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ async def test_docling_pdf_to_markdown(
assert (output_path / p).is_dir()
assert (output_path / p / p.name).with_suffix(".md").exists()
assert any((output_path / p).glob("artifacts/*.png"))
assert all(r.output.pages for r in res)
assert all(r.output.pages.byte_ranges for r in res)
assert not any(r.errors for r in res)
input_path = [r.input.path for r in res]
expected_input_path = [
Expand Down
2 changes: 1 addition & 1 deletion extract-python/tests/test_marker.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ async def test_marker_pdf_to_markdown(
assert (output_path / p).is_dir()
assert (output_path / p / p.name).with_suffix(".md").exists()
assert any((output_path / p).glob("artifacts/*.jpeg"))
assert all(r.output.pages for r in res)
assert all(r.output.pages.byte_ranges for r in res)
assert not any(r.errors for r in res)
input_path = [r.input.path for r in res]
expected_path = [
Expand Down
2 changes: 1 addition & 1 deletion extract-python/tests/test_miner_u.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ async def test_miner_u_pdf_to_markdown(
assert (output_path / p).is_dir()
assert (output_path / p / p.name).with_suffix(".md").exists()
assert any((output_path / p).glob("artifacts/*.jpg"))
assert all(r.output.pages for r in res)
assert all(r.output.pages.byte_ranges for r in res)
assert not any(r.errors for r in res)
input_path = [r.input.path for r in res]
expected_path = [
Expand Down
Loading