From: Vincent Vanwaelscappel
Date: Mon, 4 Aug 2025 13:14:57 +0000 (+0200)
Subject: #7678 @0.5
X-Git-Url: http://git.cubedesigners.com/?a=commitdiff_plain;h=25dc08e88882e31b8aab0fe88b768d26442fa93b;p=cubist_pdf.git

#7678 @0.5
---

diff --git a/resources/tools/docling/convert_page.py b/resources/tools/docling/convert_page.py
index 1e29469..7cbf9a4 100644
--- a/resources/tools/docling/convert_page.py
+++ b/resources/tools/docling/convert_page.py
@@ -1,4 +1,5 @@
 import sys
+import os
 from pathlib import Path
 
 from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
@@ -23,7 +24,13 @@ result = converter.convert(sys.argv[1])
 allpages=result.document.export_to_markdown(page_break_placeholder="", image_mode=ImageRefMode.EMBEDDED);
 Path(sys.argv[2]+"document.md").write_text(allpages)
 i=0
+j=0
 for md in allpages.split(""):
     i+=1
-    Path(sys.argv[2]+"p"+str(i)+".md").write_text(md)
+    j+=1
+    # Presumably a texts/ph{j}.html under 21 bytes marks a page without text: give it an empty p{j}.md and advance (TODO confirm threshold).
+    while os.stat(Path(sys.argv[2] + f"/../texts/ph{j}.html")).st_size < 21:
+        Path(sys.argv[2]+"p"+str(j)+".md").write_text("")
+        j+=1
+    Path(sys.argv[2]+"p"+str(j)+".md").write_text(md)
 