]> _ Git - cubist_pdf.git/commitdiff
#7678 @0.5
author Vincent Vanwaelscappel <vincent@cubedesigners.com>
Mon, 4 Aug 2025 14:25:00 +0000 (16:25 +0200)
committer Vincent Vanwaelscappel <vincent@cubedesigners.com>
Mon, 4 Aug 2025 14:25:00 +0000 (16:25 +0200)
resources/tools/docling/convert_page.py

index 86d4f7af35de7aeee47aed5c08ce8e199ef2dc86..ff80a37ddddcaab2050b2c5a553df5e5157ab492 100644 (file)
@@ -21,5 +21,13 @@ converter = DocumentConverter(
     }
 )
 result = converter.convert(sys.argv[1])
+allpages=result.document.export_to_markdown(page_break_placeholder="<!-- page break -->", image_mode=ImageRefMode.EMBEDDED);
+Path(sys.argv[2]+"document.md").write_text(allpages)
+i=0
+for md in allpages.split("<!-- page break -->"):
+    i+=1
+    while not converter.convert(sys.argv[1]).pages[1].parsed_page.has_chars:
+        Path(sys.argv[2]+"p"+str(i)+".md").write_text("")
+        i+=1
+    Path(sys.argv[2]+"p"+str(i)+".md").write_text(md)
 
-Path(sys.argv[2]).write_text(result.document.export_to_markdown(image_mode=ImageRefMode.EMBEDDED))