Git - cubist_pdf.git/commitdiff
#7678 @0.5
author Vincent Vanwaelscappel <vincent@cubedesigners.com>
Mon, 4 Aug 2025 13:14:57 +0000 (15:14 +0200)
committer Vincent Vanwaelscappel <vincent@cubedesigners.com>
Mon, 4 Aug 2025 13:14:57 +0000 (15:14 +0200)
resources/tools/docling/convert_page.py

index 1e294696a403eeea24d1bbc5adbbb1e15fb4a615..7cbf9a4e182346ddb1554410a0d651aa24986c2c 100644 (file)
@@ -1,4 +1,5 @@
 import sys
+import os
 from pathlib import Path
 
 from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
@@ -23,7 +24,12 @@ result = converter.convert(sys.argv[1])
 allpages=result.document.export_to_markdown(page_break_placeholder="<!-- page break -->", image_mode=ImageRefMode.EMBEDDED);
 Path(sys.argv[2]+"document.md").write_text(allpages)
 i=0
+j=0
 for md in allpages.split("<!-- page break -->"):
     i+=1
-    Path(sys.argv[2]+"p"+str(i)+".md").write_text(md)
+    j+=1
+    while os.stats(Path(sys.argv[2] + "/../texts/ph{j}.html")).st_size < 21:
+        Path(sys.argv[2]+"p"+str(j)+".md").write_text("")
+        j+=1
+    Path(sys.argv[2]+"p"+str(j)+".md").write_text(md)