Git - cubist_pdf.git/commitdiff
wip #7556 @4
author: Vincent Vanwaelscappel <vincent@cubedesigners.com>
Thu, 3 Jul 2025 14:00:46 +0000 (16:00 +0200)
committer: Vincent Vanwaelscappel <vincent@cubedesigners.com>
Thu, 3 Jul 2025 14:00:46 +0000 (16:00 +0200)
.idea/cubist_pdf.iml
.idea/misc.xml
resources/tools/docling/convert_page.py [new file with mode: 0644]
resources/tools/fwstk/.idea/workspace.xml
src/PDFTools.php

index 3f1fb7b0d53bb531ab212815a36b7c2b2f623ddd..06ceace079eca909e97492d984d1e0b40918eb48 100644 (file)
@@ -1,5 +1,10 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <module type="JAVA_MODULE" version="4">
+  <component name="FacetManager">
+    <facet type="Python" name="Python">
+      <configuration sdkName="Python 3.13" />
+    </facet>
+  </component>
   <component name="NewModuleRootManager" inherit-compiler-output="true">
     <exclude-output />
     <content url="file://$MODULE_DIR$">
@@ -85,5 +90,9 @@
     </content>
     <orderEntry type="inheritedJdk" />
     <orderEntry type="sourceFolder" forTests="false" />
+    <orderEntry type="library" name="Python 3.13 interpreter library" level="application" />
+  </component>
+  <component name="PackageRequirementsSettings">
+    <option name="requirementsPath" value="" />
   </component>
 </module>
\ No newline at end of file
index 3761743f2ec011700e2045b4e2ce856b27661577..921560d3f803aa107e24501eb037416c254c5c2a 100644 (file)
@@ -1,8 +1,12 @@
+<?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
   <component name="AhkProjectSettings">
     <option name="defaultAhkSdk" value="AutoHotkey" />
   </component>
-  <component name="ProjectRootManager" version="2" languageLevel="JDK_1_6" project-jdk-name="18" project-jdk-type="JavaSDK">
+  <component name="Black">
+    <option name="sdkName" value="Python 3.12" />
+  </component>
+  <component name="ProjectRootManager" version="2" languageLevel="JDK_1_6">
     <output url="file://$PROJECT_DIR$/out" />
   </component>
 </project>
\ No newline at end of file
diff --git a/resources/tools/docling/convert_page.py b/resources/tools/docling/convert_page.py
new file mode 100644 (file)
index 0000000..402a21e
--- /dev/null
@@ -0,0 +1,27 @@
+import sys
+from pathlib import Path
+
+from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
+from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
+from docling.datamodel.document import InputFormat
+from docling.datamodel.pipeline_options import PdfPipelineOptions
+from docling.document_converter import DocumentConverter, PdfFormatOption
+from docling_core.types.doc import ImageRefMode
+
+# Usage: python convert_page.py <input.pdf> <output_dir>
+# Converts the PDF with docling and writes one Markdown file per page
+# (p1.md, p2.md, ...) into <output_dir>, with pictures embedded in the Markdown.
+
+PAGE_BREAK = "<!-- page break -->"
+
+# Build a single options object and hand that same object to the converter, so
+# the OCR / table-structure settings below are actually applied instead of
+# being discarded in favour of a second, freshly constructed options object.
+pipeline_options = PdfPipelineOptions(generate_picture_images=True)
+pipeline_options.do_ocr = True
+pipeline_options.do_table_structure = True
+pipeline_options.table_structure_options.do_cell_matching = True
+converter = DocumentConverter(
+    format_options={
+        InputFormat.PDF: PdfFormatOption(
+            backend=DoclingParseV4DocumentBackend,
+            pipeline_options=pipeline_options,
+        )
+    }
+)
+result = converter.convert(sys.argv[1])
+markdown = result.document.export_to_markdown(
+    page_break_placeholder=PAGE_BREAK,
+    image_mode=ImageRefMode.EMBEDDED,
+)
+
+# argv[2] is treated as a directory (with or without a trailing separator), so
+# the page files always land inside it rather than next to it.
+output_dir = Path(sys.argv[2])
+for page_number, page_markdown in enumerate(markdown.split(PAGE_BREAK), start=1):
+    # Explicit UTF-8 so the result does not depend on the platform's default encoding.
+    (output_dir / f"p{page_number}.md").write_text(page_markdown, encoding="utf-8")
\ No newline at end of file
index 06680535f45e2f8f645f33b36fbc7a7534e32497..069fa80f70e003cf6604640582bdc3bf9b36b107 100644 (file)
       <workItem from="1732557350064" duration="14000" />
       <workItem from="1748349213267" duration="1488000" />
       <workItem from="1748351552423" duration="1932000" />
-      <workItem from="1748355409566" duration="636000" />
+      <workItem from="1748355409566" duration="1244000" />
+      <workItem from="1748356736199" duration="3367000" />
     </task>
     <task id="LOCAL-00001" summary="wip #1111 @0.5">
       <created>1487172253077</created>
index c4213652bf468f6a7812f150f69c61882a9bc7db..d7f3438e9ee83367917d9a2155f40285463d6b11 100644 (file)
@@ -589,6 +589,16 @@ class PDFTools
         }
     }
 
+    /**
+     * Runs the docling conversion script (tools/docling/convert_page.py) on a PDF.
+     *
+     * The script is launched through the `python` executable with two unnamed
+     * arguments, in this order: the PDF and the value returned by
+     * Files::mkdir() for `$out . '/docling'`. Nothing is returned to the caller.
+     *
+     * NOTE(review): despite the method name, the script exports per-page
+     * Markdown files rather than HTML — confirm whether a further conversion
+     * step is intended.
+     *
+     * @param string $pdf PDF to convert (assumed to be a filesystem path — confirm against callers).
+     * @param string $out Base output location; '/docling' is appended to it.
+     */
+    public static function extractAccessibleHTML($pdf, $out)
+    {
+        $cmd = new CommandLine('python');
+
+        $positionalArgs = [
+            self::resource_path('tools/docling/convert_page.py'),
+            $pdf,
+            Files::mkdir($out . '/docling'),
+        ];
+        foreach ($positionalArgs as $value) {
+            // Every argument is registered with a null name, exactly as before.
+            $cmd->setArg(null, $value);
+        }
+
+        $cmd->execute();
+        $cmd->debug();
+    }
+
     public static function extractHighlightsData($pdf, $out, $mode = 'standard', $ignoreSeparators = '')
     {
         $out .= 'texts';