<?xml version="1.0" encoding="UTF-8"?>
<module type="JAVA_MODULE" version="4">
+ <component name="FacetManager">
+ <facet type="Python" name="Python">
+ <configuration sdkName="Python 3.13" />
+ </facet>
+ </component>
<component name="NewModuleRootManager" inherit-compiler-output="true">
<exclude-output />
<content url="file://$MODULE_DIR$">
</content>
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
+ <orderEntry type="library" name="Python 3.13 interpreter library" level="application" />
+ </component>
+ <component name="PackageRequirementsSettings">
+ <option name="requirementsPath" value="" />
</component>
</module>
\ No newline at end of file
+<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="AhkProjectSettings">
<option name="defaultAhkSdk" value="AutoHotkey" />
</component>
- <component name="ProjectRootManager" version="2" languageLevel="JDK_1_6" project-jdk-name="18" project-jdk-type="JavaSDK">
+ <component name="Black">
+ <option name="sdkName" value="Python 3.12" />
+ </component>
+ <component name="ProjectRootManager" version="2" languageLevel="JDK_1_6">
<output url="file://$PROJECT_DIR$/out" />
</component>
</project>
\ No newline at end of file
--- /dev/null
+import sys
+from pathlib import Path
+
+from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
+from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
+from docling.datamodel.document import InputFormat
+from docling.datamodel.pipeline_options import PdfPipelineOptions
+from docling.document_converter import DocumentConverter, PdfFormatOption
+from docling_core.types.doc import ImageRefMode
+
+pipeline_options = PdfPipelineOptions()
+pipeline_options.do_ocr = True
+pipeline_options.do_table_structure = True
+pipeline_options.table_structure_options.do_cell_matching = True
+converter = DocumentConverter(
+ format_options={
+ InputFormat.PDF: PdfFormatOption(
+ backend=DoclingParseV4DocumentBackend,
+ pipeline_options=PdfPipelineOptions(generate_picture_images=True)
+ )
+ }
+)
+result = converter.convert(sys.argv[1])
+i=0
+for md in result.document.export_to_markdown(page_break_placeholder="<!-- page break -->", image_mode=ImageRefMode.EMBEDDED).split("<!-- page break -->"):
+ i+=1
+ Path(sys.argv[2]+"p"+str(i)+".md").write_text(md)
\ No newline at end of file
<workItem from="1732557350064" duration="14000" />
<workItem from="1748349213267" duration="1488000" />
<workItem from="1748351552423" duration="1932000" />
- <workItem from="1748355409566" duration="636000" />
+ <workItem from="1748355409566" duration="1244000" />
+ <workItem from="1748356736199" duration="3367000" />
</task>
<task id="LOCAL-00001" summary="wip #1111 @0.5">
<created>1487172253077</created>
}
}
+    /**
+     * Runs the docling page-conversion script against a PDF.
+     *
+     * Builds a `python` command line with three positional arguments, in order:
+     * the bundled tools/docling/convert_page.py script, the PDF, and the value
+     * returned by Files::mkdir() for "<out>/docling". The command is then
+     * executed and its debug() method is called.
+     *
+     * @param mixed $pdf PDF to convert, passed through to the script unchanged
+     *                   (presumably a file path; confirm against callers)
+     * @param mixed $out base output location; '/docling' is appended to it
+     * @return void
+     */
+    public static function extractAccessibleHTML($pdf, $out)
+    {
+        $command = new CommandLine('python');
+
+        $script = self::resource_path('tools/docling/convert_page.py');
+        $command->setArg(null, $script);
+
+        $command->setArg(null, $pdf);
+
+        $target = Files::mkdir($out . '/docling');
+        $command->setArg(null, $target);
+
+        $command->execute();
+        $command->debug();
+    }
+
public static function extractHighlightsData($pdf, $out, $mode = 'standard', $ignoreSeparators = '')
{
$out .= 'texts';