From ccd5d6e8d14444231350b4f57bd6ed8144503466 Mon Sep 17 00:00:00 2001
From: Vincent Vanwaelscappel
Date: Thu, 3 Jul 2025 16:00:46 +0200
Subject: [PATCH] wip #7556 @4

---
Review notes (this section is ignored by git-am):
  * convert_page.py: the page-break placeholder and the split separator were
    both the empty string; str.split("") raises ValueError, so the script
    could never write a page. A non-empty marker is now used for both.
  * convert_page.py: the configured pipeline_options object is now the one
    handed to PdfFormatOption (it was built and then discarded).
  * Hunk sizes and the diffstat were updated for the edited files; their
    "index" lines still name the pre-edit blobs.
  * The .idea hunks carry only their +/- markers in this copy (no XML
    payload); they are kept verbatim and are unlikely to apply as-is.

 .idea/cubist_pdf.iml                      |  9 +++++
 .idea/misc.xml                            |  6 +++-
 resources/tools/docling/convert_page.py   | 40 +++++++++++++++++++++++
 resources/tools/fwstk/.idea/workspace.xml |  3 +-
 src/PDFTools.php                          | 16 +++++++++
 5 files changed, 72 insertions(+), 2 deletions(-)
 create mode 100644 resources/tools/docling/convert_page.py

diff --git a/.idea/cubist_pdf.iml b/.idea/cubist_pdf.iml
index 3f1fb7b..06ceace 100644
--- a/.idea/cubist_pdf.iml
+++ b/.idea/cubist_pdf.iml
@@ -1,5 +1,10 @@
+
+
+
+
+
@@ -85,5 +90,9 @@
+
+
+
+
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
index 3761743..921560d 100644
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@@ -1,8 +1,12 @@
+
-
+
+
+
\ No newline at end of file
diff --git a/resources/tools/docling/convert_page.py b/resources/tools/docling/convert_page.py
new file mode 100644
index 0000000..402a21e
--- /dev/null
+++ b/resources/tools/docling/convert_page.py
@@ -0,0 +1,40 @@
+"""Convert a PDF to Markdown with Docling, writing one file per page.
+
+Usage: convert_page.py INPUT_PDF OUTPUT_PREFIX
+
+Page N is written to OUTPUT_PREFIX + "p" + N + ".md"; the prefix is used as-is,
+so no path separator is inserted between it and the file name.
+"""
+import sys
+from pathlib import Path
+
+from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
+from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
+from docling.datamodel.document import InputFormat
+from docling.datamodel.pipeline_options import PdfPipelineOptions
+from docling.document_converter import DocumentConverter, PdfFormatOption
+from docling_core.types.doc import ImageRefMode
+
+# Marker emitted between pages and then used to split the export. It must be
+# non-empty: str.split("") raises ValueError.
+PAGE_BREAK = "<!-- page break -->"
+
+pipeline_options = PdfPipelineOptions(generate_picture_images=True)
+pipeline_options.do_ocr = True
+pipeline_options.do_table_structure = True
+pipeline_options.table_structure_options.do_cell_matching = True
+converter = DocumentConverter(
+    format_options={
+        InputFormat.PDF: PdfFormatOption(
+            backend=DoclingParseV4DocumentBackend,
+            # Pass the options configured above rather than a fresh object.
+            pipeline_options=pipeline_options,
+        )
+    }
+)
+result = converter.convert(sys.argv[1])
+markdown = result.document.export_to_markdown(
+    page_break_placeholder=PAGE_BREAK, image_mode=ImageRefMode.EMBEDDED
+)
+for i, md in enumerate(markdown.split(PAGE_BREAK), start=1):
+    Path(sys.argv[2] + "p" + str(i) + ".md").write_text(md, encoding="utf-8")
\ No newline at end of file
diff --git a/resources/tools/fwstk/.idea/workspace.xml b/resources/tools/fwstk/.idea/workspace.xml
index 0668053..069fa80 100644
--- a/resources/tools/fwstk/.idea/workspace.xml
+++ b/resources/tools/fwstk/.idea/workspace.xml
@@ -517,7 +517,8 @@
-
+
+
 1487172253077
diff --git a/src/PDFTools.php b/src/PDFTools.php
index c421365..d7f3438 100644
--- a/src/PDFTools.php
+++ b/src/PDFTools.php
@@ -589,6 +589,22 @@ class PDFTools
         }
     }
 
+    /**
+     * Run the Docling script tools/docling/convert_page.py on a PDF.
+     *
+     * @param string $pdf PDF passed to the script as its first argument
+     * @param string $out Base output path; '/docling' is appended and passed through Files::mkdir()
+     */
+    public static function extractAccessibleHTML($pdf, $out)
+    {
+        $docling = new CommandLine('python');
+        $docling->setArg(null, self::resource_path('tools/docling/convert_page.py'));
+        $docling->setArg(null, $pdf);
+        $docling->setArg(null, Files::mkdir($out . '/docling'));
+        $docling->execute();
+        $docling->debug();
+    }
+
     public static function extractHighlightsData($pdf, $out, $mode = 'standard', $ignoreSeparators = '')
     {
         $out .= 'texts';
-- 
2.39.5