From 8dc747a96f6fd54565bfd9eb07a789796b279f9b Mon Sep 17 00:00:00 2001 From: Vincent Vanwaelscappel Date: Wed, 15 Sep 2021 18:37:05 +0200 Subject: [PATCH] wip #4666 @0.75 --- .gitignore | 2 + .idea/artifacts/fwstk.xml | 21 +++ .idea/deployment.xml | 2 +- .idea/fluidbook_tools.iml | 98 +++++++++++++- .idea/misc.xml | 2 +- .../fwstk/src/com/fluidbook/fwstk/Main.java | 120 +++++++++--------- src/PDF/Document.php | 3 +- 7 files changed, 185 insertions(+), 63 deletions(-) create mode 100644 .idea/artifacts/fwstk.xml diff --git a/.gitignore b/.gitignore index d85eee5..1e01c5e 100644 --- a/.gitignore +++ b/.gitignore @@ -23,3 +23,5 @@ Homestead.json /.vagrant .phpunit.result.cache +resources/tools/fwstk/bin +resources/tools/fwstk/out diff --git a/.idea/artifacts/fwstk.xml b/.idea/artifacts/fwstk.xml new file mode 100644 index 0000000..d3548ed --- /dev/null +++ b/.idea/artifacts/fwstk.xml @@ -0,0 +1,21 @@ + + + $PROJECT_DIR$/resources/tools/fwstk/out/artifacts/fwstk_jar + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/deployment.xml b/.idea/deployment.xml index a03963b..5e83037 100644 --- a/.idea/deployment.xml +++ b/.idea/deployment.xml @@ -1,6 +1,6 @@ - + diff --git a/.idea/fluidbook_tools.iml b/.idea/fluidbook_tools.iml index c30c3f6..192d012 100644 --- a/.idea/fluidbook_tools.iml +++ b/.idea/fluidbook_tools.iml @@ -1,8 +1,12 @@ - + + + + + @@ -67,8 +71,100 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml index b658e95..ff9d5a0 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,7 +3,7 @@ - + \ No newline at end of file diff --git a/resources/tools/fwstk/src/com/fluidbook/fwstk/Main.java b/resources/tools/fwstk/src/com/fluidbook/fwstk/Main.java index 4c3ffc5..56ae470 100644 --- a/resources/tools/fwstk/src/com/fluidbook/fwstk/Main.java +++ b/resources/tools/fwstk/src/com/fluidbook/fwstk/Main.java @@ -11,6 +11,8 @@ import java.util.ArrayList; import java.util.Calendar; import java.util.List; +import org.apache.commons.lang3.StringUtils; +import org.apache.commons.text.StringEscapeUtils; import org.apache.pdfbox.exceptions.COSVisitorException; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocumentInformation; @@ -44,16 +46,16 @@ import org.apache.pdfbox.util.PDFImageWriter; public class Main { - private static String version = "0.93"; - private static String date = "20110904"; + private static String version = "0.94"; + private static String date = "20210915"; static int threads = 1; static String input = ""; static Float linkOffsetX = 0.0f; static Float linkOffsetY = 0.0f; public static void main(String[] args) throws IOException, - COSVisitorException, ClassNotFoundException, NullPointerException, - InterruptedException { + COSVisitorException, ClassNotFoundException, NullPointerException, + InterruptedException { if (args.length < 0) { printUsage(); @@ -91,7 +93,7 @@ public class Main { printVersion(); return; } else if (args[i].trim().compareTo("-h") == 0 - || args[i].trim().compareTo("--help") == 0) { + || args[i].trim().compareTo("--help") == 0) { printUsage(); return; } else if (args[i].trim().compareTo("--trim") == 0) { @@ -242,10 +244,10 @@ public class Main { stripper.process(nextPage, i); Page layout = stripper.getLayout(); BufferedWriter out - = new BufferedWriter( - new OutputStreamWriter( - new FileOutputStream(layoutOutput.replace("%d", "" - + i)), "UTF-8")); + = new BufferedWriter( + new OutputStreamWriter( + new FileOutputStream(layoutOutput.replace("%d", "" + + i)), "UTF-8")); out.write(layout.asJSON()); out.close(); } @@ -253,10 +255,11 @@ public class Main { private static Boolean getInfos(PDDocument doc) throws IOException, COSVisitorException { ArrayList res = new ArrayList<>(); + res.add("Parsed by FWSTK: \t\t\t" + Main.version + " (" + Main.date + ")"); // General informations String[] fields = {"Author", "Title", "CreationDate", "Creator", - "Keywords", "ModificationDate", "Producer", "Subject", - "Trapped", "Version"}; + "Keywords", "ModificationDate", "Producer", "Subject", + "Trapped", "Version"}; PDDocumentInformation infos = doc.getDocumentInformation(); for (String k : fields) { String v = infos.getCustomMetadataValue(k); @@ -268,6 +271,7 @@ public class Main { List list = doc.getDocumentCatalog().getAllPages(); int pages = list.size(); + res.add("Pages:\t\t\t" + pages); Boolean changes = false; for (int i = 0; i < pages; i++) { @@ -299,47 +303,47 @@ public class Main { // Size if (page.getRotation() != null - && (page.getRotation() == 90 || page.getRotation() == 270)) { + && (page.getRotation() == 90 || page.getRotation() == 270)) { res.add("Page " + numero + " size:\t\t" - + Math.abs(cropBox.getHeight()) + " pts x " - + Math.abs(cropBox.getWidth()) + " pts"); + + Math.abs(cropBox.getHeight()) + " pts x " + + Math.abs(cropBox.getWidth()) + " pts"); // Boxes res.add("Page " + numero + " CropBox:\t" - + cropBox.getLowerLeftY() + "\t" - + cropBox.getUpperRightX() + "\t" - + cropBox.getUpperRightY() + "\t" - + cropBox.getLowerLeftX() + "\t"); + + cropBox.getLowerLeftY() + "\t" + + cropBox.getUpperRightX() + "\t" + + cropBox.getUpperRightY() + "\t" + + cropBox.getLowerLeftX() + "\t"); res.add("Page " + numero + " MediaBox:\t" - + mediaBox.getLowerLeftY() + "\t" - + mediaBox.getUpperRightX() + "\t" - + mediaBox.getUpperRightY() + "\t" - + mediaBox.getLowerLeftX() + "\t"); + + mediaBox.getLowerLeftY() + "\t" + + mediaBox.getUpperRightX() + "\t" + + mediaBox.getUpperRightY() + "\t" + + mediaBox.getLowerLeftX() + "\t"); res.add("Page " + numero + " TrimBox:\t" - + trimBox.getLowerLeftY() + "\t" - + trimBox.getUpperRightX() + "\t" - + trimBox.getUpperRightY() + "\t" - + trimBox.getLowerLeftX() + "\t"); + + trimBox.getLowerLeftY() + "\t" + + trimBox.getUpperRightX() + "\t" + + trimBox.getUpperRightY() + "\t" + + trimBox.getLowerLeftX() + "\t"); } else { res.add("Page " + numero + " size:\t\t" - + Math.abs(cropBox.getWidth()) + " pts x " - + Math.abs(cropBox.getHeight()) + " pts"); + + Math.abs(cropBox.getWidth()) + " pts x " + + Math.abs(cropBox.getHeight()) + " pts"); // Boxes res.add("Page " + numero + " CropBox:\t" - + cropBox.getLowerLeftX() + "\t" - + cropBox.getUpperRightY() + "\t" - + cropBox.getUpperRightX() + "\t" - + cropBox.getLowerLeftY() + "\t"); + + cropBox.getLowerLeftX() + "\t" + + cropBox.getUpperRightY() + "\t" + + cropBox.getUpperRightX() + "\t" + + cropBox.getLowerLeftY() + "\t"); res.add("Page " + numero + " MediaBox:\t" - + mediaBox.getLowerLeftX() + "\t" - + mediaBox.getUpperRightY() + "\t" - + mediaBox.getUpperRightX() + "\t" - + mediaBox.getLowerLeftY() + "\t"); + + mediaBox.getLowerLeftX() + "\t" + + mediaBox.getUpperRightY() + "\t" + + mediaBox.getUpperRightX() + "\t" + + mediaBox.getLowerLeftY() + "\t"); res.add("Page " + numero + " TrimBox:\t" - + trimBox.getLowerLeftX() + "\t" - + trimBox.getUpperRightY() + "\t" - + trimBox.getUpperRightX() + "\t" - + trimBox.getLowerLeftY() + "\t"); + + trimBox.getLowerLeftX() + "\t" + + trimBox.getUpperRightY() + "\t" + + trimBox.getUpperRightX() + "\t" + + trimBox.getLowerLeftY() + "\t"); } } @@ -356,7 +360,7 @@ public class Main { } for (String s : res) { - System.out.println(s); + System.out.println(StringUtils.trim(s)); } return changes; @@ -392,17 +396,17 @@ public class Main { } } res.add("NumberSectionsDelimiters:\t\t" - + delimiters.substring(0, delimiters.length() - 1)); + + delimiters.substring(0, delimiters.length() - 1)); } private static void addBookmark(PDDocument doc, ArrayList res, PDOutlineNode bookmark, int level) throws IOException { PDOutlineItem current = bookmark.getFirstChild(); while (current != null) { - res.add("BookmarkTitle:\t\t" + current.getTitle().trim()); + res.add("BookmarkTitle:\t\t" + StringEscapeUtils.escapeHtml4(current.getTitle())); res.add("BookmarkLevel:\t\t" + level); res.add("BookmarkPage:\t\t" - + getPageFromAction(doc, current.getAction())); + + getPageFromAction(doc, current.getAction())); addBookmark(doc, res, current, level + 1); current = current.getNextSibling(); } @@ -410,10 +414,10 @@ public class Main { private static void cutDocument(PDDocument doc, String input, String output, String cutmode) throws COSVisitorException, - IOException { + IOException { System.out.println("Cut document of " + doc.getNumberOfPages() - + " with mode " + cutmode); + + " with mode " + cutmode); ArrayList copies = duplicatePages(doc, input, cutmode); cutPages(doc, cutmode); @@ -464,8 +468,8 @@ public class Main { newbox.move(decalage, 0f); System.out.println("Set cropbox of page " + page + " from " - + pdfPage.getCropBox() + " to " + newbox + " (offset : " - + decalage + ")"); + + pdfPage.getCropBox() + " to " + newbox + " (offset : " + + decalage + ")"); pdfPage.setCropBox(newbox); pdfPage.setMediaBox(newbox); @@ -519,7 +523,7 @@ public class Main { continue; } System.out.println("Duplicate page " + page + " :: cursor is at " - + cursor); + + cursor); // Duplicate page for (int j = 0; j < duplicateTime; j++) { List l = copies.get(j).getDocumentCatalog().getAllPages(); @@ -541,7 +545,7 @@ public class Main { private void extractTexts(PDDocument doc, String textsOutput, String method, Integer[] pages, String ignoredSeparators, String input) throws IOException, ClassNotFoundException, - NullPointerException, InterruptedException { + NullPointerException, InterruptedException { long s = Calendar.getInstance().getTimeInMillis(); @@ -551,7 +555,7 @@ public class Main { PDDocument d; int totalThreads = Math.max(1, - Math.min(Math.round(pages.length / 50.0f), Main.threads)); + Math.min(Math.round(pages.length / 50.0f), Main.threads)); System.out.println("Total threads " + totalThreads); @@ -577,13 +581,13 @@ public class Main { } System.out.println("Extraction des textes with " + method + " : " - + ((Calendar.getInstance().getTimeInMillis() - s) / 1000) - + "s"); + + ((Calendar.getInstance().getTimeInMillis() - s) / 1000) + + "s"); } public static void updateCropBox(PDDocument doc, String output, String refbox, Integer[] pages, String defined) throws IOException, - COSVisitorException { + COSVisitorException { System.out.println("updateCropBox"); if (!"".equals(defined)) { updateCropBoxDefined(doc, defined); @@ -596,7 +600,7 @@ public class Main { } private static void updateCropBoxDefined(PDDocument doc, String defined) - throws IOException, COSVisitorException { + throws IOException, COSVisitorException { String[] e = defined.split("*"); for (int i = 0; i < e.length; i++) { String[] e1 = e[i].split(","); @@ -674,7 +678,7 @@ public class Main { } public static void saveLinks(String file, ArrayList listLinks) - throws IOException { + throws IOException { FileIO out = new FileIO(file); out.open("w"); out.output.writeBytes(Link.header()); @@ -745,7 +749,7 @@ public class Main { } public static ArrayList extractLinksOfPage(PDDocument doc, int pageNumber, PDPage p) - throws IOException { + throws IOException { System.out.println(pageNumber); ArrayList listLinks = new ArrayList<>(); Link myLink; @@ -804,7 +808,7 @@ public class Main { System.out.println(link.getRectangle().getHeight()); myLink.rect = link.getRectangle(); if (myLink.rect.getWidth() == 0.0 - || myLink.rect.getHeight() == 0.0) { + || myLink.rect.getHeight() == 0.0) { System.out.println("Skip link :: surface == 0"); continue; } diff --git a/src/PDF/Document.php b/src/PDF/Document.php index 7f1d39a..96d6691 100644 --- a/src/PDF/Document.php +++ b/src/PDF/Document.php @@ -89,7 +89,6 @@ class Document $lines = Text::explodeNewLines($infos); $bookmark_id = -1; - foreach ($lines as $line) { $line = trim($line); [$k, $v] = explode(':', $line); @@ -98,7 +97,7 @@ class Document if ($k === 'BookmarkTitle') { $bookmark_id++; - $this->chapters[$bookmark_id] = array('label' => str_replace(' ', '', $v)); + $this->chapters[$bookmark_id] = array('label' => html_entity_decode($v)); } elseif ($k === 'BookmarkLevel') { $this->chapters[$bookmark_id]['level'] = (int)$v - 1; } elseif ($k === 'BookmarkPage') { -- 2.39.5