Git - cubist_pdf.git/commitdiff
wip #6188 @0.25
author Vincent Vanwaelscappel <vincent@cubedesigners.com>
Thu, 7 Sep 2023 15:59:20 +0000 (17:59 +0200)
committer Vincent Vanwaelscappel <vincent@cubedesigners.com>
Thu, 7 Sep 2023 15:59:20 +0000 (17:59 +0200)
resources/tools/fwstk/.idea/workspace.xml
resources/tools/fwstk/bin/com/fluidbook/fwstk/Main.class
resources/tools/fwstk/bin/com/fluidbook/fwstk/TextsThread.class
resources/tools/fwstk/bin/com/fluidbook/fwstk/layout/LayoutStripper.class
resources/tools/fwstk/out/artifacts/fwstk_jar/fwstk.jar
resources/tools/fwstk/src/com/fluidbook/fwstk/Main.java
resources/tools/fwstk/src/com/fluidbook/fwstk/TextsThread.java
resources/tools/fwstk/src/com/fluidbook/fwstk/layout/LayoutStripper.java

index d20d359b4f7ba5a48b0325f6fb2969dec9a901e0..b9d43a9a7f56285d1b9ddb2f023f6360f68ab0f1 100644 (file)
@@ -9,11 +9,7 @@
     <option name="autoReloadType" value="SELECTIVE" />
   </component>
   <component name="ChangeListManager">
-    <list default="true" id="f146bc67-2578-4de3-9db2-94d2d43e9e83" name="Default" comment="wip #5410">
-      <change beforePath="$PROJECT_DIR$/../../../.idea/deployment.xml" beforeDir="false" afterPath="$PROJECT_DIR$/../../../.idea/deployment.xml" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/../../../.idea/misc.xml" beforeDir="false" afterPath="$PROJECT_DIR$/../../../.idea/misc.xml" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/../../../src/PDFTools.php" beforeDir="false" afterPath="$PROJECT_DIR$/../../../src/PDFTools.php" afterDir="false" />
-    </list>
+    <list default="true" id="f146bc67-2578-4de3-9db2-94d2d43e9e83" name="Default" comment="wip #5410" />
     <option name="SHOW_DIALOG" value="false" />
     <option name="HIGHLIGHT_CONFLICTS" value="true" />
     <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
     <configuration name="extract texts" type="Application" factoryName="Application">
       <option name="MAIN_CLASS_NAME" value="com.fluidbook.fwstk.Main" />
       <module name="fwstk" />
-      <option name="PROGRAM_PARAMETERS" value="--input C:\Users\vince\Desktop\original.pdf --extractTextsMethod pdfbox --extractTexts C:\Users\vince\Desktop\%s%d.txt --threads 1" />
+      <option name="PROGRAM_PARAMETERS" value="--input C:\Users\vince\Desktop\20929.pdf --mode robust --extractTextsMethod fluidbook --extractTexts C:\Users\vince\Desktop\20929\%s%d.txt --threads 1" />
       <method v="2">
         <option name="Make" enabled="true" />
       </method>
       <workItem from="1692895286414" duration="2259000" />
       <workItem from="1692974658841" duration="8000" />
       <workItem from="1692974700537" duration="688000" />
-      <workItem from="1694090487471" duration="2806000" />
+      <workItem from="1694090487471" duration="6051000" />
     </task>
     <task id="LOCAL-00001" summary="wip #1111 @0.5">
       <created>1487172253077</created>
index 15e5d0f251c32460cd8b77f4ef488c482b291d95..fbb40584d3f2888100541767723575c652ca0411 100644 (file)
Binary files a/resources/tools/fwstk/bin/com/fluidbook/fwstk/Main.class and b/resources/tools/fwstk/bin/com/fluidbook/fwstk/Main.class differ
index 14d5162de98c9ffaf0de161426102ec40fd29029..44713c5e1a20c1fa6ae1783ba7a23a16cfa1361f 100644 (file)
Binary files a/resources/tools/fwstk/bin/com/fluidbook/fwstk/TextsThread.class and b/resources/tools/fwstk/bin/com/fluidbook/fwstk/TextsThread.class differ
index 98e65a3182256a03fa685e372650588a282432a3..880a680e9eb09117bb649c60d31950541b20bdcc 100644 (file)
Binary files a/resources/tools/fwstk/bin/com/fluidbook/fwstk/layout/LayoutStripper.class and b/resources/tools/fwstk/bin/com/fluidbook/fwstk/layout/LayoutStripper.class differ
index 44662489275bb472ae21626b030eb5addcd5ffa4..d8f6b30517d336305f53d7e9d01d3cc5ef377f05 100644 (file)
Binary files a/resources/tools/fwstk/out/artifacts/fwstk_jar/fwstk.jar and b/resources/tools/fwstk/out/artifacts/fwstk_jar/fwstk.jar differ
index b9b5d96a87180e615d13694e08a551ddba5982eb..5513ebabc5ade128b44eb838ca4292c1d05d5d55 100644 (file)
@@ -53,9 +53,7 @@ public class Main {
     static Float linkOffsetX = 0.0f;
     static Float linkOffsetY = 0.0f;
 
-    public static void main(String[] args) throws IOException,
-            COSVisitorException, ClassNotFoundException, NullPointerException,
-            InterruptedException {
+    public static void main(String[] args) throws IOException, COSVisitorException, ClassNotFoundException, NullPointerException, InterruptedException {
 
         if (args.length < 0) {
             printUsage();
@@ -68,12 +66,13 @@ public class Main {
         String defined = "";
         String linksOutput = "";
         String cutmode = "";
-        Boolean infos = false;
+        boolean infos = false;
         String textsOutput = "";
         String imageOutput = "";
         String layoutOutput = "";
         String ignoredSeparators = "";
         String textsExtractionMethod = "pdfbox";
+        boolean robust = false;
         PDDocument doc = null;
         Integer[] pages = null;
 
@@ -92,8 +91,7 @@ public class Main {
                 } else if (args[i].trim().compareTo("-v") == 0) {
                     printVersion();
                     return;
-                } else if (args[i].trim().compareTo("-h") == 0
-                        || args[i].trim().compareTo("--help") == 0) {
+                } else if (args[i].trim().compareTo("-h") == 0 || args[i].trim().compareTo("--help") == 0) {
                     printUsage();
                     return;
                 } else if (args[i].trim().compareTo("--trim") == 0) {
@@ -128,7 +126,7 @@ public class Main {
                     imageOutput = args[i].trim();
                 } else if (args[i].trim().compareTo("--ignoreSeparators") == 0) {
                     i++;
-                    ignoredSeparators = args[i].trim().replace("{SPACE}"," ");
+                    ignoredSeparators = args[i].trim().replace("{SPACE}", " ");
                 } else if (args[i].trim().compareTo("--linkOffsetX") == 0) {
                     i++;
                     linkOffsetX = Float.parseFloat(args[i].trim());
@@ -138,6 +136,9 @@ public class Main {
                 } else if (args[i].trim().compareTo("--threads") == 0) {
                     i++;
                     threads = Integer.parseInt(args[i].trim());
+                } else if (args[i].trim().compareTo("--mode") == 0) {
+                    i++;
+                    robust = "robust".equals(args[i].trim()); // compare by value; == only tests reference identity and is false for a runtime argument
                 }
             }
 
@@ -188,14 +189,14 @@ public class Main {
                     doc = openDocument(input);
                 }
                 Main m = new Main();
-                m.extractTexts(doc, textsOutput, textsExtractionMethod, pages, ignoredSeparators, input);
+                m.extractTexts(doc, textsOutput, textsExtractionMethod, robust, pages, ignoredSeparators, input);
             }
             if (!"".equals(layoutOutput)) {
                 if (doc == null) {
                     doc = openDocument(input);
                 }
                 Main m = new Main();
-                m.extractLayout(doc, layoutOutput, ignoredSeparators);
+                m.extractLayout(doc, layoutOutput, robust, ignoredSeparators);
             }
             if (imageOutput.compareTo("") != 0) {
                 if (doc == null) {
@@ -227,7 +228,7 @@ public class Main {
         writer.writeImage(doc, "png", "", 1, doc.getNumberOfPages(), imageOutput);
     }
 
-    private void extractLayout(PDDocument doc, String layoutOutput, String ignoredSeparators) throws IOException {
+    private void extractLayout(PDDocument doc, String layoutOutput, boolean robust, String ignoredSeparators) throws IOException {
         LayoutStripper stripper = null;
 
         List pages = doc.getDocumentCatalog().getAllPages();
@@ -237,17 +238,14 @@ public class Main {
 
         while (pagesIter.hasNext()) {
             stripper = new LayoutStripper();
+            stripper.setRobust(robust);
             stripper.setIgnoredSeparators(ignoredSeparators);
             PDPage nextPage = (PDPage) pagesIter.next();
             i++;
             // For each page, one stripper, otherwise, there is bug with chars widths
             stripper.process(nextPage, i);
             Page layout = stripper.getLayout();
-            BufferedWriter out
-                    = new BufferedWriter(
-                    new OutputStreamWriter(
-                            new FileOutputStream(layoutOutput.replace("%d", ""
-                                    + i)), "UTF-8"));
+            BufferedWriter out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(layoutOutput.replace("%d", "" + i)), "UTF-8"));
             out.write(layout.asJSON());
             out.close();
         }
@@ -256,9 +254,7 @@ public class Main {
     private static Boolean getInfos(PDDocument doc) throws IOException, COSVisitorException {
         ArrayList<String> res = new ArrayList<>();
         // General informations
-        String[] fields = {"Author", "Title", "CreationDate", "Creator",
-                "Keywords", "ModificationDate", "Producer", "Subject",
-                "Trapped", "Version"};
+        String[] fields = {"Author", "Title", "CreationDate", "Creator", "Keywords", "ModificationDate", "Producer", "Subject", "Trapped", "Version"};
         PDDocumentInformation infos = doc.getDocumentInformation();
         for (String k : fields) {
             String v = infos.getCustomMetadataValue(k);
@@ -300,48 +296,19 @@ public class Main {
             }
 
             // Size
-            if (page.getRotation() != null
-                    && (page.getRotation() == 90 || page.getRotation() == 270)) {
-                res.add("Page " + numero + " size:\t\t"
-                        + Math.abs(cropBox.getHeight()) + " pts x "
-                        + Math.abs(cropBox.getWidth()) + " pts");
+            if (page.getRotation() != null && (page.getRotation() == 90 || page.getRotation() == 270)) {
+                res.add("Page " + numero + " size:\t\t" + Math.abs(cropBox.getHeight()) + " pts x " + Math.abs(cropBox.getWidth()) + " pts");
                 // Boxes
-                res.add("Page " + numero + " CropBox:\t"
-                        + cropBox.getLowerLeftY() + "\t"
-                        + cropBox.getUpperRightX() + "\t"
-                        + cropBox.getUpperRightY() + "\t"
-                        + cropBox.getLowerLeftX() + "\t");
-                res.add("Page " + numero + " MediaBox:\t"
-                        + mediaBox.getLowerLeftY() + "\t"
-                        + mediaBox.getUpperRightX() + "\t"
-                        + mediaBox.getUpperRightY() + "\t"
-                        + mediaBox.getLowerLeftX() + "\t");
-                res.add("Page " + numero + " TrimBox:\t"
-                        + trimBox.getLowerLeftY() + "\t"
-                        + trimBox.getUpperRightX() + "\t"
-                        + trimBox.getUpperRightY() + "\t"
-                        + trimBox.getLowerLeftX() + "\t");
+                res.add("Page " + numero + " CropBox:\t" + cropBox.getLowerLeftY() + "\t" + cropBox.getUpperRightX() + "\t" + cropBox.getUpperRightY() + "\t" + cropBox.getLowerLeftX() + "\t");
+                res.add("Page " + numero + " MediaBox:\t" + mediaBox.getLowerLeftY() + "\t" + mediaBox.getUpperRightX() + "\t" + mediaBox.getUpperRightY() + "\t" + mediaBox.getLowerLeftX() + "\t");
+                res.add("Page " + numero + " TrimBox:\t" + trimBox.getLowerLeftY() + "\t" + trimBox.getUpperRightX() + "\t" + trimBox.getUpperRightY() + "\t" + trimBox.getLowerLeftX() + "\t");
             } else {
 
-                res.add("Page " + numero + " size:\t\t"
-                        + Math.abs(cropBox.getWidth()) + " pts x "
-                        + Math.abs(cropBox.getHeight()) + " pts");
+                res.add("Page " + numero + " size:\t\t" + Math.abs(cropBox.getWidth()) + " pts x " + Math.abs(cropBox.getHeight()) + " pts");
                 // Boxes
-                res.add("Page " + numero + " CropBox:\t"
-                        + cropBox.getLowerLeftX() + "\t"
-                        + cropBox.getUpperRightY() + "\t"
-                        + cropBox.getUpperRightX() + "\t"
-                        + cropBox.getLowerLeftY() + "\t");
-                res.add("Page " + numero + " MediaBox:\t"
-                        + mediaBox.getLowerLeftX() + "\t"
-                        + mediaBox.getUpperRightY() + "\t"
-                        + mediaBox.getUpperRightX() + "\t"
-                        + mediaBox.getLowerLeftY() + "\t");
-                res.add("Page " + numero + " TrimBox:\t"
-                        + trimBox.getLowerLeftX() + "\t"
-                        + trimBox.getUpperRightY() + "\t"
-                        + trimBox.getUpperRightX() + "\t"
-                        + trimBox.getLowerLeftY() + "\t");
+                res.add("Page " + numero + " CropBox:\t" + cropBox.getLowerLeftX() + "\t" + cropBox.getUpperRightY() + "\t" + cropBox.getUpperRightX() + "\t" + cropBox.getLowerLeftY() + "\t");
+                res.add("Page " + numero + " MediaBox:\t" + mediaBox.getLowerLeftX() + "\t" + mediaBox.getUpperRightY() + "\t" + mediaBox.getUpperRightX() + "\t" + mediaBox.getLowerLeftY() + "\t");
+                res.add("Page " + numero + " TrimBox:\t" + trimBox.getLowerLeftX() + "\t" + trimBox.getUpperRightY() + "\t" + trimBox.getUpperRightX() + "\t" + trimBox.getLowerLeftY() + "\t");
             }
         }
 
@@ -364,8 +331,7 @@ public class Main {
         return changes;
     }
 
-    private static void addLabels(ArrayList<String> res, PDPageLabels labels,
-                                  int pages) {
+    private static void addLabels(ArrayList<String> res, PDPageLabels labels, int pages) {
 
         // Get the raw list
         String[] labelList = labels.getLabelsByPageIndices();
@@ -393,34 +359,28 @@ public class Main {
                 rangeRef = range;
             }
         }
-        res.add("NumberSectionsDelimiters:\t\t"
-                + delimiters.substring(0, delimiters.length() - 1));
+        res.add("NumberSectionsDelimiters:\t\t" + delimiters.substring(0, delimiters.length() - 1));
     }
 
-    private static void addBookmark(PDDocument doc, ArrayList<String> res,
-                                    PDOutlineNode bookmark, int level) throws IOException {
+    private static void addBookmark(PDDocument doc, ArrayList<String> res, PDOutlineNode bookmark, int level) throws IOException {
         PDOutlineItem current = bookmark.getFirstChild();
         while (current != null) {
             try {
 
                 res.add("BookmarkTitle:\t\t" + StringEscapeUtils.escapeHtml4(current.getTitle()));
                 res.add("BookmarkLevel:\t\t" + level);
-                res.add("BookmarkPage:\t\t"
-                        + getPageFromAction(doc, current.getAction()));
+                res.add("BookmarkPage:\t\t" + getPageFromAction(doc, current.getAction()));
                 addBookmark(doc, res, current, level + 1);
-            }catch (Exception e){
+            } catch (Exception e) {
 
             }
             current = current.getNextSibling();
         }
     }
 
-    private static void cutDocument(PDDocument doc, String input,
-                                    String output, String cutmode) throws COSVisitorException,
-            IOException {
+    private static void cutDocument(PDDocument doc, String input, String output, String cutmode) throws COSVisitorException, IOException {
 
-        System.out.println("Cut document of " + doc.getNumberOfPages()
-                + " with mode " + cutmode);
+        System.out.println("Cut document of " + doc.getNumberOfPages() + " with mode " + cutmode);
 
         ArrayList<PDDocument> copies = duplicatePages(doc, input, cutmode);
         cutPages(doc, cutmode);
@@ -470,17 +430,14 @@ public class Main {
             float decalage = ((float) decalageCrans) * w;
             newbox.move(decalage, 0f);
 
-            System.out.println("Set cropbox of page " + page + " from "
-                    + pdfPage.getCropBox() + " to " + newbox + " (offset : "
-                    + decalage + ")");
+            System.out.println("Set cropbox of page " + page + " from " + pdfPage.getCropBox() + " to " + newbox + " (offset : " + decalage + ")");
 
             pdfPage.setCropBox(newbox);
             pdfPage.setMediaBox(newbox);
         }
     }
 
-    private static ArrayList<PDDocument> duplicatePages(PDDocument doc,
-                                                        String input, String cutmode) throws IOException {
+    private static ArrayList<PDDocument> duplicatePages(PDDocument doc, String input, String cutmode) throws IOException {
 
         List<PDPage> pageList = doc.getDocumentCatalog().getAllPages();
         int originalSize = pageList.size();
@@ -525,8 +482,7 @@ public class Main {
                 System.out.println("Skip page " + page);
                 continue;
             }
-            System.out.println("Duplicate page " + page + " :: cursor is at "
-                    + cursor);
+            System.out.println("Duplicate page " + page + " :: cursor is at " + cursor);
             // Duplicate page
             for (int j = 0; j < duplicateTime; j++) {
                 List<PDPage> l = copies.get(j).getDocumentCatalog().getAllPages();
@@ -546,9 +502,7 @@ public class Main {
 
     }
 
-    private void extractTexts(PDDocument doc, String textsOutput, String method,
-                              Integer[] pages, String ignoredSeparators, String input) throws IOException, ClassNotFoundException,
-            NullPointerException, InterruptedException {
+    private void extractTexts(PDDocument doc, String textsOutput, String method, boolean robust, Integer[] pages, String ignoredSeparators, String input) throws IOException, ClassNotFoundException, NullPointerException, InterruptedException {
 
         long s = Calendar.getInstance().getTimeInMillis();
 
@@ -557,8 +511,7 @@ public class Main {
         PDDocument[] tdoc = new PDDocument[threads];
         PDDocument d;
 
-        int totalThreads = Math.max(1,
-                Math.min(Math.round(pages.length / 50.0f), Main.threads));
+        int totalThreads = Math.max(1, Math.min(Math.round(pages.length / 50.0f), Main.threads));
 
         System.out.println("Total threads " + totalThreads);
 
@@ -569,7 +522,7 @@ public class Main {
                 d = doc;
             }
 
-            TextsThread t = new TextsThread(d, 0.5f, textsOutput, method, pages, i, totalThreads, ignoredSeparators, input);
+            TextsThread t = new TextsThread(d, 0.5f, textsOutput, method, robust, pages, i, totalThreads, ignoredSeparators, input);
             t.setPriority(Thread.MIN_PRIORITY);
             t.start();
             tlist[i] = t;
@@ -583,14 +536,10 @@ public class Main {
             }
         }
 
-        System.out.println("Extraction des textes with " + method + " : "
-                + ((Calendar.getInstance().getTimeInMillis() - s) / 1000)
-                + "s");
+        System.out.println("Extraction des textes with " + method + " : " + ((Calendar.getInstance().getTimeInMillis() - s) / 1000) + "s");
     }
 
-    public static void updateCropBox(PDDocument doc, String output,
-                                     String refbox, Integer[] pages, String defined) throws IOException,
-            COSVisitorException {
+    public static void updateCropBox(PDDocument doc, String output, String refbox, Integer[] pages, String defined) throws IOException, COSVisitorException {
         System.out.println("updateCropBox");
         if (!"".equals(defined)) {
             updateCropBoxDefined(doc, defined);
@@ -602,8 +551,7 @@ public class Main {
         return;
     }
 
-    private static void updateCropBoxDefined(PDDocument doc, String defined)
-            throws IOException, COSVisitorException {
+    private static void updateCropBoxDefined(PDDocument doc, String defined) throws IOException, COSVisitorException {
         String[] e = defined.split("*");
         for (int i = 0; i < e.length; i++) {
             String[] e1 = e[i].split(",");
@@ -650,8 +598,7 @@ public class Main {
         return box;
     }
 
-    public static void extractLinks(PDDocument doc, String linksOutput,
-                                    Integer[] pages) throws IOException {
+    public static void extractLinks(PDDocument doc, String linksOutput, Integer[] pages) throws IOException {
 
         List<PDPage> pageList = doc.getDocumentCatalog().getAllPages();
 
@@ -672,16 +619,14 @@ public class Main {
         }
     }
 
-    private static ArrayList<Link> extractLinksOfPage(PDDocument doc, int pageNumber, PDPage p,
-                                                      String file) throws IOException {
+    private static ArrayList<Link> extractLinksOfPage(PDDocument doc, int pageNumber, PDPage p, String file) throws IOException {
         ArrayList<Link> listLinks = extractLinksOfPage(doc, pageNumber, p);
         saveLinks(file.replaceFirst("%d", "" + pageNumber), listLinks);
 
         return listLinks;
     }
 
-    public static void saveLinks(String file, ArrayList<Link> listLinks)
-            throws IOException {
+    public static void saveLinks(String file, ArrayList<Link> listLinks) throws IOException {
         FileIO out = new FileIO(file);
         out.open("w");
         out.output.writeBytes(Link.header());
@@ -692,7 +637,7 @@ public class Main {
     }
 
     public static String getPageFromAction(PDDocument doc, PDAction a) throws IOException {
-        if(a instanceof PDActionRemoteGoTo){
+        if (a instanceof PDActionRemoteGoTo) {
             return "-1";
         }
         PDActionGoTo aa = (PDActionGoTo) a;
@@ -754,8 +699,7 @@ public class Main {
         return "-1";
     }
 
-    public static ArrayList<Link> extractLinksOfPage(PDDocument doc, int pageNumber, PDPage p)
-            throws IOException {
+    public static ArrayList<Link> extractLinksOfPage(PDDocument doc, int pageNumber, PDPage p) throws IOException {
         System.out.println(pageNumber);
         ArrayList<Link> listLinks = new ArrayList<>();
         Link myLink;
@@ -813,8 +757,7 @@ public class Main {
 
                 System.out.println(link.getRectangle().getHeight());
                 myLink.rect = link.getRectangle();
-                if (myLink.rect.getWidth() == 0.0
-                        || myLink.rect.getHeight() == 0.0) {
+                if (myLink.rect.getWidth() == 0.0 || myLink.rect.getHeight() == 0.0) {
                     System.out.println("Skip link :: surface == 0");
                     continue;
                 }
index 6543836c645acf2131fc0159a8c9bd95853c0f1e..3eefe219b75766663483757b434d71213dbea51f 100644 (file)
@@ -15,251 +15,255 @@ import org.apache.pdfbox.pdmodel.PDPage;
 
 public class TextsThread extends Thread {
 
-       protected PDDocument doc;
-       protected int threadIndex;
-       protected String textsOutput;
-       protected Integer[] pages;
-       protected int totalThreads;
-       protected float tolerance;
-       protected String ignoredSeparators = "";
-       protected String docURL;
-       protected String[] trimchars;
-       protected String method;
-
-       TextsThread(PDDocument doc, float tolerance, String textsOutput, String method,
-                   Integer[] pages, int index, int totalThreads, String ignoredSeparators, String docURL) {
-               this.doc = doc;
-               this.threadIndex = index;
-               this.textsOutput = textsOutput;
-               this.pages = pages;
-               this.totalThreads = totalThreads;
-               this.tolerance = tolerance;
-               this.ignoredSeparators = ignoredSeparators;
-               this.docURL = docURL;
-               this.method = method;
-
-               trimchars = new String[1];
-               trimchars[0] = " ";
-
-       }
-
-       @Override
-       public void run() {
-               String index;
-               String pindex;
-
-               String memoHTML = "";
-               String html = "";
-               String phtml = "";
-
-               String text = "";
-               String ptext = "";
-
-               Boolean sortByPosition = false;
-               Boolean separateByBeads = false;
-               Boolean suppressDuplicate = true;
-
-               for (Integer i : pages) {
-                       if (i % totalThreads != threadIndex) {
-                               continue;
-                       }
-
-                       System.out.println("Parsing page " + i+" with "+method);
-                       String file = textsOutput.replaceFirst("%d", "" + i);
-
-                       // Poppler
-                       BufferedWriter out;
-
-                       if ("poppler".equals(method)) {
-                               try {
-                                       Process proc;
-                                       proc = Runtime.getRuntime().exec("pdftotext -f " + i + " -l " + i + " -enc UTF-8 -eol unix -nopgbrk " + this.docURL + " - ");
-
-                                       InputStream output = proc.getInputStream();
-                                       StringWriter writer = new StringWriter();
-                                       IOUtils.copy(output, writer, "UTF-8");
-                                       phtml = writer.toString();
-                                       phtml = cleanhtml(phtml);
-
-                                       ptext = html2text(phtml);
-                                       pindex = text2index(ptext);
-                                       ptext = StringUtil.removeAccents(ptext);
-
-                                       String ppfile = file.replaceFirst("%s", "pp");
-                                       String pifile = file.replaceFirst("%s", "pi");
-                                       String phfile = file.replaceFirst("%s", "ph");
-
-                                       try {
-                                               out = new BufferedWriter(new OutputStreamWriter(
-                                                               new FileOutputStream(ppfile), "UTF8"));
-                                               out.write(ptext);
-                                               out.close();
-
-                                               out = new BufferedWriter(new OutputStreamWriter(
-                                                               new FileOutputStream(pifile), "UTF8"));
-                                               out.write(pindex);
-                                               out.close();
-
-                                               out = new BufferedWriter(new OutputStreamWriter(
-                                                               new FileOutputStream(phfile), "UTF8"));
-                                               out.write(phtml);
-                                               out.close();
-                                       } catch (UnsupportedEncodingException e) {
-                                               e.printStackTrace();
-                                       } catch (FileNotFoundException e) {
-                                               e.printStackTrace();
-                                       } catch (IOException e) {
-                                               e.printStackTrace();
-                                       }
-
-                                       phtml = "";
-                                       pindex = "";
-                                       ptext = "";
-                                       html = "";
-                                       text = "";
-                                       index = "";
-
-                               } catch (IOException ex) {
-                                       ex.printStackTrace();
-                               }
-                       } else if ("pdfbox".equals(method)) {
-
-                               /// PDFBOX
-
-                               try {
-                                       CustomStripper htmlStripper;
-                                       htmlStripper = new CustomStripper("UTF-8");
-
-                                       htmlStripper.setSortByPosition(sortByPosition);
-                                       htmlStripper.setShouldSeparateByBeads(separateByBeads);
-                                       htmlStripper.setAverageCharTolerance(tolerance);
-                                       htmlStripper.setSpacingTolerance(tolerance);
-                                       htmlStripper.setStartPage(i);
-                                       htmlStripper.setEndPage(i);
-                                       htmlStripper.setSuppressDuplicateOverlappingText(suppressDuplicate);
-                                       html = htmlStripper.getHTML(doc);
-                                       html = cleanhtml(html);
-
-                                       text = htmlStripper.getText(doc);
-                                       text = html2text(text);
-                                       index = text2index(text);
-
-                                       text = StringUtil.removeAccents(text);
-                                       String pfile = file.replaceFirst("%s", "p");
-                                       String ifile = file.replaceFirst("%s", "i");
-                                       String hfile = file.replaceFirst("%s", "h");
-
-                                       try {
-                                               out = new BufferedWriter(new OutputStreamWriter(
-                                                               new FileOutputStream(pfile), "UTF8"));
-                                               out.write(text);
-                                               out.close();
-
-                                               out = new BufferedWriter(new OutputStreamWriter(
-                                                               new FileOutputStream(ifile), "UTF8"));
-                                               out.write(index);
-                                               out.close();
-
-                                               out = new BufferedWriter(new OutputStreamWriter(
-                                                               new FileOutputStream(hfile), "UTF8"));
-                                               out.write(html);
-                                               out.close();
-                                       } catch (UnsupportedEncodingException e) {
-
-                                               e.printStackTrace();
-                                       } catch (FileNotFoundException e) {
-                                               e.printStackTrace();
-                                       } catch (IOException e) {
-                                               e.printStackTrace();
-                                       }
-                               } catch (IOException ex) {
-                                       ex.printStackTrace();
-                               }
-
-                               memoHTML = html;
-                       } else if ("fluidbook".equals(method)) {
-                               // Fluidbook
-                               try {
-                                       LayoutStripper layoutStripper;
-                                       layoutStripper = new LayoutStripper();
-                                       layoutStripper.setIgnoredSeparators(ignoredSeparators);
-                                       layoutStripper.process((PDPage) doc.getDocumentCatalog().getAllPages().get(i - 1), i);
-
-                                       String fbhtml = layoutStripper.getLayout().asHTML();
-                                       String fbtext = layoutStripper.getLayout().asText();
-
-                                       String fpfile = file.replaceFirst("%s", "fp");
-                                       String fifile = file.replaceFirst("%s", "fi");
-                                       String fhfile = file.replaceFirst("%s", "fh");
-
-                                       fbtext = html2text(fbtext);
-
-                                       String fbindex = text2index(fbtext);
-
-                                       try {
-                                               out = new BufferedWriter(new OutputStreamWriter(
-                                                               new FileOutputStream(fpfile), "UTF8"));
-                                               out.write(fbtext);
-                                               out.close();
-                                               out = new BufferedWriter(new OutputStreamWriter(
-                                                               new FileOutputStream(fifile), "UTF8"));
-                                               out.write(fbindex);
-                                               out.close();
-                                               out = new BufferedWriter(new OutputStreamWriter(
-                                                               new FileOutputStream(fhfile), "UTF8"));
-                                               out.write(fbhtml);
-                                               out.close();
-                                       } catch (UnsupportedEncodingException e) {
-                                               e.printStackTrace();
-                                       } catch (FileNotFoundException e) {
-                                               e.printStackTrace();
-                                       }
-                               } catch (Exception ex) {
-                                       ex.printStackTrace();
-                               }
-                       }
-               }
-       }
-
-       protected String html2text(String text) {
-               text = StringUtil.separateLigatures(text);
-               text = text.toLowerCase();
-               text = StringUtil.removeControl(text);
-               text = StringUtil.removePoints(text, ignoredSeparators);
-               text = StringUtil.condenseWhite(text);
-               text = StringUtil.trim(text, trimchars);
-               return text;
-       }
-
-       protected String cleanhtml(String html) {
-               return StringUtil.normalizeWhite(html);
-       }
-
-       protected String text2index(String text) {
-               TreeMap<String, Word> index = new TreeMap<>();
-
-               String[] words = text.split(" ");
-               for (String word : words) {
-                       String woa = StringUtil.removeAccents(word);
-                       if (index.containsKey(woa)) {
-                               index.get(woa).addWord(word);
-                       } else {
-                               Word w = new Word();
-                               w.addWord(word);
-                               index.put(woa, w);
-                       }
-               }
-               String indexString = "";
-
-               for (Entry<String, Word> e : index.entrySet()) {
-                       indexString += e.getKey() + "," + e.getValue().toString()
-                                       + "\n";
-               }
-               if (indexString.length() > 0) {
-                       indexString = indexString.substring(0,
-                                       indexString.length() - 1);
-               }
-
-               return indexString;
-       }
+    protected PDDocument doc;
+    protected int threadIndex;
+    protected String textsOutput;
+    protected Integer[] pages;
+    protected int totalThreads;
+    protected float tolerance;
+    protected String ignoredSeparators = "";
+    protected String docURL;
+    protected String[] trimchars;
+    protected String method;
+
+    protected boolean robust;
+
+    TextsThread(PDDocument doc, float tolerance, String textsOutput, String method, boolean robust,
+                Integer[] pages, int index, int totalThreads, String ignoredSeparators, String docURL) {
+        this.doc = doc;
+        this.threadIndex = index;
+        this.textsOutput = textsOutput;
+        this.pages = pages;
+        this.totalThreads = totalThreads;
+        this.tolerance = tolerance;
+        this.ignoredSeparators = ignoredSeparators;
+        this.docURL = docURL;
+        this.method = method;
+        this.robust = robust;
+
+        trimchars = new String[1];
+        trimchars[0] = " ";
+
+    }
+
+    @Override
+    public void run() {
+        String index;
+        String pindex;
+
+        String memoHTML = "";
+        String html = "";
+        String phtml = "";
+
+        String text = "";
+        String ptext = "";
+
+        Boolean sortByPosition = false;
+        Boolean separateByBeads = false;
+        Boolean suppressDuplicate = true;
+
+        for (Integer i : pages) {
+            if (i % totalThreads != threadIndex) {
+                continue;
+            }
+
+            System.out.println("Parsing page " + i + " with " + method);
+            String file = textsOutput.replaceFirst("%d", "" + i);
+
+            // Poppler
+            BufferedWriter out;
+
+            if ("poppler".equals(method)) {
+                try {
+                    Process proc;
+                    proc = Runtime.getRuntime().exec("pdftotext -f " + i + " -l " + i + " -enc UTF-8 -eol unix -nopgbrk " + this.docURL + " - ");
+
+                    InputStream output = proc.getInputStream();
+                    StringWriter writer = new StringWriter();
+                    IOUtils.copy(output, writer, "UTF-8");
+                    phtml = writer.toString();
+                    phtml = cleanhtml(phtml);
+
+                    ptext = html2text(phtml);
+                    pindex = text2index(ptext);
+                    ptext = StringUtil.removeAccents(ptext);
+
+                    String ppfile = file.replaceFirst("%s", "pp");
+                    String pifile = file.replaceFirst("%s", "pi");
+                    String phfile = file.replaceFirst("%s", "ph");
+
+                    try {
+                        out = new BufferedWriter(new OutputStreamWriter(
+                                new FileOutputStream(ppfile), "UTF8"));
+                        out.write(ptext);
+                        out.close();
+
+                        out = new BufferedWriter(new OutputStreamWriter(
+                                new FileOutputStream(pifile), "UTF8"));
+                        out.write(pindex);
+                        out.close();
+
+                        out = new BufferedWriter(new OutputStreamWriter(
+                                new FileOutputStream(phfile), "UTF8"));
+                        out.write(phtml);
+                        out.close();
+                    } catch (UnsupportedEncodingException e) {
+                        e.printStackTrace();
+                    } catch (FileNotFoundException e) {
+                        e.printStackTrace();
+                    } catch (IOException e) {
+                        e.printStackTrace();
+                    }
+
+                    phtml = "";
+                    pindex = "";
+                    ptext = "";
+                    html = "";
+                    text = "";
+                    index = "";
+
+                } catch (IOException ex) {
+                    ex.printStackTrace();
+                }
+            } else if ("pdfbox".equals(method)) {
+
+                /// PDFBOX
+
+                try {
+                    CustomStripper htmlStripper;
+                    htmlStripper = new CustomStripper("UTF-8");
+
+                    htmlStripper.setSortByPosition(sortByPosition);
+                    htmlStripper.setShouldSeparateByBeads(separateByBeads);
+                    htmlStripper.setAverageCharTolerance(tolerance);
+                    htmlStripper.setSpacingTolerance(tolerance);
+                    htmlStripper.setStartPage(i);
+                    htmlStripper.setEndPage(i);
+                    htmlStripper.setSuppressDuplicateOverlappingText(suppressDuplicate);
+                    html = htmlStripper.getHTML(doc);
+                    html = cleanhtml(html);
+
+                    text = htmlStripper.getText(doc);
+                    text = html2text(text);
+                    index = text2index(text);
+
+                    text = StringUtil.removeAccents(text);
+                    String pfile = file.replaceFirst("%s", "p");
+                    String ifile = file.replaceFirst("%s", "i");
+                    String hfile = file.replaceFirst("%s", "h");
+
+                    try {
+                        out = new BufferedWriter(new OutputStreamWriter(
+                                new FileOutputStream(pfile), "UTF8"));
+                        out.write(text);
+                        out.close();
+
+                        out = new BufferedWriter(new OutputStreamWriter(
+                                new FileOutputStream(ifile), "UTF8"));
+                        out.write(index);
+                        out.close();
+
+                        out = new BufferedWriter(new OutputStreamWriter(
+                                new FileOutputStream(hfile), "UTF8"));
+                        out.write(html);
+                        out.close();
+                    } catch (UnsupportedEncodingException e) {
+
+                        e.printStackTrace();
+                    } catch (FileNotFoundException e) {
+                        e.printStackTrace();
+                    } catch (IOException e) {
+                        e.printStackTrace();
+                    }
+                } catch (IOException ex) {
+                    ex.printStackTrace();
+                }
+
+                memoHTML = html;
+            } else if ("fluidbook".equals(method)) {
+                // Fluidbook
+                try {
+                    LayoutStripper layoutStripper;
+                    layoutStripper = new LayoutStripper();
+                    layoutStripper.setRobust(robust);
+                    layoutStripper.setIgnoredSeparators(ignoredSeparators);
+                    layoutStripper.process((PDPage) doc.getDocumentCatalog().getAllPages().get(i - 1), i);
+
+                    String fbhtml = layoutStripper.getLayout().asHTML();
+                    String fbtext = layoutStripper.getLayout().asText();
+
+                    String fpfile = file.replaceFirst("%s", "fp");
+                    String fifile = file.replaceFirst("%s", "fi");
+                    String fhfile = file.replaceFirst("%s", "fh");
+
+                    fbtext = html2text(fbtext);
+
+                    String fbindex = text2index(fbtext);
+
+                    try {
+                        out = new BufferedWriter(new OutputStreamWriter(
+                                new FileOutputStream(fpfile), "UTF8"));
+                        out.write(fbtext);
+                        out.close();
+                        out = new BufferedWriter(new OutputStreamWriter(
+                                new FileOutputStream(fifile), "UTF8"));
+                        out.write(fbindex);
+                        out.close();
+                        out = new BufferedWriter(new OutputStreamWriter(
+                                new FileOutputStream(fhfile), "UTF8"));
+                        out.write(fbhtml);
+                        out.close();
+                    } catch (UnsupportedEncodingException e) {
+                        e.printStackTrace();
+                    } catch (FileNotFoundException e) {
+                        e.printStackTrace();
+                    }
+                } catch (Exception ex) {
+                    ex.printStackTrace();
+                }
+            }
+        }
+    }
+
+    protected String html2text(String text) {
+        text = StringUtil.separateLigatures(text);
+        text = text.toLowerCase();
+        text = StringUtil.removeControl(text);
+        text = StringUtil.removePoints(text, ignoredSeparators);
+        text = StringUtil.condenseWhite(text);
+        text = StringUtil.trim(text, trimchars);
+        return text;
+    }
+
+    protected String cleanhtml(String html) {
+        return StringUtil.normalizeWhite(html);
+    }
+
+    protected String text2index(String text) {
+        TreeMap<String, Word> index = new TreeMap<>();
+
+        String[] words = text.split(" ");
+        for (String word : words) {
+            String woa = StringUtil.removeAccents(word);
+            if (index.containsKey(woa)) {
+                index.get(woa).addWord(word);
+            } else {
+                Word w = new Word();
+                w.addWord(word);
+                index.put(woa, w);
+            }
+        }
+        String indexString = "";
+
+        for (Entry<String, Word> e : index.entrySet()) {
+            indexString += e.getKey() + "," + e.getValue().toString()
+                    + "\n";
+        }
+        if (indexString.length() > 0) {
+            indexString = indexString.substring(0,
+                    indexString.length() - 1);
+        }
+
+        return indexString;
+    }
 }
index 73a1d152e775bf48b05cd59545d298283113b7a5..d9b5ad69c7857ff69f1407a38f12a5521fe82a5d 100644 (file)
@@ -18,6 +18,7 @@ public class LayoutStripper extends PDFStreamEngine {
 
     protected PDPage currentPage;
     protected String ignoredSeparators;
+    protected boolean robust = false;
     public Page layout;
 
     public LayoutStripper() throws IOException {
@@ -73,12 +74,16 @@ public class LayoutStripper extends PDFStreamEngine {
 
     protected String normalizeSpace(String c) {
         int code = c.codePointAt(0);
-        if (code == 8201 || code <= 8 ) {
+        if (code == 8201 || code <= 8) {
             return " ";
         }
         return c;
     }
 
+    public void setRobust(boolean robust) {
+        this.robust = robust;
+    }
+
     public Page getLayout() {
         return layout;
     }