wip #6188 @0.25

author Vincent Vanwaelscappel <vincent@cubedesigners.com>

Thu, 7 Sep 2023 15:59:20 +0000 (17:59 +0200)

committer Vincent Vanwaelscappel <vincent@cubedesigners.com>

Thu, 7 Sep 2023 15:59:20 +0000 (17:59 +0200)
author Vincent Vanwaelscappel <vincent@cubedesigners.com>
Thu, 7 Sep 2023 15:59:20 +0000 (17:59 +0200)
committer Vincent Vanwaelscappel <vincent@cubedesigners.com>
Thu, 7 Sep 2023 15:59:20 +0000 (17:59 +0200)
diff --git a/resources/tools/fwstk/.idea/workspace.xml b/resources/tools/fwstk/.idea/workspace.xml

index d20d359b4f7ba5a48b0325f6fb2969dec9a901e0..b9d43a9a7f56285d1b9ddb2f023f6360f68ab0f1 100644 (file)
--- a/resources/tools/fwstk/.idea/workspace.xml
+++ b/resources/tools/fwstk/.idea/workspace.xml
@@ -9,11 +9,7 @@
      <option name="autoReloadType" value="SELECTIVE" />
    </component>
    <component name="ChangeListManager">
-    <list default="true" id="f146bc67-2578-4de3-9db2-94d2d43e9e83" name="Default" comment="wip #5410">
-      <change beforePath="$PROJECT_DIR$/../../../.idea/deployment.xml" beforeDir="false" afterPath="$PROJECT_DIR$/../../../.idea/deployment.xml" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/../../../.idea/misc.xml" beforeDir="false" afterPath="$PROJECT_DIR$/../../../.idea/misc.xml" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/../../../src/PDFTools.php" beforeDir="false" afterPath="$PROJECT_DIR$/../../../src/PDFTools.php" afterDir="false" />
-    </list>
+    <list default="true" id="f146bc67-2578-4de3-9db2-94d2d43e9e83" name="Default" comment="wip #5410" />
      <option name="SHOW_DIALOG" value="false" />
      <option name="HIGHLIGHT_CONFLICTS" value="true" />
      <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
@@ -391,7 +387,7 @@
      <configuration name="extract texts" type="Application" factoryName="Application">
        <option name="MAIN_CLASS_NAME" value="com.fluidbook.fwstk.Main" />
        <module name="fwstk" />
-      <option name="PROGRAM_PARAMETERS" value="--input C:\Users\vince\Desktop\original.pdf --extractTextsMethod pdfbox --extractTexts C:\Users\vince\Desktop\%s%d.txt --threads 1" />
+      <option name="PROGRAM_PARAMETERS" value="--input C:\Users\vince\Desktop\20929.pdf --mode robust --extractTextsMethod fluidbook --extractTexts C:\Users\vince\Desktop\20929\%s%d.txt --threads 1" />
        <method v="2">
          <option name="Make" enabled="true" />
        </method>
@@ -489,7 +485,7 @@
        <workItem from="1692895286414" duration="2259000" />
        <workItem from="1692974658841" duration="8000" />
        <workItem from="1692974700537" duration="688000" />
-      <workItem from="1694090487471" duration="2806000" />
+      <workItem from="1694090487471" duration="6051000" />
      </task>
      <task id="LOCAL-00001" summary="wip #1111 @0.5">
        <created>1487172253077</created>
diff --git a/resources/tools/fwstk/bin/com/fluidbook/fwstk/Main.class b/resources/tools/fwstk/bin/com/fluidbook/fwstk/Main.class

index 15e5d0f251c32460cd8b77f4ef488c482b291d95..fbb40584d3f2888100541767723575c652ca0411 100644 (file)

Binary files a/resources/tools/fwstk/bin/com/fluidbook/fwstk/Main.class and b/resources/tools/fwstk/bin/com/fluidbook/fwstk/Main.class differ
diff --git a/resources/tools/fwstk/bin/com/fluidbook/fwstk/TextsThread.class b/resources/tools/fwstk/bin/com/fluidbook/fwstk/TextsThread.class

index 14d5162de98c9ffaf0de161426102ec40fd29029..44713c5e1a20c1fa6ae1783ba7a23a16cfa1361f 100644 (file)

Binary files a/resources/tools/fwstk/bin/com/fluidbook/fwstk/TextsThread.class and b/resources/tools/fwstk/bin/com/fluidbook/fwstk/TextsThread.class differ
diff --git a/resources/tools/fwstk/bin/com/fluidbook/fwstk/layout/LayoutStripper.class b/resources/tools/fwstk/bin/com/fluidbook/fwstk/layout/LayoutStripper.class

index 98e65a3182256a03fa685e372650588a282432a3..880a680e9eb09117bb649c60d31950541b20bdcc 100644 (file)

Binary files a/resources/tools/fwstk/bin/com/fluidbook/fwstk/layout/LayoutStripper.class and b/resources/tools/fwstk/bin/com/fluidbook/fwstk/layout/LayoutStripper.class differ
diff --git a/resources/tools/fwstk/out/artifacts/fwstk_jar/fwstk.jar b/resources/tools/fwstk/out/artifacts/fwstk_jar/fwstk.jar

index 44662489275bb472ae21626b030eb5addcd5ffa4..d8f6b30517d336305f53d7e9d01d3cc5ef377f05 100644 (file)

Binary files a/resources/tools/fwstk/out/artifacts/fwstk_jar/fwstk.jar and b/resources/tools/fwstk/out/artifacts/fwstk_jar/fwstk.jar differ
diff --git a/resources/tools/fwstk/src/com/fluidbook/fwstk/Main.java b/resources/tools/fwstk/src/com/fluidbook/fwstk/Main.java

index b9b5d96a87180e615d13694e08a551ddba5982eb..5513ebabc5ade128b44eb838ca4292c1d05d5d55 100644 (file)
--- a/resources/tools/fwstk/src/com/fluidbook/fwstk/Main.java
+++ b/resources/tools/fwstk/src/com/fluidbook/fwstk/Main.java
@@ -53,9 +53,7 @@ public class Main {
      static Float linkOffsetX = 0.0f;
      static Float linkOffsetY = 0.0f;
  
-    public static void main(String[] args) throws IOException,
-            COSVisitorException, ClassNotFoundException, NullPointerException,
-            InterruptedException {
+    public static void main(String[] args) throws IOException, COSVisitorException, ClassNotFoundException, NullPointerException, InterruptedException {
  
          if (args.length < 0) {
              printUsage();
@@ -68,12 +66,13 @@ public class Main {
          String defined = "";
          String linksOutput = "";
          String cutmode = "";
-        Boolean infos = false;
+        boolean infos = false;
          String textsOutput = "";
          String imageOutput = "";
          String layoutOutput = "";
          String ignoredSeparators = "";
          String textsExtractionMethod = "pdfbox";
+        boolean robust = false;
          PDDocument doc = null;
          Integer[] pages = null;
  
@@ -92,8 +91,7 @@ public class Main {
                  } else if (args[i].trim().compareTo("-v") == 0) {
                      printVersion();
                      return;
-                } else if (args[i].trim().compareTo("-h") == 0
-                        || args[i].trim().compareTo("--help") == 0) {
+                } else if (args[i].trim().compareTo("-h") == 0 || args[i].trim().compareTo("--help") == 0) {
                      printUsage();
                      return;
                  } else if (args[i].trim().compareTo("--trim") == 0) {
@@ -128,7 +126,7 @@ public class Main {
                      imageOutput = args[i].trim();
                  } else if (args[i].trim().compareTo("--ignoreSeparators") == 0) {
                      i++;
-                    ignoredSeparators = args[i].trim().replace("{SPACE}"," ");
+                    ignoredSeparators = args[i].trim().replace("{SPACE}", " ");
                  } else if (args[i].trim().compareTo("--linkOffsetX") == 0) {
                      i++;
                      linkOffsetX = Float.parseFloat(args[i].trim());
@@ -138,6 +136,9 @@ public class Main {
                  } else if (args[i].trim().compareTo("--threads") == 0) {
                      i++;
                      threads = Integer.parseInt(args[i].trim());
+                } else if (args[i].trim().compareTo("--mode") == 0) {
+                    i++;
+                    robust = "robust".equals(args[i].trim()); // compare by value: == on String tests reference identity and is false for CLI args
                  }
              }
  
@@ -188,14 +189,14 @@ public class Main {
                      doc = openDocument(input);
                  }
                  Main m = new Main();
-                m.extractTexts(doc, textsOutput, textsExtractionMethod, pages, ignoredSeparators, input);
+                m.extractTexts(doc, textsOutput, textsExtractionMethod, robust, pages, ignoredSeparators, input);
              }
              if (!"".equals(layoutOutput)) {
                  if (doc == null) {
                      doc = openDocument(input);
                  }
                  Main m = new Main();
-                m.extractLayout(doc, layoutOutput, ignoredSeparators);
+                m.extractLayout(doc, layoutOutput, robust, ignoredSeparators);
              }
              if (imageOutput.compareTo("") != 0) {
                  if (doc == null) {
@@ -227,7 +228,7 @@ public class Main {
          writer.writeImage(doc, "png", "", 1, doc.getNumberOfPages(), imageOutput);
      }
  
-    private void extractLayout(PDDocument doc, String layoutOutput, String ignoredSeparators) throws IOException {
+    private void extractLayout(PDDocument doc, String layoutOutput, boolean robust, String ignoredSeparators) throws IOException {
          LayoutStripper stripper = null;
  
          List pages = doc.getDocumentCatalog().getAllPages();
@@ -237,17 +238,14 @@ public class Main {
  
          while (pagesIter.hasNext()) {
              stripper = new LayoutStripper();
+            stripper.setRobust(robust);
              stripper.setIgnoredSeparators(ignoredSeparators);
              PDPage nextPage = (PDPage) pagesIter.next();
              i++;
              // For each page, one stripper, otherwise, there is bug with chars widths
              stripper.process(nextPage, i);
              Page layout = stripper.getLayout();
-            BufferedWriter out
-                    = new BufferedWriter(
-                    new OutputStreamWriter(
-                            new FileOutputStream(layoutOutput.replace("%d", ""
-                                    + i)), "UTF-8"));
+            BufferedWriter out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(layoutOutput.replace("%d", "" + i)), "UTF-8"));
              out.write(layout.asJSON());
              out.close();
          }
@@ -256,9 +254,7 @@ public class Main {
      private static Boolean getInfos(PDDocument doc) throws IOException, COSVisitorException {
          ArrayList<String> res = new ArrayList<>();
          // General informations
-        String[] fields = {"Author", "Title", "CreationDate", "Creator",
-                "Keywords", "ModificationDate", "Producer", "Subject",
-                "Trapped", "Version"};
+        String[] fields = {"Author", "Title", "CreationDate", "Creator", "Keywords", "ModificationDate", "Producer", "Subject", "Trapped", "Version"};
          PDDocumentInformation infos = doc.getDocumentInformation();
          for (String k : fields) {
              String v = infos.getCustomMetadataValue(k);
@@ -300,48 +296,19 @@ public class Main {
              }
  
              // Size
-            if (page.getRotation() != null
-                    && (page.getRotation() == 90 || page.getRotation() == 270)) {
-                res.add("Page " + numero + " size:\t\t"
-                        + Math.abs(cropBox.getHeight()) + " pts x "
-                        + Math.abs(cropBox.getWidth()) + " pts");
+            if (page.getRotation() != null && (page.getRotation() == 90 || page.getRotation() == 270)) {
+                res.add("Page " + numero + " size:\t\t" + Math.abs(cropBox.getHeight()) + " pts x " + Math.abs(cropBox.getWidth()) + " pts");
                  // Boxes
-                res.add("Page " + numero + " CropBox:\t"
-                        + cropBox.getLowerLeftY() + "\t"
-                        + cropBox.getUpperRightX() + "\t"
-                        + cropBox.getUpperRightY() + "\t"
-                        + cropBox.getLowerLeftX() + "\t");
-                res.add("Page " + numero + " MediaBox:\t"
-                        + mediaBox.getLowerLeftY() + "\t"
-                        + mediaBox.getUpperRightX() + "\t"
-                        + mediaBox.getUpperRightY() + "\t"
-                        + mediaBox.getLowerLeftX() + "\t");
-                res.add("Page " + numero + " TrimBox:\t"
-                        + trimBox.getLowerLeftY() + "\t"
-                        + trimBox.getUpperRightX() + "\t"
-                        + trimBox.getUpperRightY() + "\t"
-                        + trimBox.getLowerLeftX() + "\t");
+                res.add("Page " + numero + " CropBox:\t" + cropBox.getLowerLeftY() + "\t" + cropBox.getUpperRightX() + "\t" + cropBox.getUpperRightY() + "\t" + cropBox.getLowerLeftX() + "\t");
+                res.add("Page " + numero + " MediaBox:\t" + mediaBox.getLowerLeftY() + "\t" + mediaBox.getUpperRightX() + "\t" + mediaBox.getUpperRightY() + "\t" + mediaBox.getLowerLeftX() + "\t");
+                res.add("Page " + numero + " TrimBox:\t" + trimBox.getLowerLeftY() + "\t" + trimBox.getUpperRightX() + "\t" + trimBox.getUpperRightY() + "\t" + trimBox.getLowerLeftX() + "\t");
              } else {
  
-                res.add("Page " + numero + " size:\t\t"
-                        + Math.abs(cropBox.getWidth()) + " pts x "
-                        + Math.abs(cropBox.getHeight()) + " pts");
+                res.add("Page " + numero + " size:\t\t" + Math.abs(cropBox.getWidth()) + " pts x " + Math.abs(cropBox.getHeight()) + " pts");
                  // Boxes
-                res.add("Page " + numero + " CropBox:\t"
-                        + cropBox.getLowerLeftX() + "\t"
-                        + cropBox.getUpperRightY() + "\t"
-                        + cropBox.getUpperRightX() + "\t"
-                        + cropBox.getLowerLeftY() + "\t");
-                res.add("Page " + numero + " MediaBox:\t"
-                        + mediaBox.getLowerLeftX() + "\t"
-                        + mediaBox.getUpperRightY() + "\t"
-                        + mediaBox.getUpperRightX() + "\t"
-                        + mediaBox.getLowerLeftY() + "\t");
-                res.add("Page " + numero + " TrimBox:\t"
-                        + trimBox.getLowerLeftX() + "\t"
-                        + trimBox.getUpperRightY() + "\t"
-                        + trimBox.getUpperRightX() + "\t"
-                        + trimBox.getLowerLeftY() + "\t");
+                res.add("Page " + numero + " CropBox:\t" + cropBox.getLowerLeftX() + "\t" + cropBox.getUpperRightY() + "\t" + cropBox.getUpperRightX() + "\t" + cropBox.getLowerLeftY() + "\t");
+                res.add("Page " + numero + " MediaBox:\t" + mediaBox.getLowerLeftX() + "\t" + mediaBox.getUpperRightY() + "\t" + mediaBox.getUpperRightX() + "\t" + mediaBox.getLowerLeftY() + "\t");
+                res.add("Page " + numero + " TrimBox:\t" + trimBox.getLowerLeftX() + "\t" + trimBox.getUpperRightY() + "\t" + trimBox.getUpperRightX() + "\t" + trimBox.getLowerLeftY() + "\t");
              }
          }
  
@@ -364,8 +331,7 @@ public class Main {
          return changes;
      }
  
-    private static void addLabels(ArrayList<String> res, PDPageLabels labels,
-                                  int pages) {
+    private static void addLabels(ArrayList<String> res, PDPageLabels labels, int pages) {
  
          // Get the raw list
          String[] labelList = labels.getLabelsByPageIndices();
@@ -393,34 +359,28 @@ public class Main {
                  rangeRef = range;
              }
          }
-        res.add("NumberSectionsDelimiters:\t\t"
-                + delimiters.substring(0, delimiters.length() - 1));
+        res.add("NumberSectionsDelimiters:\t\t" + delimiters.substring(0, delimiters.length() - 1));
      }
  
-    private static void addBookmark(PDDocument doc, ArrayList<String> res,
-                                    PDOutlineNode bookmark, int level) throws IOException {
+    private static void addBookmark(PDDocument doc, ArrayList<String> res, PDOutlineNode bookmark, int level) throws IOException {
          PDOutlineItem current = bookmark.getFirstChild();
          while (current != null) {
              try {
  
                  res.add("BookmarkTitle:\t\t" + StringEscapeUtils.escapeHtml4(current.getTitle()));
                  res.add("BookmarkLevel:\t\t" + level);
-                res.add("BookmarkPage:\t\t"
-                        + getPageFromAction(doc, current.getAction()));
+                res.add("BookmarkPage:\t\t" + getPageFromAction(doc, current.getAction()));
                  addBookmark(doc, res, current, level + 1);
-            }catch (Exception e){
+            } catch (Exception e) {
  
              }
              current = current.getNextSibling();
          }
      }
  
-    private static void cutDocument(PDDocument doc, String input,
-                                    String output, String cutmode) throws COSVisitorException,
-            IOException {
+    private static void cutDocument(PDDocument doc, String input, String output, String cutmode) throws COSVisitorException, IOException {
  
-        System.out.println("Cut document of " + doc.getNumberOfPages()
-                + " with mode " + cutmode);
+        System.out.println("Cut document of " + doc.getNumberOfPages() + " with mode " + cutmode);
  
          ArrayList<PDDocument> copies = duplicatePages(doc, input, cutmode);
          cutPages(doc, cutmode);
@@ -470,17 +430,14 @@ public class Main {
              float decalage = ((float) decalageCrans) * w;
              newbox.move(decalage, 0f);
  
-            System.out.println("Set cropbox of page " + page + " from "
-                    + pdfPage.getCropBox() + " to " + newbox + " (offset : "
-                    + decalage + ")");
+            System.out.println("Set cropbox of page " + page + " from " + pdfPage.getCropBox() + " to " + newbox + " (offset : " + decalage + ")");
  
              pdfPage.setCropBox(newbox);
              pdfPage.setMediaBox(newbox);
          }
      }
  
-    private static ArrayList<PDDocument> duplicatePages(PDDocument doc,
-                                                        String input, String cutmode) throws IOException {
+    private static ArrayList<PDDocument> duplicatePages(PDDocument doc, String input, String cutmode) throws IOException {
  
          List<PDPage> pageList = doc.getDocumentCatalog().getAllPages();
          int originalSize = pageList.size();
@@ -525,8 +482,7 @@ public class Main {
                  System.out.println("Skip page " + page);
                  continue;
              }
-            System.out.println("Duplicate page " + page + " :: cursor is at "
-                    + cursor);
+            System.out.println("Duplicate page " + page + " :: cursor is at " + cursor);
              // Duplicate page
              for (int j = 0; j < duplicateTime; j++) {
                  List<PDPage> l = copies.get(j).getDocumentCatalog().getAllPages();
@@ -546,9 +502,7 @@ public class Main {
  
      }
  
-    private void extractTexts(PDDocument doc, String textsOutput, String method,
-                              Integer[] pages, String ignoredSeparators, String input) throws IOException, ClassNotFoundException,
-            NullPointerException, InterruptedException {
+    private void extractTexts(PDDocument doc, String textsOutput, String method, boolean robust, Integer[] pages, String ignoredSeparators, String input) throws IOException, ClassNotFoundException, NullPointerException, InterruptedException {
  
          long s = Calendar.getInstance().getTimeInMillis();
  
@@ -557,8 +511,7 @@ public class Main {
          PDDocument[] tdoc = new PDDocument[threads];
          PDDocument d;
  
-        int totalThreads = Math.max(1,
-                Math.min(Math.round(pages.length / 50.0f), Main.threads));
+        int totalThreads = Math.max(1, Math.min(Math.round(pages.length / 50.0f), Main.threads));
  
          System.out.println("Total threads " + totalThreads);
  
@@ -569,7 +522,7 @@ public class Main {
                  d = doc;
              }
  
-            TextsThread t = new TextsThread(d, 0.5f, textsOutput, method, pages, i, totalThreads, ignoredSeparators, input);
+            TextsThread t = new TextsThread(d, 0.5f, textsOutput, method, robust, pages, i, totalThreads, ignoredSeparators, input);
              t.setPriority(Thread.MIN_PRIORITY);
              t.start();
              tlist[i] = t;
@@ -583,14 +536,10 @@ public class Main {
              }
          }
  
-        System.out.println("Extraction des textes with " + method + " : "
-                + ((Calendar.getInstance().getTimeInMillis() - s) / 1000)
-                + "s");
+        System.out.println("Extraction des textes with " + method + " : " + ((Calendar.getInstance().getTimeInMillis() - s) / 1000) + "s");
      }
  
-    public static void updateCropBox(PDDocument doc, String output,
-                                     String refbox, Integer[] pages, String defined) throws IOException,
-            COSVisitorException {
+    public static void updateCropBox(PDDocument doc, String output, String refbox, Integer[] pages, String defined) throws IOException, COSVisitorException {
          System.out.println("updateCropBox");
          if (!"".equals(defined)) {
              updateCropBoxDefined(doc, defined);
@@ -602,8 +551,7 @@ public class Main {
          return;
      }
  
-    private static void updateCropBoxDefined(PDDocument doc, String defined)
-            throws IOException, COSVisitorException {
+    private static void updateCropBoxDefined(PDDocument doc, String defined) throws IOException, COSVisitorException {
          String[] e = defined.split("*");
          for (int i = 0; i < e.length; i++) {
              String[] e1 = e[i].split(",");
@@ -650,8 +598,7 @@ public class Main {
          return box;
      }
  
-    public static void extractLinks(PDDocument doc, String linksOutput,
-                                    Integer[] pages) throws IOException {
+    public static void extractLinks(PDDocument doc, String linksOutput, Integer[] pages) throws IOException {
  
          List<PDPage> pageList = doc.getDocumentCatalog().getAllPages();
  
@@ -672,16 +619,14 @@ public class Main {
          }
      }
  
-    private static ArrayList<Link> extractLinksOfPage(PDDocument doc, int pageNumber, PDPage p,
-                                                      String file) throws IOException {
+    private static ArrayList<Link> extractLinksOfPage(PDDocument doc, int pageNumber, PDPage p, String file) throws IOException {
          ArrayList<Link> listLinks = extractLinksOfPage(doc, pageNumber, p);
          saveLinks(file.replaceFirst("%d", "" + pageNumber), listLinks);
  
          return listLinks;
      }
  
-    public static void saveLinks(String file, ArrayList<Link> listLinks)
-            throws IOException {
+    public static void saveLinks(String file, ArrayList<Link> listLinks) throws IOException {
          FileIO out = new FileIO(file);
          out.open("w");
          out.output.writeBytes(Link.header());
@@ -692,7 +637,7 @@ public class Main {
      }
  
      public static String getPageFromAction(PDDocument doc, PDAction a) throws IOException {
-        if(a instanceof PDActionRemoteGoTo){
+        if (a instanceof PDActionRemoteGoTo) {
              return "-1";
          }
          PDActionGoTo aa = (PDActionGoTo) a;
@@ -754,8 +699,7 @@ public class Main {
          return "-1";
      }
  
-    public static ArrayList<Link> extractLinksOfPage(PDDocument doc, int pageNumber, PDPage p)
-            throws IOException {
+    public static ArrayList<Link> extractLinksOfPage(PDDocument doc, int pageNumber, PDPage p) throws IOException {
          System.out.println(pageNumber);
          ArrayList<Link> listLinks = new ArrayList<>();
          Link myLink;
@@ -813,8 +757,7 @@ public class Main {
  
                  System.out.println(link.getRectangle().getHeight());
                  myLink.rect = link.getRectangle();
-                if (myLink.rect.getWidth() == 0.0
-                        || myLink.rect.getHeight() == 0.0) {
+                if (myLink.rect.getWidth() == 0.0 || myLink.rect.getHeight() == 0.0) {
                      System.out.println("Skip link :: surface == 0");
                      continue;
                  }
diff --git a/resources/tools/fwstk/src/com/fluidbook/fwstk/TextsThread.java b/resources/tools/fwstk/src/com/fluidbook/fwstk/TextsThread.java

index 6543836c645acf2131fc0159a8c9bd95853c0f1e..3eefe219b75766663483757b434d71213dbea51f 100644 (file)
--- a/resources/tools/fwstk/src/com/fluidbook/fwstk/TextsThread.java
+++ b/resources/tools/fwstk/src/com/fluidbook/fwstk/TextsThread.java
@@ -15,251 +15,255 @@ import org.apache.pdfbox.pdmodel.PDPage;
  
  public class TextsThread extends Thread {
  
-       protected PDDocument doc;
-       protected int threadIndex;
-       protected String textsOutput;
-       protected Integer[] pages;
-       protected int totalThreads;
-       protected float tolerance;
-       protected String ignoredSeparators = "";
-       protected String docURL;
-       protected String[] trimchars;
-       protected String method;
-
-       TextsThread(PDDocument doc, float tolerance, String textsOutput, String method,
-                   Integer[] pages, int index, int totalThreads, String ignoredSeparators, String docURL) {
-               this.doc = doc;
-               this.threadIndex = index;
-               this.textsOutput = textsOutput;
-               this.pages = pages;
-               this.totalThreads = totalThreads;
-               this.tolerance = tolerance;
-               this.ignoredSeparators = ignoredSeparators;
-               this.docURL = docURL;
-               this.method = method;
-
-               trimchars = new String[1];
-               trimchars[0] = " ";
-
-       }
-
-       @Override
-       public void run() {
-               String index;
-               String pindex;
-
-               String memoHTML = "";
-               String html = "";
-               String phtml = "";
-
-               String text = "";
-               String ptext = "";
-
-               Boolean sortByPosition = false;
-               Boolean separateByBeads = false;
-               Boolean suppressDuplicate = true;
-
-               for (Integer i : pages) {
-                       if (i % totalThreads != threadIndex) {
-                               continue;
-                       }
-
-                       System.out.println("Parsing page " + i+" with "+method);
-                       String file = textsOutput.replaceFirst("%d", "" + i);
-
-                       // Poppler
-                       BufferedWriter out;
-
-                       if ("poppler".equals(method)) {
-                               try {
-                                       Process proc;
-                                       proc = Runtime.getRuntime().exec("pdftotext -f " + i + " -l " + i + " -enc UTF-8 -eol unix -nopgbrk " + this.docURL + " - ");
-
-                                       InputStream output = proc.getInputStream();
-                                       StringWriter writer = new StringWriter();
-                                       IOUtils.copy(output, writer, "UTF-8");
-                                       phtml = writer.toString();
-                                       phtml = cleanhtml(phtml);
-
-                                       ptext = html2text(phtml);
-                                       pindex = text2index(ptext);
-                                       ptext = StringUtil.removeAccents(ptext);
-
-                                       String ppfile = file.replaceFirst("%s", "pp");
-                                       String pifile = file.replaceFirst("%s", "pi");
-                                       String phfile = file.replaceFirst("%s", "ph");
-
-                                       try {
-                                               out = new BufferedWriter(new OutputStreamWriter(
-                                                               new FileOutputStream(ppfile), "UTF8"));
-                                               out.write(ptext);
-                                               out.close();
-
-                                               out = new BufferedWriter(new OutputStreamWriter(
-                                                               new FileOutputStream(pifile), "UTF8"));
-                                               out.write(pindex);
-                                               out.close();
-
-                                               out = new BufferedWriter(new OutputStreamWriter(
-                                                               new FileOutputStream(phfile), "UTF8"));
-                                               out.write(phtml);
-                                               out.close();
-                                       } catch (UnsupportedEncodingException e) {
-                                               e.printStackTrace();
-                                       } catch (FileNotFoundException e) {
-                                               e.printStackTrace();
-                                       } catch (IOException e) {
-                                               e.printStackTrace();
-                                       }
-
-                                       phtml = "";
-                                       pindex = "";
-                                       ptext = "";
-                                       html = "";
-                                       text = "";
-                                       index = "";
-
-                               } catch (IOException ex) {
-                                       ex.printStackTrace();
-                               }
-                       } else if ("pdfbox".equals(method)) {
-
-                               /// PDFBOX
-
-                               try {
-                                       CustomStripper htmlStripper;
-                                       htmlStripper = new CustomStripper("UTF-8");
-
-                                       htmlStripper.setSortByPosition(sortByPosition);
-                                       htmlStripper.setShouldSeparateByBeads(separateByBeads);
-                                       htmlStripper.setAverageCharTolerance(tolerance);
-                                       htmlStripper.setSpacingTolerance(tolerance);
-                                       htmlStripper.setStartPage(i);
-                                       htmlStripper.setEndPage(i);
-                                       htmlStripper.setSuppressDuplicateOverlappingText(suppressDuplicate);
-                                       html = htmlStripper.getHTML(doc);
-                                       html = cleanhtml(html);
-
-                                       text = htmlStripper.getText(doc);
-                                       text = html2text(text);
-                                       index = text2index(text);
-
-                                       text = StringUtil.removeAccents(text);
-                                       String pfile = file.replaceFirst("%s", "p");
-                                       String ifile = file.replaceFirst("%s", "i");
-                                       String hfile = file.replaceFirst("%s", "h");
-
-                                       try {
-                                               out = new BufferedWriter(new OutputStreamWriter(
-                                                               new FileOutputStream(pfile), "UTF8"));
-                                               out.write(text);
-                                               out.close();
-
-                                               out = new BufferedWriter(new OutputStreamWriter(
-                                                               new FileOutputStream(ifile), "UTF8"));
-                                               out.write(index);
-                                               out.close();
-
-                                               out = new BufferedWriter(new OutputStreamWriter(
-                                                               new FileOutputStream(hfile), "UTF8"));
-                                               out.write(html);
-                                               out.close();
-                                       } catch (UnsupportedEncodingException e) {
-
-                                               e.printStackTrace();
-                                       } catch (FileNotFoundException e) {
-                                               e.printStackTrace();
-                                       } catch (IOException e) {
-                                               e.printStackTrace();
-                                       }
-                               } catch (IOException ex) {
-                                       ex.printStackTrace();
-                               }
-
-                               memoHTML = html;
-                       } else if ("fluidbook".equals(method)) {
-                               // Fluidbook
-                               try {
-                                       LayoutStripper layoutStripper;
-                                       layoutStripper = new LayoutStripper();
-                                       layoutStripper.setIgnoredSeparators(ignoredSeparators);
-                                       layoutStripper.process((PDPage) doc.getDocumentCatalog().getAllPages().get(i - 1), i);
-
-                                       String fbhtml = layoutStripper.getLayout().asHTML();
-                                       String fbtext = layoutStripper.getLayout().asText();
-
-                                       String fpfile = file.replaceFirst("%s", "fp");
-                                       String fifile = file.replaceFirst("%s", "fi");
-                                       String fhfile = file.replaceFirst("%s", "fh");
-
-                                       fbtext = html2text(fbtext);
-
-                                       String fbindex = text2index(fbtext);
-
-                                       try {
-                                               out = new BufferedWriter(new OutputStreamWriter(
-                                                               new FileOutputStream(fpfile), "UTF8"));
-                                               out.write(fbtext);
-                                               out.close();
-                                               out = new BufferedWriter(new OutputStreamWriter(
-                                                               new FileOutputStream(fifile), "UTF8"));
-                                               out.write(fbindex);
-                                               out.close();
-                                               out = new BufferedWriter(new OutputStreamWriter(
-                                                               new FileOutputStream(fhfile), "UTF8"));
-                                               out.write(fbhtml);
-                                               out.close();
-                                       } catch (UnsupportedEncodingException e) {
-                                               e.printStackTrace();
-                                       } catch (FileNotFoundException e) {
-                                               e.printStackTrace();
-                                       }
-                               } catch (Exception ex) {
-                                       ex.printStackTrace();
-                               }
-                       }
-               }
-       }
-
-       protected String html2text(String text) {
-               text = StringUtil.separateLigatures(text);
-               text = text.toLowerCase();
-               text = StringUtil.removeControl(text);
-               text = StringUtil.removePoints(text, ignoredSeparators);
-               text = StringUtil.condenseWhite(text);
-               text = StringUtil.trim(text, trimchars);
-               return text;
-       }
-
-       protected String cleanhtml(String html) {
-               return StringUtil.normalizeWhite(html);
-       }
-
-       protected String text2index(String text) {
-               TreeMap<String, Word> index = new TreeMap<>();
-
-               String[] words = text.split(" ");
-               for (String word : words) {
-                       String woa = StringUtil.removeAccents(word);
-                       if (index.containsKey(woa)) {
-                               index.get(woa).addWord(word);
-                       } else {
-                               Word w = new Word();
-                               w.addWord(word);
-                               index.put(woa, w);
-                       }
-               }
-               String indexString = "";
-
-               for (Entry<String, Word> e : index.entrySet()) {
-                       indexString += e.getKey() + "," + e.getValue().toString()
-                                       + "\n";
-               }
-               if (indexString.length() > 0) {
-                       indexString = indexString.substring(0,
-                                       indexString.length() - 1);
-               }
-
-               return indexString;
-       }
+    protected PDDocument doc;
+    protected int threadIndex;
+    protected String textsOutput;
+    protected Integer[] pages;
+    protected int totalThreads;
+    protected float tolerance;
+    protected String ignoredSeparators = "";
+    protected String docURL;
+    protected String[] trimchars;
+    protected String method;
+
+    protected boolean robust;
+
+    TextsThread(PDDocument doc, float tolerance, String textsOutput, String method, boolean robust,
+                Integer[] pages, int index, int totalThreads, String ignoredSeparators, String docURL) {
+        this.doc = doc;
+        this.threadIndex = index;
+        this.textsOutput = textsOutput;
+        this.pages = pages;
+        this.totalThreads = totalThreads;
+        this.tolerance = tolerance;
+        this.ignoredSeparators = ignoredSeparators;
+        this.docURL = docURL;
+        this.method = method;
+        this.robust = robust;
+
+        trimchars = new String[1];
+        trimchars[0] = " ";
+
+    }
+
+    @Override
+    public void run() {
+        String index;
+        String pindex;
+
+        String memoHTML = "";
+        String html = "";
+        String phtml = "";
+
+        String text = "";
+        String ptext = "";
+
+        Boolean sortByPosition = false;
+        Boolean separateByBeads = false;
+        Boolean suppressDuplicate = true;
+
+        for (Integer i : pages) {
+            if (i % totalThreads != threadIndex) {
+                continue;
+            }
+
+            System.out.println("Parsing page " + i + " with " + method);
+            String file = textsOutput.replaceFirst("%d", "" + i);
+
+            // Poppler
+            BufferedWriter out;
+
+            if ("poppler".equals(method)) {
+                try {
+                    Process proc;
+                    proc = Runtime.getRuntime().exec("pdftotext -f " + i + " -l " + i + " -enc UTF-8 -eol unix -nopgbrk " + this.docURL + " - ");
+
+                    InputStream output = proc.getInputStream();
+                    StringWriter writer = new StringWriter();
+                    IOUtils.copy(output, writer, "UTF-8");
+                    phtml = writer.toString();
+                    phtml = cleanhtml(phtml);
+
+                    ptext = html2text(phtml);
+                    pindex = text2index(ptext);
+                    ptext = StringUtil.removeAccents(ptext);
+
+                    String ppfile = file.replaceFirst("%s", "pp");
+                    String pifile = file.replaceFirst("%s", "pi");
+                    String phfile = file.replaceFirst("%s", "ph");
+
+                    try {
+                        out = new BufferedWriter(new OutputStreamWriter(
+                                new FileOutputStream(ppfile), "UTF8"));
+                        out.write(ptext);
+                        out.close();
+
+                        out = new BufferedWriter(new OutputStreamWriter(
+                                new FileOutputStream(pifile), "UTF8"));
+                        out.write(pindex);
+                        out.close();
+
+                        out = new BufferedWriter(new OutputStreamWriter(
+                                new FileOutputStream(phfile), "UTF8"));
+                        out.write(phtml);
+                        out.close();
+                    } catch (UnsupportedEncodingException e) {
+                        e.printStackTrace();
+                    } catch (FileNotFoundException e) {
+                        e.printStackTrace();
+                    } catch (IOException e) {
+                        e.printStackTrace();
+                    }
+
+                    phtml = "";
+                    pindex = "";
+                    ptext = "";
+                    html = "";
+                    text = "";
+                    index = "";
+
+                } catch (IOException ex) {
+                    ex.printStackTrace();
+                }
+            } else if ("pdfbox".equals(method)) {
+
+                /// PDFBOX
+
+                try {
+                    CustomStripper htmlStripper;
+                    htmlStripper = new CustomStripper("UTF-8");
+
+                    htmlStripper.setSortByPosition(sortByPosition);
+                    htmlStripper.setShouldSeparateByBeads(separateByBeads);
+                    htmlStripper.setAverageCharTolerance(tolerance);
+                    htmlStripper.setSpacingTolerance(tolerance);
+                    htmlStripper.setStartPage(i);
+                    htmlStripper.setEndPage(i);
+                    htmlStripper.setSuppressDuplicateOverlappingText(suppressDuplicate);
+                    html = htmlStripper.getHTML(doc);
+                    html = cleanhtml(html);
+
+                    text = htmlStripper.getText(doc);
+                    text = html2text(text);
+                    index = text2index(text);
+
+                    text = StringUtil.removeAccents(text);
+                    String pfile = file.replaceFirst("%s", "p");
+                    String ifile = file.replaceFirst("%s", "i");
+                    String hfile = file.replaceFirst("%s", "h");
+
+                    try {
+                        out = new BufferedWriter(new OutputStreamWriter(
+                                new FileOutputStream(pfile), "UTF8"));
+                        out.write(text);
+                        out.close();
+
+                        out = new BufferedWriter(new OutputStreamWriter(
+                                new FileOutputStream(ifile), "UTF8"));
+                        out.write(index);
+                        out.close();
+
+                        out = new BufferedWriter(new OutputStreamWriter(
+                                new FileOutputStream(hfile), "UTF8"));
+                        out.write(html);
+                        out.close();
+                    } catch (UnsupportedEncodingException e) {
+
+                        e.printStackTrace();
+                    } catch (FileNotFoundException e) {
+                        e.printStackTrace();
+                    } catch (IOException e) {
+                        e.printStackTrace();
+                    }
+                } catch (IOException ex) {
+                    ex.printStackTrace();
+                }
+
+                memoHTML = html;
+            } else if ("fluidbook".equals(method)) {
+                // Fluidbook
+                try {
+                    LayoutStripper layoutStripper;
+                    layoutStripper = new LayoutStripper();
+                    layoutStripper.setRobust(robust);
+                    layoutStripper.setIgnoredSeparators(ignoredSeparators);
+                    layoutStripper.process((PDPage) doc.getDocumentCatalog().getAllPages().get(i - 1), i);
+
+                    String fbhtml = layoutStripper.getLayout().asHTML();
+                    String fbtext = layoutStripper.getLayout().asText();
+
+                    String fpfile = file.replaceFirst("%s", "fp");
+                    String fifile = file.replaceFirst("%s", "fi");
+                    String fhfile = file.replaceFirst("%s", "fh");
+
+                    fbtext = html2text(fbtext);
+
+                    String fbindex = text2index(fbtext);
+
+                    try {
+                        out = new BufferedWriter(new OutputStreamWriter(
+                                new FileOutputStream(fpfile), "UTF8"));
+                        out.write(fbtext);
+                        out.close();
+                        out = new BufferedWriter(new OutputStreamWriter(
+                                new FileOutputStream(fifile), "UTF8"));
+                        out.write(fbindex);
+                        out.close();
+                        out = new BufferedWriter(new OutputStreamWriter(
+                                new FileOutputStream(fhfile), "UTF8"));
+                        out.write(fbhtml);
+                        out.close();
+                    } catch (UnsupportedEncodingException e) {
+                        e.printStackTrace();
+                    } catch (FileNotFoundException e) {
+                        e.printStackTrace();
+                    }
+                } catch (Exception ex) {
+                    ex.printStackTrace();
+                }
+            }
+        }
+    }
+
+    protected String html2text(String text) {
+        text = StringUtil.separateLigatures(text);
+        text = text.toLowerCase();
+        text = StringUtil.removeControl(text);
+        text = StringUtil.removePoints(text, ignoredSeparators);
+        text = StringUtil.condenseWhite(text);
+        text = StringUtil.trim(text, trimchars);
+        return text;
+    }
+
+    protected String cleanhtml(String html) {
+        return StringUtil.normalizeWhite(html);
+    }
+
+    protected String text2index(String text) {
+        TreeMap<String, Word> index = new TreeMap<>();
+
+        String[] words = text.split(" ");
+        for (String word : words) {
+            String woa = StringUtil.removeAccents(word);
+            if (index.containsKey(woa)) {
+                index.get(woa).addWord(word);
+            } else {
+                Word w = new Word();
+                w.addWord(word);
+                index.put(woa, w);
+            }
+        }
+        String indexString = "";
+
+        for (Entry<String, Word> e : index.entrySet()) {
+            indexString += e.getKey() + "," + e.getValue().toString()
+                    + "\n";
+        }
+        if (indexString.length() > 0) {
+            indexString = indexString.substring(0,
+                    indexString.length() - 1);
+        }
+
+        return indexString;
+    }
  }
diff --git a/resources/tools/fwstk/src/com/fluidbook/fwstk/layout/LayoutStripper.java b/resources/tools/fwstk/src/com/fluidbook/fwstk/layout/LayoutStripper.java

index 73a1d152e775bf48b05cd59545d298283113b7a5..d9b5ad69c7857ff69f1407a38f12a5521fe82a5d 100644 (file)
--- a/resources/tools/fwstk/src/com/fluidbook/fwstk/layout/LayoutStripper.java
+++ b/resources/tools/fwstk/src/com/fluidbook/fwstk/layout/LayoutStripper.java
@@ -18,6 +18,7 @@ public class LayoutStripper extends PDFStreamEngine {
  
      protected PDPage currentPage;
      protected String ignoredSeparators;
+    protected boolean robust = false;
      public Page layout;
  
      public LayoutStripper() throws IOException {
@@ -73,12 +74,16 @@ public class LayoutStripper extends PDFStreamEngine {
  
      protected String normalizeSpace(String c) {
          int code = c.codePointAt(0);
-        if (code == 8201 || code <= 8 ) {
+        if (code == 8201 || code <= 8) {
              return " ";
          }
          return c;
      }
  
+    public void setRobust(boolean robust) {
+        this.robust = robust;
+    }
+
      public Page getLayout() {
          return layout;
      }
author	Vincent Vanwaelscappel <vincent@cubedesigners.com>
	Thu, 7 Sep 2023 15:59:20 +0000 (17:59 +0200)
committer	Vincent Vanwaelscappel <vincent@cubedesigners.com>
	Thu, 7 Sep 2023 15:59:20 +0000 (17:59 +0200)
resources/tools/fwstk/.idea/workspace.xml		patch \| blob \| history
resources/tools/fwstk/bin/com/fluidbook/fwstk/Main.class		patch \| blob \| history
resources/tools/fwstk/bin/com/fluidbook/fwstk/TextsThread.class		patch \| blob \| history
resources/tools/fwstk/bin/com/fluidbook/fwstk/layout/LayoutStripper.class		patch \| blob \| history
resources/tools/fwstk/out/artifacts/fwstk_jar/fwstk.jar		patch \| blob \| history
resources/tools/fwstk/src/com/fluidbook/fwstk/Main.java		patch \| blob \| history
resources/tools/fwstk/src/com/fluidbook/fwstk/TextsThread.java		patch \| blob \| history
resources/tools/fwstk/src/com/fluidbook/fwstk/layout/LayoutStripper.java		patch \| blob \| history