Git - cubist_pdf.git/commitdiff
wip #6188 @1
author Vincent Vanwaelscappel <vincent@cubedesigners.com>
Thu, 14 Sep 2023 07:08:25 +0000 (09:08 +0200)
committer Vincent Vanwaelscappel <vincent@cubedesigners.com>
Thu, 14 Sep 2023 07:08:25 +0000 (09:08 +0200)
15 files changed:
resources/tools/fwstk/.idea/workspace.xml
resources/tools/fwstk/bin/com/fluidbook/fwstk/Main.class
resources/tools/fwstk/bin/com/fluidbook/fwstk/TextsThread.class
resources/tools/fwstk/bin/com/fluidbook/fwstk/layout/Group.class
resources/tools/fwstk/bin/com/fluidbook/fwstk/layout/LayoutStripper.class
resources/tools/fwstk/bin/com/fluidbook/fwstk/layout/Line.class
resources/tools/fwstk/bin/com/fluidbook/fwstk/layout/Page.class
resources/tools/fwstk/out/artifacts/fwstk_jar/fwstk.jar
resources/tools/fwstk/src/com/fluidbook/fwstk/Main.java
resources/tools/fwstk/src/com/fluidbook/fwstk/TextsThread.java
resources/tools/fwstk/src/com/fluidbook/fwstk/layout/Group.java
resources/tools/fwstk/src/com/fluidbook/fwstk/layout/LayoutStripper.java
resources/tools/fwstk/src/com/fluidbook/fwstk/layout/Line.java
resources/tools/fwstk/src/com/fluidbook/fwstk/layout/Page.java
src/PDFTools.php

index a5c00fffe4bffeed9127b976aa2afed69fffd9db..1f9e7c3cd571a55e11a1e157ea62d3ec12325dc6 100644 (file)
@@ -9,7 +9,9 @@
     <option name="autoReloadType" value="SELECTIVE" />
   </component>
   <component name="ChangeListManager">
-    <list default="true" id="f146bc67-2578-4de3-9db2-94d2d43e9e83" name="Default" comment="wip #5410" />
+    <list default="true" id="f146bc67-2578-4de3-9db2-94d2d43e9e83" name="Default" comment="wip #5410">
+      <change beforePath="$PROJECT_DIR$/../../../src/PDFTools.php" beforeDir="false" afterPath="$PROJECT_DIR$/../../../src/PDFTools.php" afterDir="false" />
+    </list>
     <option name="SHOW_DIALOG" value="false" />
     <option name="HIGHLIGHT_CONFLICTS" value="true" />
     <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
       <recent name="H:\Works\cubeExtranet\fluidbook\tools\fwstk" />
     </key>
   </component>
-  <component name="RunManager" selected="Application.extract texts">
+  <component name="RunManager" selected="Application.extract layout">
     <configuration default="true" type="Applet">
       <option name="POLICY_FILE" value="$APPLICATION_HOME_DIR$/bin/appletviewer.policy" />
       <method v="2">
     <configuration name="extract layout" type="Application" factoryName="Application">
       <option name="MAIN_CLASS_NAME" value="com.fluidbook.fwstk.Main" />
       <module name="fwstk" />
-      <option name="PROGRAM_PARAMETERS" value="--input C:\Users\Vincent\Desktop\original.pdf --layout C:\Users\Vincent\Desktop\loutres\p%d.fby --threads 1" />
-      <option name="WORKING_DIRECTORY" value="file://$PROJECT_DIR$" />
+      <option name="PROGRAM_PARAMETERS" value="--input C:\Users\vince\Desktop\20929.pdf --mode robust --layout C:\Users\vince\Desktop\20929\p%d.fby --threads 1" />
       <method v="2">
         <option name="Make" enabled="true" />
       </method>
       <workItem from="1694157597994" duration="9552000" />
       <workItem from="1694187452028" duration="14000" />
       <workItem from="1694189811041" duration="768000" />
-      <workItem from="1694545035743" duration="798000" />
+      <workItem from="1694545035743" duration="1398000" />
+      <workItem from="1694674242867" duration="907000" />
     </task>
     <task id="LOCAL-00001" summary="wip #1111 @0.5">
       <created>1487172253077</created>
index 92e526bd3d0b9526a70a73c707cc00eb84734118..79aeae6eb82f8eb42a3d5a13783712e8988fd94f 100644 (file)
Binary files a/resources/tools/fwstk/bin/com/fluidbook/fwstk/Main.class and b/resources/tools/fwstk/bin/com/fluidbook/fwstk/Main.class differ
index c17afc75efcadd0b0a2f29d42642836fc412ded5..d1cf2001effc04c54afe25964416318c742512c9 100644 (file)
Binary files a/resources/tools/fwstk/bin/com/fluidbook/fwstk/TextsThread.class and b/resources/tools/fwstk/bin/com/fluidbook/fwstk/TextsThread.class differ
index d93bc2122497fc2c2ffbb219d923017159082122..cf354198130c8d152f8a8aaac6d1950669fa77ec 100644 (file)
Binary files a/resources/tools/fwstk/bin/com/fluidbook/fwstk/layout/Group.class and b/resources/tools/fwstk/bin/com/fluidbook/fwstk/layout/Group.class differ
index 880a680e9eb09117bb649c60d31950541b20bdcc..12ba9c479678dc0f8720605ceabbfdf42ae309d4 100644 (file)
Binary files a/resources/tools/fwstk/bin/com/fluidbook/fwstk/layout/LayoutStripper.class and b/resources/tools/fwstk/bin/com/fluidbook/fwstk/layout/LayoutStripper.class differ
index 4415f5cc934a9870778b866bf8c92982f3f38da4..be9157e6b0c877acc45d20a88ced547603814b2a 100644 (file)
Binary files a/resources/tools/fwstk/bin/com/fluidbook/fwstk/layout/Line.class and b/resources/tools/fwstk/bin/com/fluidbook/fwstk/layout/Line.class differ
index 054c1c377381e2d9a4c0ebc48ba623919d9a4e77..9a8c8187ade3c3856f315c4cccdd020dfa1d0c64 100644 (file)
Binary files a/resources/tools/fwstk/bin/com/fluidbook/fwstk/layout/Page.class and b/resources/tools/fwstk/bin/com/fluidbook/fwstk/layout/Page.class differ
index 4fe3b9fa3ba61bfba9e9cc6d80693a0e6169064f..72b59930214c8420eeaca9c6f8930bc87b4e1c20 100644 (file)
Binary files a/resources/tools/fwstk/out/artifacts/fwstk_jar/fwstk.jar and b/resources/tools/fwstk/out/artifacts/fwstk_jar/fwstk.jar differ
index e5fe176961f65af7754749f4c8869622df14d169..9fb35e13e1d9c6281324b6e0bdc32be687462e8a 100644 (file)
@@ -238,7 +238,7 @@ public class Main {
 
         while (pagesIter.hasNext()) {
             stripper = new LayoutStripper();
-            stripper.setRobust(robust);
+            stripper.setSplitAllChars(robust);
             stripper.setIgnoredSeparators(ignoredSeparators);
             PDPage nextPage = (PDPage) pagesIter.next();
             i++;
index 0c3de1c614f88581efe75dac5da62c9768bf9af7..77b4323afb76a47737eef08b01b68d0fb8853350 100644 (file)
@@ -188,7 +188,6 @@ public class TextsThread extends Thread {
                 try {
                     LayoutStripper layoutStripper;
                     layoutStripper = new LayoutStripper();
-                    layoutStripper.setRobust(this.robust);
                     layoutStripper.setIgnoredSeparators(ignoredSeparators);
                     layoutStripper.process((PDPage) doc.getDocumentCatalog().getAllPages().get(i - 1), i);
 
@@ -200,7 +199,6 @@ public class TextsThread extends Thread {
                     String fhfile = file.replaceFirst("%s", "fh");
 
                     fbtext = html2text(fbtext);
-                    System.out.println("this.robust :: "+this.robust);
                     if (this.robust) {
                         fbtext=StringUtil.removeSpaces(fbtext);
                     }
index e35311d06141c16530b69e803cbbcda354d3619b..6e4c96ec20ef94ed73f6d21faaf1a64fa5e3d7bf 100644 (file)
@@ -15,138 +15,139 @@ import java.util.LinkedList;
  */
 public class Group extends LayoutElement {
 
-       LinkedList<Word> words;
-       float size;
-       float rotation;
-       protected float spaceWidth;
-       protected Word currentWord;
-       protected LinkedList<Word> condensedWords;
-       protected LinkedList<Letter> letters;
-       protected Boolean wordsMade;
-       String ignoredSeparators;
-
-       public Group(float size, float spaceWidth, String ignoredSeparators, float rotation) {
-               this.size = size;
-               this.rotation = rotation;
-
-               this.wordsMade = false;
-
-               this.ignoredSeparators = ignoredSeparators;
-
-               this.spaceWidth = spaceWidth;
-               this.words = new LinkedList<>();
-               this.letters = new LinkedList<>();
-       }
-
-       public String asJSON(PDRectangle cropbox, float y, float scaleX, float scaleY, float rotation) {
-               makeWords();
-
-               ArrayList<String> res = new ArrayList<String>();
-               for (Word word : words) {
-                       if (word.isSeparator()) {
-                               continue;
-                       }
-                       String json = word.asJSON(cropbox, y, scaleX, scaleY, rotation);
-                       if ("".equals(json)) {
-                               continue;
-                       }
-                       res.add(json);
-               }
-               return String.join(",", res);
-       }
-
-       public String asText(PDRectangle cropbox) {
-               makeWords();
-               ArrayList<String> res = new ArrayList<String>();
-               for (Word word : words) {
-                       String text = word.asText(cropbox);
-                       if ("".equals(text)) {
-                               continue;
-                       }
-                       res.add(text);
-               }
-               return String.join("", res);
-       }
-
-       public boolean equals(float size) {
-               return size == this.size;
-       }
-
-       public void addText(float x, float y, float width, float height, String text) {
-               Letter added;
-
-               if (" ".equals(text)) {
-                       added = new Space(x, y, width, height);
-               } else if (text.matches(StringUtil.separatorsRegexp(ignoredSeparators))) {
-                       added = new Separator(text, x, y, width, height);
-               } else {
-                       added = new Letter(text, x, y, width, height);
-               }
-               letters.add(added);
-       }
-
-       protected void makeWords() {
-               if (this.wordsMade) {
-                       return;
-               }
-               this.wordsMade = true;
-               Word currentWord = new Word();
-
-               for (int i = 0; i < letters.size(); i++) {
-                       Letter l = letters.get(i);
-                       Boolean good = currentWord.goodCandidate(l, spaceWidth, rotation);
-                       if (good) {
-                               // Si la lettre est un bon candidat
-                               currentWord.addLetter(l);
-                       } else {
-                               // Sinon, on clôt le mot,
-                               words.add(currentWord);
-                               // On en recrée un nouveau
-                               currentWord = new Word();
-                               // Et on ajoute la lettre
-                               currentWord.addLetter(l);
-                               if (l.isSeparator()) {
-                                       // Si le mot ajouté est un espace, on ferme
-                                       // le mot directement
-                                       words.add(currentWord);
-                                       currentWord = new Word();
-                               }
-                       }
-               }
-
-               words.add(currentWord);
-               addMissingSpaces();
-       }
-
-       protected void addMissingSpaces() {
-               LinkedList<Word> wordsWithSpaces = new LinkedList<>();
-
-               for (int i = 0; i < words.size(); i++) {
-                       Word w = words.get(i);
-                       Word future = null;
-                       if (i + 1 < words.size()) {
-                               future = words.get(i + 1);
-                       }
-
-                       wordsWithSpaces.add(w);
-
-                       if (!w.isSeparator() && future != null && !future.isSeparator()) {
-                               // Si le mot en cours n'est pas un espace,
-                               // quel mot précédent n'en était pas un 
-                               // et que le mot suivant existe
-
-                               // On ajoute un espace à la liste des mots
-                               spaceWidth = future.startX() - w.nextPosition();
-                               if (spaceWidth == 0.0f) {
-                                       continue;
-                               }
-
-                               Space sp = new Space(w.nextPosition(), 0.0f, spaceWidth, 1.0f);
-                               Word spw = new Word();
-                               spw.addLetter(sp);
-                               wordsWithSpaces.add(spw);
-                       }
-               }
-               words = wordsWithSpaces;
-       }
+    LinkedList<Word> words;
+    float size;
+    float rotation;
+    protected float spaceWidth;
+    protected Word currentWord;
+    protected LinkedList<Word> condensedWords;
+    protected LinkedList<Letter> letters;
+    protected Boolean wordsMade;
+    String ignoredSeparators;
+    protected boolean splitAllChars = false;
+
+    public Group(float size, float spaceWidth, boolean splitAllChars, String ignoredSeparators, float rotation) {
+        this.size = size;
+        this.rotation = rotation;
+
+        this.wordsMade = false;
+
+        this.splitAllChars = splitAllChars;
+        this.ignoredSeparators = ignoredSeparators;
+
+        this.spaceWidth = spaceWidth;
+        this.words = new LinkedList<>();
+        this.letters = new LinkedList<>();
+    }
+
+    public String asJSON(PDRectangle cropbox, float y, float scaleX, float scaleY, float rotation) {
+        makeWords();
+
+        ArrayList<String> res = new ArrayList<String>();
+        for (Word word : words) {
+            if (word.isSeparator()) {
+                continue;
+            }
+            String json = word.asJSON(cropbox, y, scaleX, scaleY, rotation);
+            if ("".equals(json)) {
+                continue;
+            }
+            res.add(json);
+        }
+        return String.join(",", res);
+    }
+
+    public String asText(PDRectangle cropbox) {
+        makeWords();
+        ArrayList<String> res = new ArrayList<String>();
+        for (Word word : words) {
+            String text = word.asText(cropbox);
+            if ("".equals(text)) {
+                continue;
+            }
+            res.add(text);
+        }
+        return String.join("", res);
+    }
+
+    public boolean equals(float size) {
+        return size == this.size;
+    }
+
+    public void addText(float x, float y, float width, float height, String text) {
+        Letter added;
+
+        if (" ".equals(text)) {
+            added = new Space(x, y, width, height);
+        } else if (text.matches(StringUtil.separatorsRegexp(ignoredSeparators))) {
+            added = new Separator(text, x, y, width, height);
+        } else {
+            added = new Letter(text, x, y, width, height);
+        }
+        letters.add(added);
+    }
+
+    protected void makeWords() {
+        if (this.wordsMade) {
+            return;
+        }
+        this.wordsMade = true;
+        Word currentWord = new Word();
+
+        for (int i = 0; i < letters.size(); i++) {
+            Letter l = letters.get(i);
+            if (!splitAllChars && currentWord.goodCandidate(l, spaceWidth, rotation)) {
+                // Si la lettre est un bon candidat
+                currentWord.addLetter(l);
+            } else {
+                // Sinon, on clôt le mot,
+                words.add(currentWord);
+                // On en recrée un nouveau
+                currentWord = new Word();
+                // Et on ajoute la lettre
+                currentWord.addLetter(l);
+                if (l.isSeparator()) {
+                    // Si le mot ajouté est un espace, on ferme
+                    // le mot directement
+                    words.add(currentWord);
+                    currentWord = new Word();
+                }
+            }
+        }
+
+        words.add(currentWord);
+        addMissingSpaces();
+    }
+
+    protected void addMissingSpaces() {
+        LinkedList<Word> wordsWithSpaces = new LinkedList<>();
+
+        for (int i = 0; i < words.size(); i++) {
+            Word w = words.get(i);
+            Word future = null;
+            if (i + 1 < words.size()) {
+                future = words.get(i + 1);
+            }
+
+            wordsWithSpaces.add(w);
+
+            if (!w.isSeparator() && future != null && !future.isSeparator()) {
+                // Si le mot en cours n'est pas un espace,
+                // quel mot précédent n'en était pas un
+                // et que le mot suivant existe
+
+                // On ajoute un espace à la liste des mots
+                spaceWidth = future.startX() - w.nextPosition();
+                if (spaceWidth == 0.0f) {
+                    continue;
+                }
+
+                Space sp = new Space(w.nextPosition(), 0.0f, spaceWidth, 1.0f);
+                Word spw = new Word();
+                spw.addLetter(sp);
+                wordsWithSpaces.add(spw);
+            }
+        }
+        words = wordsWithSpaces;
+    }
 }
index d9b5ad69c7857ff69f1407a38f12a5521fe82a5d..df7096459527140aeae25fc2b3b90ffe7e7fd5f7 100644 (file)
@@ -18,7 +18,7 @@ public class LayoutStripper extends PDFStreamEngine {
 
     protected PDPage currentPage;
     protected String ignoredSeparators;
-    protected boolean robust = false;
+    protected boolean splitAllChars = false;
     public Page layout;
 
     public LayoutStripper() throws IOException {
@@ -39,7 +39,7 @@ public class LayoutStripper extends PDFStreamEngine {
         this.resetEngine();
 
         this.currentPage = page;
-        layout = new Page(currentPage, i, this.ignoredSeparators);
+        layout = new Page(currentPage, i,this.splitAllChars, this.ignoredSeparators);
 
         PDResources resources = currentPage.findResources();
         PDStream contents = null;
@@ -80,8 +80,8 @@ public class LayoutStripper extends PDFStreamEngine {
         return c;
     }
 
-    public void setRobust(boolean robust) {
-        this.robust = robust;
+    public void setSplitAllChars(boolean splitAllChars) {
+        this.splitAllChars = splitAllChars;
     }
 
     public Page getLayout() {
index 85c6e9f15b6bef72507ceb614a8d64b58e6d3f82..2420b7d6cc59f6cbf657bb9a2430c674e7357d75 100644 (file)
@@ -16,75 +16,77 @@ import java.util.LinkedList;
  */
 public class Line extends LayoutElement {
 
-       LinkedList<Group> groups;
-       //.
-       float y;
-       float rotation;
-       float scaleX;
-       float scaleY;
-       String ignoredSeparators;
+    LinkedList<Group> groups;
+    //.
+    float y;
+    float rotation;
+    float scaleX;
+    float scaleY;
+    String ignoredSeparators;
+    protected boolean splitAllChars = false;
 
-       public Line(float y, float rotation, float scaleX, float scaleY, String ignoredSeparators) {
-               this.groups = new LinkedList<>();
+    public Line(float y, float rotation, float scaleX, float scaleY, boolean splitAllChars, String ignoredSeparators) {
+        this.groups = new LinkedList<>();
 
-               this.y = y;
-               this.rotation = rotation;
-               this.scaleX = scaleX;
-               this.scaleY = scaleY;
-               this.ignoredSeparators = ignoredSeparators;
-       }
+        this.y = y;
+        this.rotation = rotation;
+        this.scaleX = scaleX;
+        this.scaleY = scaleY;
+        this.splitAllChars = splitAllChars;
+        this.ignoredSeparators = ignoredSeparators;
+    }
 
-       public boolean equals(Line other) {
-               return (y == other.y && rotation == other.rotation && scaleX == other.scaleX && scaleY == other.scaleY);
-       }
+    public boolean equals(Line other) {
+        return (y == other.y && rotation == other.rotation && scaleX == other.scaleX && scaleY == other.scaleY);
+    }
 
-       public boolean equals(float y, float rotation, float scaleX, float scaleY) {
-               if (rotation == 0.0f) {
-                       return this.y == y && this.rotation == rotation && this.scaleX == scaleX && this.scaleY == scaleY;
-               } else {
-                       return this.rotation == rotation && this.scaleX == scaleX && this.scaleY == scaleY;
-               }
-       }
+    public boolean equals(float y, float rotation, float scaleX, float scaleY) {
+        if (rotation == 0.0f) {
+            return this.y == y && this.rotation == rotation && this.scaleX == scaleX && this.scaleY == scaleY;
+        } else {
+            return this.rotation == rotation && this.scaleX == scaleX && this.scaleY == scaleY;
+        }
+    }
 
-       public void addText(float size, float x, float y, float width, float height, String text, float spaceWidth) {
-               Group group = getGroup(size, spaceWidth, rotation);
-               group.addText(x, y, width, height, text);
-       }
+    public void addText(float size, float x, float y, float width, float height, String text, float spaceWidth) {
+        Group group = getGroup(size, spaceWidth, rotation);
+        group.addText(x, y, width, height, text);
+    }
 
-       public String asJSON(PDRectangle cropbox) {
-               if (groups.size() == 0) {
-                       return "";
-               }
-               ArrayList<String> res = new ArrayList<>();
-               for (Group group : groups) {
-                       String g = group.asJSON(cropbox, y, scaleX, scaleY, rotation);
-                       if (!"".equals(g)) {
-                               res.add(g);
-                       }
-               }
-               return String.join(",", res);
-       }
+    public String asJSON(PDRectangle cropbox) {
+        if (groups.size() == 0) {
+            return "";
+        }
+        ArrayList<String> res = new ArrayList<>();
+        for (Group group : groups) {
+            String g = group.asJSON(cropbox, y, scaleX, scaleY, rotation);
+            if (!"".equals(g)) {
+                res.add(g);
+            }
+        }
+        return String.join(",", res);
+    }
 
-       public String asText(PDRectangle cropbox){
-               if (groups.size() == 0) {
-                       return "";
-               }
-               ArrayList<String> res = new ArrayList<>();
-               for (Group group : groups) {
-                       String g = group.asText(cropbox);
-                       if (!"".equals(g)) {
-                               res.add(g);
-                       }
-               }
-               return StringEscapeUtils.escapeXml11(StringUtil.trim(StringUtil.condenseWhite(String.join("", res))));
-       }
+    public String asText(PDRectangle cropbox) {
+        if (groups.size() == 0) {
+            return "";
+        }
+        ArrayList<String> res = new ArrayList<>();
+        for (Group group : groups) {
+            String g = group.asText(cropbox);
+            if (!"".equals(g)) {
+                res.add(g);
+            }
+        }
+        return StringEscapeUtils.escapeXml11(StringUtil.trim(StringUtil.condenseWhite(String.join("", res))));
+    }
 
-       private Group getGroup( float size, float spaceWidth, float rotation) {
-               if (groups.size() == 0 || !groups.getLast().equals(size)) {
-                       Group newGroup = new Group(size,spaceWidth, ignoredSeparators, rotation);
-                       groups.add(newGroup);
-                       return newGroup;
-               }
-               return groups.getLast();
-       }
+    private Group getGroup(float size, float spaceWidth, float rotation) {
+        if (groups.size() == 0 || !groups.getLast().equals(size)) {
+            Group newGroup = new Group(size, spaceWidth,splitAllChars, ignoredSeparators, rotation);
+            groups.add(newGroup);
+            return newGroup;
+        }
+        return groups.getLast();
+    }
 }
index 80195eb7e7f8bf7b913ec8ea43ab6e1efe54ee8d..c458d3e91016bd7d8250409f9ad19cf7808db657 100644 (file)
@@ -33,152 +33,154 @@ import org.apache.pdfbox.util.TextPosition;
  */
 public class Page extends LayoutElement {
 
-       public int pageNumber;
-       public LinkedList<Line> lines;
-       protected HashMap<String, ColorSpace> _cs = new HashMap<>();
-       protected PDPage page;
-       protected PDRectangle cropbox;
-       protected String ignoredSeparators;
-
-       public Page(PDPage page, int pageNumber, String ignoredSeparators) {
-               this.page = page;
-               this.cropbox = page.findCropBox();
-               this.ignoredSeparators = ignoredSeparators;
-
-
-               this.pageNumber = pageNumber;
-               this.lines = new LinkedList<>();
-       }
-
-       public void addText(PDGraphicsState gs, Matrix textLineMatrix, Matrix textMatrix, TextPosition textPosition, String text) throws IOException {
-               PDTextState ts = gs.getTextState();
-               float rotation = new CubeMatrix(textLineMatrix).getRotation();
-               float size = textPosition.getFontSize() * textMatrix.getXScale();
-               float y = round(cropbox.getUpperRightY() - textPosition.getTextPos().getYPosition() - cropbox.getLowerLeftY());
-               float x = textPosition.getTextPos().getXPosition();
-               float width = textPosition.getWidth();
-               float height = textPosition.getHeight();
-               if (width == 0.0f) {
-                       width = textPosition.getWidthDirAdj();
-               }
-
-               if (size == 0.0f) {
-                       System.out.println(text);
-               }
-
-               // Determine l'espace normal dans cette font
-               float spaceWidth = textPosition.getWidthOfSpace();
-
-               float lineScaleX = textLineMatrix.getXScale();
-               float lineScaleY = textLineMatrix.getYScale();
-
-               // On normalise les échelles
-               float minScale = Math.abs(lineScaleX);
-               lineScaleX /= minScale;
-               lineScaleY /= minScale;
-
-               Line line = getLine(y, rotation, lineScaleX, lineScaleY);
-               line.addText(size, x, y, width, height, text, spaceWidth);
-       }
-
-       public String asJSON() {
-               String res = "";
-               res += "[";
-               ArrayList<String> jsonLines = new ArrayList<String>();
-               for (Line line : lines) {
-                       String lineJson = line.asJSON(cropbox);
-                       if ("".equals(lineJson)) {
-                               continue;
-                       }
-                       jsonLines.add(lineJson);
-               }
-               res += String.join(",", jsonLines);
-               res += "]";
-               return res;
-       }
-
-       public String asText() {
-               ArrayList<String> textLines = new ArrayList<String>();
-               for (Line line : lines) {
-                       String lineText = line.asText(this.cropbox);
-                       if ("".equals(lineText)) {
-                               continue;
-                       }
-                       textLines.add(lineText);
-               }
-               return String.join(" ", textLines);
-       }
-
-       public String asHTML() {
-               ArrayList<String> textLines = new ArrayList<String>();
-               for (Line line : lines) {
-                       String lineText = line.asText(this.cropbox);
-                       if ("".equals(lineText)) {
-                               continue;
-                       }
-                       textLines.add(lineText);
-               }
-               return "<div>\n\t<p>" +String.join("</p>\n\t<p>", textLines) + "</p>\n</div>";
-       }
-
-       protected Line getLine(float y, float rotation, float scaleX, float scaleY) {
-               if (lines.size() == 0 || !lines.getLast().equals(y, rotation, scaleX, scaleY)) {
-                       Line newLine = new Line(y, rotation, scaleX, scaleY, ignoredSeparators);
-                       lines.add(newLine);
-                       return newLine;
-               }
-               return lines.getLast();
-       }
-
-       protected String parseColor(PDTextState ts, PDGraphicsState gs)
-                       throws IOException {
-               PDColorState pcs;
-
-               if (ts.getRenderingMode() == PDTextState.RENDERING_MODE_FILL_TEXT) {
-                       pcs = gs.getNonStrokingColor();
-               } else if (ts.getRenderingMode() == PDTextState.RENDERING_MODE_STROKE_TEXT) {
-                       pcs = gs.getStrokingColor();
-               } else if (ts.getRenderingMode() == PDTextState.RENDERING_MODE_NEITHER_FILL_NOR_STROKE_TEXT) {
-                       pcs = gs.getStrokingColor();
-               } else {
-                       pcs = gs.getStrokingColor();
-               }
-
-               ColorSpace cs = getColorSpace(pcs.getColorSpace());
-
-               float[] components = pcs.getJavaColor().getColorComponents(null);
-               float[] componentsRGB = cs.toRGB(components);
-
-               Color c = new Color(0, 0, 0);
-
-               if (componentsRGB.length == 3) {
-                       c = new Color(componentsRGB[0], componentsRGB[1], componentsRGB[2]);
-               } else if (components.length == 4) {
-                       c = new Color(componentsRGB[0], componentsRGB[1], componentsRGB[2],
-                                       componentsRGB[3]);
-               }
-
-               String color = "#" + Integer.toHexString(c.getRGB());
-               return color;
-       }
-
-       protected ColorSpace _loadColorSpace(String path) throws IOException {
-               if (!_cs.containsKey(path)) {
-                       _cs.put(path,
-                                       new ICC_ColorSpace(ICC_Profile.getInstance(ResourceLoader.loadResource(path))));
-               }
-
-               return _cs.get(path);
-
-       }
-
-       protected ColorSpace getColorSpace(PDColorSpace pdfCS) throws IOException {
-               ColorSpace cs = pdfCS.getJavaColorSpace();
-               if (pdfCS.getName().equals("DeviceCMYK")) {
-                       cs = _loadColorSpace("com/adobe/icc/cmyk/USWebCoatedSWOP.icc");
-               }
-
-               return cs;
-
-       }
+    public int pageNumber;
+    public LinkedList<Line> lines;
+    protected HashMap<String, ColorSpace> _cs = new HashMap<>();
+    protected PDPage page;
+    protected PDRectangle cropbox;
+    protected String ignoredSeparators;
+    protected boolean splitAllChars = false;
+
+    public Page(PDPage page, int pageNumber, boolean splitAllChars, String ignoredSeparators) {
+        this.page = page;
+        this.cropbox = page.findCropBox();
+        this.ignoredSeparators = ignoredSeparators;
+        this.splitAllChars = splitAllChars;
+
+
+        this.pageNumber = pageNumber;
+        this.lines = new LinkedList<>();
+    }
+
+    public void addText(PDGraphicsState gs, Matrix textLineMatrix, Matrix textMatrix, TextPosition textPosition, String text) throws IOException {
+        PDTextState ts = gs.getTextState();
+        float rotation = new CubeMatrix(textLineMatrix).getRotation();
+        float size = textPosition.getFontSize() * textMatrix.getXScale();
+        float y = round(cropbox.getUpperRightY() - textPosition.getTextPos().getYPosition() - cropbox.getLowerLeftY());
+        float x = textPosition.getTextPos().getXPosition();
+        float width = textPosition.getWidth();
+        float height = textPosition.getHeight();
+        if (width == 0.0f) {
+            width = textPosition.getWidthDirAdj();
+        }
+
+        if (size == 0.0f) {
+            System.out.println(text);
+        }
+
+        // Determine l'espace normal dans cette font
+        float spaceWidth = textPosition.getWidthOfSpace();
+
+        float lineScaleX = textLineMatrix.getXScale();
+        float lineScaleY = textLineMatrix.getYScale();
+
+        // On normalise les échelles
+        float minScale = Math.abs(lineScaleX);
+        lineScaleX /= minScale;
+        lineScaleY /= minScale;
+
+        Line line = getLine(y, rotation, lineScaleX, lineScaleY);
+        line.addText(size, x, y, width, height, text, spaceWidth);
+    }
+
+    public String asJSON() {
+        String res = "";
+        res += "[";
+        ArrayList<String> jsonLines = new ArrayList<String>();
+        for (Line line : lines) {
+            String lineJson = line.asJSON(cropbox);
+            if ("".equals(lineJson)) {
+                continue;
+            }
+            jsonLines.add(lineJson);
+        }
+        res += String.join(",", jsonLines);
+        res += "]";
+        return res;
+    }
+
+    public String asText() {
+        ArrayList<String> textLines = new ArrayList<String>();
+        for (Line line : lines) {
+            String lineText = line.asText(this.cropbox);
+            if ("".equals(lineText)) {
+                continue;
+            }
+            textLines.add(lineText);
+        }
+        return String.join(" ", textLines);
+    }
+
+    public String asHTML() {
+        ArrayList<String> textLines = new ArrayList<String>();
+        for (Line line : lines) {
+            String lineText = line.asText(this.cropbox);
+            if ("".equals(lineText)) {
+                continue;
+            }
+            textLines.add(lineText);
+        }
+        return "<div>\n\t<p>" + String.join("</p>\n\t<p>", textLines) + "</p>\n</div>";
+    }
+
+    protected Line getLine(float y, float rotation, float scaleX, float scaleY) {
+        if (lines.size() == 0 || !lines.getLast().equals(y, rotation, scaleX, scaleY)) {
+            Line newLine = new Line(y, rotation, scaleX, scaleY,splitAllChars, ignoredSeparators);
+            lines.add(newLine);
+            return newLine;
+        }
+        return lines.getLast();
+    }
+
+    protected String parseColor(PDTextState ts, PDGraphicsState gs)
+            throws IOException {
+        PDColorState pcs;
+
+        if (ts.getRenderingMode() == PDTextState.RENDERING_MODE_FILL_TEXT) {
+            pcs = gs.getNonStrokingColor();
+        } else if (ts.getRenderingMode() == PDTextState.RENDERING_MODE_STROKE_TEXT) {
+            pcs = gs.getStrokingColor();
+        } else if (ts.getRenderingMode() == PDTextState.RENDERING_MODE_NEITHER_FILL_NOR_STROKE_TEXT) {
+            pcs = gs.getStrokingColor();
+        } else {
+            pcs = gs.getStrokingColor();
+        }
+
+        ColorSpace cs = getColorSpace(pcs.getColorSpace());
+
+        float[] components = pcs.getJavaColor().getColorComponents(null);
+        float[] componentsRGB = cs.toRGB(components);
+
+        Color c = new Color(0, 0, 0);
+
+        if (componentsRGB.length == 3) {
+            c = new Color(componentsRGB[0], componentsRGB[1], componentsRGB[2]);
+        } else if (components.length == 4) {
+            c = new Color(componentsRGB[0], componentsRGB[1], componentsRGB[2],
+                    componentsRGB[3]);
+        }
+
+        String color = "#" + Integer.toHexString(c.getRGB());
+        return color;
+    }
+
+    protected ColorSpace _loadColorSpace(String path) throws IOException {
+        if (!_cs.containsKey(path)) {
+            _cs.put(path,
+                    new ICC_ColorSpace(ICC_Profile.getInstance(ResourceLoader.loadResource(path))));
+        }
+
+        return _cs.get(path);
+
+    }
+
+    protected ColorSpace getColorSpace(PDColorSpace pdfCS) throws IOException {
+        ColorSpace cs = pdfCS.getJavaColorSpace();
+        if (pdfCS.getName().equals("DeviceCMYK")) {
+            cs = _loadColorSpace("com/adobe/icc/cmyk/USWebCoatedSWOP.icc");
+        }
+
+        return cs;
+
+    }
 }
index 3720a00d31f4865673fdf4d742dbc407f86518bc..9a3a81d17a73099905c9029235c59cdc3e0fd6f9 100644 (file)
@@ -470,10 +470,8 @@ class PDFTools
             $fwstk->setArg('--ignoreSeparators "' . $ignoreSeparators . '"');
         }
         $fwstk->execute();
-
     }
 
-
     public static function extractHighlightsData($pdf, $out, $mode = 'standard')
     {
         $out .= 'texts';