<option name="autoReloadType" value="SELECTIVE" />
</component>
<component name="ChangeListManager">
- <list default="true" id="f146bc67-2578-4de3-9db2-94d2d43e9e83" name="Default" comment="wip #5410">
- <change beforePath="$PROJECT_DIR$/../../../.idea/deployment.xml" beforeDir="false" afterPath="$PROJECT_DIR$/../../../.idea/deployment.xml" afterDir="false" />
- <change beforePath="$PROJECT_DIR$/../../../.idea/misc.xml" beforeDir="false" afterPath="$PROJECT_DIR$/../../../.idea/misc.xml" afterDir="false" />
- <change beforePath="$PROJECT_DIR$/../../../src/PDFTools.php" beforeDir="false" afterPath="$PROJECT_DIR$/../../../src/PDFTools.php" afterDir="false" />
- </list>
+ <list default="true" id="f146bc67-2578-4de3-9db2-94d2d43e9e83" name="Default" comment="wip #5410" />
<option name="SHOW_DIALOG" value="false" />
<option name="HIGHLIGHT_CONFLICTS" value="true" />
<option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
<configuration name="extract texts" type="Application" factoryName="Application">
<option name="MAIN_CLASS_NAME" value="com.fluidbook.fwstk.Main" />
<module name="fwstk" />
- <option name="PROGRAM_PARAMETERS" value="--input C:\Users\vince\Desktop\original.pdf --extractTextsMethod pdfbox --extractTexts C:\Users\vince\Desktop\%s%d.txt --threads 1" />
+ <option name="PROGRAM_PARAMETERS" value="--input C:\Users\vince\Desktop\20929.pdf --mode robust --extractTextsMethod fluidbook --extractTexts C:\Users\vince\Desktop\20929\%s%d.txt --threads 1" />
<method v="2">
<option name="Make" enabled="true" />
</method>
<workItem from="1692895286414" duration="2259000" />
<workItem from="1692974658841" duration="8000" />
<workItem from="1692974700537" duration="688000" />
- <workItem from="1694090487471" duration="2806000" />
+ <workItem from="1694090487471" duration="6051000" />
</task>
<task id="LOCAL-00001" summary="wip #1111 @0.5">
<created>1487172253077</created>
static Float linkOffsetX = 0.0f;
static Float linkOffsetY = 0.0f;
- public static void main(String[] args) throws IOException,
- COSVisitorException, ClassNotFoundException, NullPointerException,
- InterruptedException {
+ public static void main(String[] args) throws IOException, COSVisitorException, ClassNotFoundException, NullPointerException, InterruptedException {
// An array length is never negative, so compare against 1 to detect a call without any argument.
if (args.length < 1) {
printUsage();
String defined = "";
String linksOutput = "";
String cutmode = "";
- Boolean infos = false;
+ boolean infos = false;
String textsOutput = "";
String imageOutput = "";
String layoutOutput = "";
String ignoredSeparators = "";
String textsExtractionMethod = "pdfbox";
+ boolean robust = false;
PDDocument doc = null;
Integer[] pages = null;
} else if (args[i].trim().compareTo("-v") == 0) {
printVersion();
return;
- } else if (args[i].trim().compareTo("-h") == 0
- || args[i].trim().compareTo("--help") == 0) {
+ } else if (args[i].trim().compareTo("-h") == 0 || args[i].trim().compareTo("--help") == 0) {
printUsage();
return;
} else if (args[i].trim().compareTo("--trim") == 0) {
imageOutput = args[i].trim();
} else if (args[i].trim().compareTo("--ignoreSeparators") == 0) {
i++;
- ignoredSeparators = args[i].trim().replace("{SPACE}"," ");
+ ignoredSeparators = args[i].trim().replace("{SPACE}", " ");
} else if (args[i].trim().compareTo("--linkOffsetX") == 0) {
i++;
linkOffsetX = Float.parseFloat(args[i].trim());
} else if (args[i].trim().compareTo("--threads") == 0) {
i++;
threads = Integer.parseInt(args[i].trim());
+ } else if (args[i].trim().compareTo("--mode") == 0) {
+     i++;
+     // Compare the text with equals(): "==" only tests object identity, so a value read
+     // from the command line would never match the "robust" literal.
+     robust = "robust".equals(args[i].trim());
}
}
doc = openDocument(input);
}
Main m = new Main();
- m.extractTexts(doc, textsOutput, textsExtractionMethod, pages, ignoredSeparators, input);
+ m.extractTexts(doc, textsOutput, textsExtractionMethod, robust, pages, ignoredSeparators, input);
}
if (!"".equals(layoutOutput)) {
if (doc == null) {
doc = openDocument(input);
}
Main m = new Main();
- m.extractLayout(doc, layoutOutput, ignoredSeparators);
+ m.extractLayout(doc, layoutOutput, robust, ignoredSeparators);
}
if (imageOutput.compareTo("") != 0) {
if (doc == null) {
writer.writeImage(doc, "png", "", 1, doc.getNumberOfPages(), imageOutput);
}
- private void extractLayout(PDDocument doc, String layoutOutput, String ignoredSeparators) throws IOException {
+ private void extractLayout(PDDocument doc, String layoutOutput, boolean robust, String ignoredSeparators) throws IOException {
LayoutStripper stripper = null;
List pages = doc.getDocumentCatalog().getAllPages();
while (pagesIter.hasNext()) {
stripper = new LayoutStripper();
+ stripper.setRobust(robust);
stripper.setIgnoredSeparators(ignoredSeparators);
PDPage nextPage = (PDPage) pagesIter.next();
i++;
// For each page, one stripper, otherwise, there is bug with chars widths
stripper.process(nextPage, i);
Page layout = stripper.getLayout();
- BufferedWriter out
- = new BufferedWriter(
- new OutputStreamWriter(
- new FileOutputStream(layoutOutput.replace("%d", ""
- + i)), "UTF-8"));
+ BufferedWriter out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(layoutOutput.replace("%d", "" + i)), "UTF-8"));
out.write(layout.asJSON());
out.close();
}
private static Boolean getInfos(PDDocument doc) throws IOException, COSVisitorException {
ArrayList<String> res = new ArrayList<>();
// General informations
- String[] fields = {"Author", "Title", "CreationDate", "Creator",
- "Keywords", "ModificationDate", "Producer", "Subject",
- "Trapped", "Version"};
+ String[] fields = {"Author", "Title", "CreationDate", "Creator", "Keywords", "ModificationDate", "Producer", "Subject", "Trapped", "Version"};
PDDocumentInformation infos = doc.getDocumentInformation();
for (String k : fields) {
String v = infos.getCustomMetadataValue(k);
}
// Size
- if (page.getRotation() != null
- && (page.getRotation() == 90 || page.getRotation() == 270)) {
- res.add("Page " + numero + " size:\t\t"
- + Math.abs(cropBox.getHeight()) + " pts x "
- + Math.abs(cropBox.getWidth()) + " pts");
+ if (page.getRotation() != null && (page.getRotation() == 90 || page.getRotation() == 270)) {
+ res.add("Page " + numero + " size:\t\t" + Math.abs(cropBox.getHeight()) + " pts x " + Math.abs(cropBox.getWidth()) + " pts");
// Boxes
- res.add("Page " + numero + " CropBox:\t"
- + cropBox.getLowerLeftY() + "\t"
- + cropBox.getUpperRightX() + "\t"
- + cropBox.getUpperRightY() + "\t"
- + cropBox.getLowerLeftX() + "\t");
- res.add("Page " + numero + " MediaBox:\t"
- + mediaBox.getLowerLeftY() + "\t"
- + mediaBox.getUpperRightX() + "\t"
- + mediaBox.getUpperRightY() + "\t"
- + mediaBox.getLowerLeftX() + "\t");
- res.add("Page " + numero + " TrimBox:\t"
- + trimBox.getLowerLeftY() + "\t"
- + trimBox.getUpperRightX() + "\t"
- + trimBox.getUpperRightY() + "\t"
- + trimBox.getLowerLeftX() + "\t");
+ res.add("Page " + numero + " CropBox:\t" + cropBox.getLowerLeftY() + "\t" + cropBox.getUpperRightX() + "\t" + cropBox.getUpperRightY() + "\t" + cropBox.getLowerLeftX() + "\t");
+ res.add("Page " + numero + " MediaBox:\t" + mediaBox.getLowerLeftY() + "\t" + mediaBox.getUpperRightX() + "\t" + mediaBox.getUpperRightY() + "\t" + mediaBox.getLowerLeftX() + "\t");
+ res.add("Page " + numero + " TrimBox:\t" + trimBox.getLowerLeftY() + "\t" + trimBox.getUpperRightX() + "\t" + trimBox.getUpperRightY() + "\t" + trimBox.getLowerLeftX() + "\t");
} else {
- res.add("Page " + numero + " size:\t\t"
- + Math.abs(cropBox.getWidth()) + " pts x "
- + Math.abs(cropBox.getHeight()) + " pts");
+ res.add("Page " + numero + " size:\t\t" + Math.abs(cropBox.getWidth()) + " pts x " + Math.abs(cropBox.getHeight()) + " pts");
// Boxes
- res.add("Page " + numero + " CropBox:\t"
- + cropBox.getLowerLeftX() + "\t"
- + cropBox.getUpperRightY() + "\t"
- + cropBox.getUpperRightX() + "\t"
- + cropBox.getLowerLeftY() + "\t");
- res.add("Page " + numero + " MediaBox:\t"
- + mediaBox.getLowerLeftX() + "\t"
- + mediaBox.getUpperRightY() + "\t"
- + mediaBox.getUpperRightX() + "\t"
- + mediaBox.getLowerLeftY() + "\t");
- res.add("Page " + numero + " TrimBox:\t"
- + trimBox.getLowerLeftX() + "\t"
- + trimBox.getUpperRightY() + "\t"
- + trimBox.getUpperRightX() + "\t"
- + trimBox.getLowerLeftY() + "\t");
+ res.add("Page " + numero + " CropBox:\t" + cropBox.getLowerLeftX() + "\t" + cropBox.getUpperRightY() + "\t" + cropBox.getUpperRightX() + "\t" + cropBox.getLowerLeftY() + "\t");
+ res.add("Page " + numero + " MediaBox:\t" + mediaBox.getLowerLeftX() + "\t" + mediaBox.getUpperRightY() + "\t" + mediaBox.getUpperRightX() + "\t" + mediaBox.getLowerLeftY() + "\t");
+ res.add("Page " + numero + " TrimBox:\t" + trimBox.getLowerLeftX() + "\t" + trimBox.getUpperRightY() + "\t" + trimBox.getUpperRightX() + "\t" + trimBox.getLowerLeftY() + "\t");
}
}
return changes;
}
- private static void addLabels(ArrayList<String> res, PDPageLabels labels,
- int pages) {
+ private static void addLabels(ArrayList<String> res, PDPageLabels labels, int pages) {
// Get the raw list
String[] labelList = labels.getLabelsByPageIndices();
rangeRef = range;
}
}
- res.add("NumberSectionsDelimiters:\t\t"
- + delimiters.substring(0, delimiters.length() - 1));
+ res.add("NumberSectionsDelimiters:\t\t" + delimiters.substring(0, delimiters.length() - 1));
}
- private static void addBookmark(PDDocument doc, ArrayList<String> res,
- PDOutlineNode bookmark, int level) throws IOException {
+ /**
+  * Appends the outline entries found below {@code bookmark} to {@code res}.
+  * <p>
+  * For each child, three lines are added (HTML-escaped BookmarkTitle, BookmarkLevel and
+  * BookmarkPage) and the method then recurses into that child with {@code level + 1}.
+  * An exception raised while handling one child is reported on stderr and iteration
+  * continues with the next sibling; lines already added for that child stay in {@code res}.
+  *
+  * @param doc      document handed to getPageFromAction to resolve the target page
+  * @param res      output list receiving the generated lines
+  * @param bookmark outline node whose children are walked
+  * @param level    depth written as BookmarkLevel for the direct children
+  */
+ private static void addBookmark(PDDocument doc, ArrayList<String> res, PDOutlineNode bookmark, int level) throws IOException {
PDOutlineItem current = bookmark.getFirstChild();
while (current != null) {
try {
res.add("BookmarkTitle:\t\t" + StringEscapeUtils.escapeHtml4(current.getTitle()));
res.add("BookmarkLevel:\t\t" + level);
- res.add("BookmarkPage:\t\t"
- + getPageFromAction(doc, current.getAction()));
+ res.add("BookmarkPage:\t\t" + getPageFromAction(doc, current.getAction()));
addBookmark(doc, res, current, level + 1);
- }catch (Exception e){
+ } catch (Exception e) {
+ // Best effort: keep walking the outline, but leave a trace instead of hiding the failure.
+ System.err.println("Skip bookmark :: " + e);
}
current = current.getNextSibling();
}
}
- private static void cutDocument(PDDocument doc, String input,
- String output, String cutmode) throws COSVisitorException,
- IOException {
+ private static void cutDocument(PDDocument doc, String input, String output, String cutmode) throws COSVisitorException, IOException {
- System.out.println("Cut document of " + doc.getNumberOfPages()
- + " with mode " + cutmode);
+ System.out.println("Cut document of " + doc.getNumberOfPages() + " with mode " + cutmode);
ArrayList<PDDocument> copies = duplicatePages(doc, input, cutmode);
cutPages(doc, cutmode);
float decalage = ((float) decalageCrans) * w;
newbox.move(decalage, 0f);
- System.out.println("Set cropbox of page " + page + " from "
- + pdfPage.getCropBox() + " to " + newbox + " (offset : "
- + decalage + ")");
+ System.out.println("Set cropbox of page " + page + " from " + pdfPage.getCropBox() + " to " + newbox + " (offset : " + decalage + ")");
pdfPage.setCropBox(newbox);
pdfPage.setMediaBox(newbox);
}
}
- private static ArrayList<PDDocument> duplicatePages(PDDocument doc,
- String input, String cutmode) throws IOException {
+ private static ArrayList<PDDocument> duplicatePages(PDDocument doc, String input, String cutmode) throws IOException {
List<PDPage> pageList = doc.getDocumentCatalog().getAllPages();
int originalSize = pageList.size();
System.out.println("Skip page " + page);
continue;
}
- System.out.println("Duplicate page " + page + " :: cursor is at "
- + cursor);
+ System.out.println("Duplicate page " + page + " :: cursor is at " + cursor);
// Duplicate page
for (int j = 0; j < duplicateTime; j++) {
List<PDPage> l = copies.get(j).getDocumentCatalog().getAllPages();
}
- private void extractTexts(PDDocument doc, String textsOutput, String method,
- Integer[] pages, String ignoredSeparators, String input) throws IOException, ClassNotFoundException,
- NullPointerException, InterruptedException {
+ private void extractTexts(PDDocument doc, String textsOutput, String method, boolean robust, Integer[] pages, String ignoredSeparators, String input) throws IOException, ClassNotFoundException, NullPointerException, InterruptedException {
long s = Calendar.getInstance().getTimeInMillis();
PDDocument[] tdoc = new PDDocument[threads];
PDDocument d;
- int totalThreads = Math.max(1,
- Math.min(Math.round(pages.length / 50.0f), Main.threads));
+ int totalThreads = Math.max(1, Math.min(Math.round(pages.length / 50.0f), Main.threads));
System.out.println("Total threads " + totalThreads);
d = doc;
}
- TextsThread t = new TextsThread(d, 0.5f, textsOutput, method, pages, i, totalThreads, ignoredSeparators, input);
+ TextsThread t = new TextsThread(d, 0.5f, textsOutput, method, robust, pages, i, totalThreads, ignoredSeparators, input);
t.setPriority(Thread.MIN_PRIORITY);
t.start();
tlist[i] = t;
}
}
- System.out.println("Extraction des textes with " + method + " : "
- + ((Calendar.getInstance().getTimeInMillis() - s) / 1000)
- + "s");
+ System.out.println("Extraction des textes with " + method + " : " + ((Calendar.getInstance().getTimeInMillis() - s) / 1000) + "s");
}
- public static void updateCropBox(PDDocument doc, String output,
- String refbox, Integer[] pages, String defined) throws IOException,
- COSVisitorException {
+ public static void updateCropBox(PDDocument doc, String output, String refbox, Integer[] pages, String defined) throws IOException, COSVisitorException {
System.out.println("updateCropBox");
if (!"".equals(defined)) {
updateCropBoxDefined(doc, defined);
return;
}
- private static void updateCropBoxDefined(PDDocument doc, String defined)
- throws IOException, COSVisitorException {
+ private static void updateCropBoxDefined(PDDocument doc, String defined) throws IOException, COSVisitorException {
// String.split() takes a regular expression; a bare "*" is a dangling quantifier and throws
// PatternSyntaxException, so escape it to split on the literal asterisk.
String[] e = defined.split("\\*");
for (int i = 0; i < e.length; i++) {
String[] e1 = e[i].split(",");
return box;
}
- public static void extractLinks(PDDocument doc, String linksOutput,
- Integer[] pages) throws IOException {
+ public static void extractLinks(PDDocument doc, String linksOutput, Integer[] pages) throws IOException {
List<PDPage> pageList = doc.getDocumentCatalog().getAllPages();
}
}
- private static ArrayList<Link> extractLinksOfPage(PDDocument doc, int pageNumber, PDPage p,
- String file) throws IOException {
+ /**
+  * Extracts the links of one page and saves them to a per-page file.
+  * <p>
+  * Delegates to the three-argument overload, then passes the result to saveLinks using
+  * {@code file} with its first "%d" replaced by {@code pageNumber}.
+  *
+  * @return the list produced by the three-argument overload
+  */
+ private static ArrayList<Link> extractLinksOfPage(PDDocument doc, int pageNumber, PDPage p, String file) throws IOException {
ArrayList<Link> listLinks = extractLinksOfPage(doc, pageNumber, p);
saveLinks(file.replaceFirst("%d", "" + pageNumber), listLinks);
return listLinks;
}
- public static void saveLinks(String file, ArrayList<Link> listLinks)
- throws IOException {
+ public static void saveLinks(String file, ArrayList<Link> listLinks) throws IOException {
FileIO out = new FileIO(file);
out.open("w");
out.output.writeBytes(Link.header());
}
public static String getPageFromAction(PDDocument doc, PDAction a) throws IOException {
- if(a instanceof PDActionRemoteGoTo){
+ if (a instanceof PDActionRemoteGoTo) {
return "-1";
}
PDActionGoTo aa = (PDActionGoTo) a;
return "-1";
}
- public static ArrayList<Link> extractLinksOfPage(PDDocument doc, int pageNumber, PDPage p)
- throws IOException {
+ public static ArrayList<Link> extractLinksOfPage(PDDocument doc, int pageNumber, PDPage p) throws IOException {
System.out.println(pageNumber);
ArrayList<Link> listLinks = new ArrayList<>();
Link myLink;
System.out.println(link.getRectangle().getHeight());
myLink.rect = link.getRectangle();
- if (myLink.rect.getWidth() == 0.0
- || myLink.rect.getHeight() == 0.0) {
+ if (myLink.rect.getWidth() == 0.0 || myLink.rect.getHeight() == 0.0) {
System.out.println("Skip link :: surface == 0");
continue;
}
public class TextsThread extends Thread {
- protected PDDocument doc;
- protected int threadIndex;
- protected String textsOutput;
- protected Integer[] pages;
- protected int totalThreads;
- protected float tolerance;
- protected String ignoredSeparators = "";
- protected String docURL;
- protected String[] trimchars;
- protected String method;
-
- TextsThread(PDDocument doc, float tolerance, String textsOutput, String method,
- Integer[] pages, int index, int totalThreads, String ignoredSeparators, String docURL) {
- this.doc = doc;
- this.threadIndex = index;
- this.textsOutput = textsOutput;
- this.pages = pages;
- this.totalThreads = totalThreads;
- this.tolerance = tolerance;
- this.ignoredSeparators = ignoredSeparators;
- this.docURL = docURL;
- this.method = method;
-
- trimchars = new String[1];
- trimchars[0] = " ";
-
- }
-
- @Override
- public void run() {
- String index;
- String pindex;
-
- String memoHTML = "";
- String html = "";
- String phtml = "";
-
- String text = "";
- String ptext = "";
-
- Boolean sortByPosition = false;
- Boolean separateByBeads = false;
- Boolean suppressDuplicate = true;
-
- for (Integer i : pages) {
- if (i % totalThreads != threadIndex) {
- continue;
- }
-
- System.out.println("Parsing page " + i+" with "+method);
- String file = textsOutput.replaceFirst("%d", "" + i);
-
- // Poppler
- BufferedWriter out;
-
- if ("poppler".equals(method)) {
- try {
- Process proc;
- proc = Runtime.getRuntime().exec("pdftotext -f " + i + " -l " + i + " -enc UTF-8 -eol unix -nopgbrk " + this.docURL + " - ");
-
- InputStream output = proc.getInputStream();
- StringWriter writer = new StringWriter();
- IOUtils.copy(output, writer, "UTF-8");
- phtml = writer.toString();
- phtml = cleanhtml(phtml);
-
- ptext = html2text(phtml);
- pindex = text2index(ptext);
- ptext = StringUtil.removeAccents(ptext);
-
- String ppfile = file.replaceFirst("%s", "pp");
- String pifile = file.replaceFirst("%s", "pi");
- String phfile = file.replaceFirst("%s", "ph");
-
- try {
- out = new BufferedWriter(new OutputStreamWriter(
- new FileOutputStream(ppfile), "UTF8"));
- out.write(ptext);
- out.close();
-
- out = new BufferedWriter(new OutputStreamWriter(
- new FileOutputStream(pifile), "UTF8"));
- out.write(pindex);
- out.close();
-
- out = new BufferedWriter(new OutputStreamWriter(
- new FileOutputStream(phfile), "UTF8"));
- out.write(phtml);
- out.close();
- } catch (UnsupportedEncodingException e) {
- e.printStackTrace();
- } catch (FileNotFoundException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- }
-
- phtml = "";
- pindex = "";
- ptext = "";
- html = "";
- text = "";
- index = "";
-
- } catch (IOException ex) {
- ex.printStackTrace();
- }
- } else if ("pdfbox".equals(method)) {
-
- /// PDFBOX
-
- try {
- CustomStripper htmlStripper;
- htmlStripper = new CustomStripper("UTF-8");
-
- htmlStripper.setSortByPosition(sortByPosition);
- htmlStripper.setShouldSeparateByBeads(separateByBeads);
- htmlStripper.setAverageCharTolerance(tolerance);
- htmlStripper.setSpacingTolerance(tolerance);
- htmlStripper.setStartPage(i);
- htmlStripper.setEndPage(i);
- htmlStripper.setSuppressDuplicateOverlappingText(suppressDuplicate);
- html = htmlStripper.getHTML(doc);
- html = cleanhtml(html);
-
- text = htmlStripper.getText(doc);
- text = html2text(text);
- index = text2index(text);
-
- text = StringUtil.removeAccents(text);
- String pfile = file.replaceFirst("%s", "p");
- String ifile = file.replaceFirst("%s", "i");
- String hfile = file.replaceFirst("%s", "h");
-
- try {
- out = new BufferedWriter(new OutputStreamWriter(
- new FileOutputStream(pfile), "UTF8"));
- out.write(text);
- out.close();
-
- out = new BufferedWriter(new OutputStreamWriter(
- new FileOutputStream(ifile), "UTF8"));
- out.write(index);
- out.close();
-
- out = new BufferedWriter(new OutputStreamWriter(
- new FileOutputStream(hfile), "UTF8"));
- out.write(html);
- out.close();
- } catch (UnsupportedEncodingException e) {
-
- e.printStackTrace();
- } catch (FileNotFoundException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- }
- } catch (IOException ex) {
- ex.printStackTrace();
- }
-
- memoHTML = html;
- } else if ("fluidbook".equals(method)) {
- // Fluidbook
- try {
- LayoutStripper layoutStripper;
- layoutStripper = new LayoutStripper();
- layoutStripper.setIgnoredSeparators(ignoredSeparators);
- layoutStripper.process((PDPage) doc.getDocumentCatalog().getAllPages().get(i - 1), i);
-
- String fbhtml = layoutStripper.getLayout().asHTML();
- String fbtext = layoutStripper.getLayout().asText();
-
- String fpfile = file.replaceFirst("%s", "fp");
- String fifile = file.replaceFirst("%s", "fi");
- String fhfile = file.replaceFirst("%s", "fh");
-
- fbtext = html2text(fbtext);
-
- String fbindex = text2index(fbtext);
-
- try {
- out = new BufferedWriter(new OutputStreamWriter(
- new FileOutputStream(fpfile), "UTF8"));
- out.write(fbtext);
- out.close();
- out = new BufferedWriter(new OutputStreamWriter(
- new FileOutputStream(fifile), "UTF8"));
- out.write(fbindex);
- out.close();
- out = new BufferedWriter(new OutputStreamWriter(
- new FileOutputStream(fhfile), "UTF8"));
- out.write(fbhtml);
- out.close();
- } catch (UnsupportedEncodingException e) {
- e.printStackTrace();
- } catch (FileNotFoundException e) {
- e.printStackTrace();
- }
- } catch (Exception ex) {
- ex.printStackTrace();
- }
- }
- }
- }
-
- protected String html2text(String text) {
- text = StringUtil.separateLigatures(text);
- text = text.toLowerCase();
- text = StringUtil.removeControl(text);
- text = StringUtil.removePoints(text, ignoredSeparators);
- text = StringUtil.condenseWhite(text);
- text = StringUtil.trim(text, trimchars);
- return text;
- }
-
- protected String cleanhtml(String html) {
- return StringUtil.normalizeWhite(html);
- }
-
- protected String text2index(String text) {
- TreeMap<String, Word> index = new TreeMap<>();
-
- String[] words = text.split(" ");
- for (String word : words) {
- String woa = StringUtil.removeAccents(word);
- if (index.containsKey(woa)) {
- index.get(woa).addWord(word);
- } else {
- Word w = new Word();
- w.addWord(word);
- index.put(woa, w);
- }
- }
- String indexString = "";
-
- for (Entry<String, Word> e : index.entrySet()) {
- indexString += e.getKey() + "," + e.getValue().toString()
- + "\n";
- }
- if (indexString.length() > 0) {
- indexString = indexString.substring(0,
- indexString.length() - 1);
- }
-
- return indexString;
- }
+ protected PDDocument doc;
+ protected int threadIndex;
+ protected String textsOutput;
+ protected Integer[] pages;
+ protected int totalThreads;
+ protected float tolerance;
+ protected String ignoredSeparators = "";
+ protected String docURL;
+ protected String[] trimchars;
+ protected String method;
+
+ protected boolean robust;
+
+ /**
+  * Creates a worker that extracts the texts of a share of {@code pages}.
+  *
+  * @param doc               document read by the pdfbox and fluidbook methods
+  * @param tolerance         value given to the pdfbox stripper as char and spacing tolerance
+  * @param textsOutput       output file pattern containing "%d" and "%s" placeholders
+  * @param method            extraction method tested in run(): "poppler", "pdfbox" or "fluidbook"
+  * @param robust            flag forwarded to the LayoutStripper by the fluidbook method
+  * @param pages             page numbers to consider
+  * @param index             index of this thread; run() keeps pages where page % totalThreads == index
+  * @param totalThreads      number of threads sharing {@code pages}
+  * @param ignoredSeparators separators handed to StringUtil.removePoints and to the LayoutStripper
+  * @param docURL            document path passed to pdftotext by the poppler method
+  */
+ TextsThread(PDDocument doc, float tolerance, String textsOutput, String method, boolean robust,
+         Integer[] pages, int index, int totalThreads, String ignoredSeparators, String docURL) {
+     // Source document and extraction settings
+     this.doc = doc;
+     this.docURL = docURL;
+     this.method = method;
+     this.robust = robust;
+     this.tolerance = tolerance;
+     this.ignoredSeparators = ignoredSeparators;
+     this.textsOutput = textsOutput;
+
+     // Distribution of the pages between the threads
+     this.pages = pages;
+     this.threadIndex = index;
+     this.totalThreads = totalThreads;
+
+     // Single-space entry that html2text() passes to StringUtil.trim
+     this.trimchars = new String[]{" "};
+ }
+
+ @Override
+ public void run() {
+ String index;
+ String pindex;
+
+ String memoHTML = "";
+ String html = "";
+ String phtml = "";
+
+ String text = "";
+ String ptext = "";
+
+ Boolean sortByPosition = false;
+ Boolean separateByBeads = false;
+ Boolean suppressDuplicate = true;
+
+ for (Integer i : pages) {
+ if (i % totalThreads != threadIndex) {
+ continue;
+ }
+
+ System.out.println("Parsing page " + i + " with " + method);
+ String file = textsOutput.replaceFirst("%d", "" + i);
+
+ // Poppler
+ BufferedWriter out;
+
+ if ("poppler".equals(method)) {
+ try {
+ Process proc;
+ proc = Runtime.getRuntime().exec("pdftotext -f " + i + " -l " + i + " -enc UTF-8 -eol unix -nopgbrk " + this.docURL + " - ");
+
+ InputStream output = proc.getInputStream();
+ StringWriter writer = new StringWriter();
+ IOUtils.copy(output, writer, "UTF-8");
+ phtml = writer.toString();
+ phtml = cleanhtml(phtml);
+
+ ptext = html2text(phtml);
+ pindex = text2index(ptext);
+ ptext = StringUtil.removeAccents(ptext);
+
+ String ppfile = file.replaceFirst("%s", "pp");
+ String pifile = file.replaceFirst("%s", "pi");
+ String phfile = file.replaceFirst("%s", "ph");
+
+ try {
+ out = new BufferedWriter(new OutputStreamWriter(
+ new FileOutputStream(ppfile), "UTF8"));
+ out.write(ptext);
+ out.close();
+
+ out = new BufferedWriter(new OutputStreamWriter(
+ new FileOutputStream(pifile), "UTF8"));
+ out.write(pindex);
+ out.close();
+
+ out = new BufferedWriter(new OutputStreamWriter(
+ new FileOutputStream(phfile), "UTF8"));
+ out.write(phtml);
+ out.close();
+ } catch (UnsupportedEncodingException e) {
+ e.printStackTrace();
+ } catch (FileNotFoundException e) {
+ e.printStackTrace();
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+
+ phtml = "";
+ pindex = "";
+ ptext = "";
+ html = "";
+ text = "";
+ index = "";
+
+ } catch (IOException ex) {
+ ex.printStackTrace();
+ }
+ } else if ("pdfbox".equals(method)) {
+
+ /// PDFBOX
+
+ try {
+ CustomStripper htmlStripper;
+ htmlStripper = new CustomStripper("UTF-8");
+
+ htmlStripper.setSortByPosition(sortByPosition);
+ htmlStripper.setShouldSeparateByBeads(separateByBeads);
+ htmlStripper.setAverageCharTolerance(tolerance);
+ htmlStripper.setSpacingTolerance(tolerance);
+ htmlStripper.setStartPage(i);
+ htmlStripper.setEndPage(i);
+ htmlStripper.setSuppressDuplicateOverlappingText(suppressDuplicate);
+ html = htmlStripper.getHTML(doc);
+ html = cleanhtml(html);
+
+ text = htmlStripper.getText(doc);
+ text = html2text(text);
+ index = text2index(text);
+
+ text = StringUtil.removeAccents(text);
+ String pfile = file.replaceFirst("%s", "p");
+ String ifile = file.replaceFirst("%s", "i");
+ String hfile = file.replaceFirst("%s", "h");
+
+ try {
+ out = new BufferedWriter(new OutputStreamWriter(
+ new FileOutputStream(pfile), "UTF8"));
+ out.write(text);
+ out.close();
+
+ out = new BufferedWriter(new OutputStreamWriter(
+ new FileOutputStream(ifile), "UTF8"));
+ out.write(index);
+ out.close();
+
+ out = new BufferedWriter(new OutputStreamWriter(
+ new FileOutputStream(hfile), "UTF8"));
+ out.write(html);
+ out.close();
+ } catch (UnsupportedEncodingException e) {
+
+ e.printStackTrace();
+ } catch (FileNotFoundException e) {
+ e.printStackTrace();
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ } catch (IOException ex) {
+ ex.printStackTrace();
+ }
+
+ memoHTML = html;
+ } else if ("fluidbook".equals(method)) {
+ // Fluidbook
+ try {
+ LayoutStripper layoutStripper;
+ layoutStripper = new LayoutStripper();
+ layoutStripper.setRobust(robust);
+ layoutStripper.setIgnoredSeparators(ignoredSeparators);
+ layoutStripper.process((PDPage) doc.getDocumentCatalog().getAllPages().get(i - 1), i);
+
+ String fbhtml = layoutStripper.getLayout().asHTML();
+ String fbtext = layoutStripper.getLayout().asText();
+
+ String fpfile = file.replaceFirst("%s", "fp");
+ String fifile = file.replaceFirst("%s", "fi");
+ String fhfile = file.replaceFirst("%s", "fh");
+
+ fbtext = html2text(fbtext);
+
+ String fbindex = text2index(fbtext);
+
+ try {
+ out = new BufferedWriter(new OutputStreamWriter(
+ new FileOutputStream(fpfile), "UTF8"));
+ out.write(fbtext);
+ out.close();
+ out = new BufferedWriter(new OutputStreamWriter(
+ new FileOutputStream(fifile), "UTF8"));
+ out.write(fbindex);
+ out.close();
+ out = new BufferedWriter(new OutputStreamWriter(
+ new FileOutputStream(fhfile), "UTF8"));
+ out.write(fbhtml);
+ out.close();
+ } catch (UnsupportedEncodingException e) {
+ e.printStackTrace();
+ } catch (FileNotFoundException e) {
+ e.printStackTrace();
+ }
+ } catch (Exception ex) {
+ ex.printStackTrace();
+ }
+ }
+ }
+ }
+
+ protected String html2text(String text) {
+ text = StringUtil.separateLigatures(text);
+ text = text.toLowerCase();
+ text = StringUtil.removeControl(text);
+ text = StringUtil.removePoints(text, ignoredSeparators);
+ text = StringUtil.condenseWhite(text);
+ text = StringUtil.trim(text, trimchars);
+ return text;
+ }
+
+ protected String cleanhtml(String html) {
+ return StringUtil.normalizeWhite(html);
+ }
+
+ /**
+  * Builds the word index of {@code text}.
+  * <p>
+  * The text is split on single spaces. Words are grouped under their accent-free form
+  * (StringUtil.removeAccents) in a sorted map, and each group is rendered as
+  * {@code key,value} where value is the Word's toString().
+  *
+  * @param text space-separated words
+  * @return one line per key in sorted key order, joined by "\n" with no trailing newline
+  */
+ protected String text2index(String text) {
+     TreeMap<String, Word> index = new TreeMap<>();
+
+     for (String word : text.split(" ")) {
+         String woa = StringUtil.removeAccents(word);
+         // Single lookup instead of containsKey() followed by get()
+         Word entry = index.get(woa);
+         if (entry == null) {
+             entry = new Word();
+             index.put(woa, entry);
+         }
+         entry.addWord(word);
+     }
+
+     // StringBuilder avoids re-copying the whole index string for every entry
+     StringBuilder indexString = new StringBuilder();
+     for (Entry<String, Word> e : index.entrySet()) {
+         if (indexString.length() > 0) {
+             indexString.append('\n');
+         }
+         indexString.append(e.getKey()).append(',').append(e.getValue().toString());
+     }
+
+     return indexString.toString();
+ }
}
protected PDPage currentPage;
protected String ignoredSeparators;
+ protected boolean robust = false;
public Page layout;
public LayoutStripper() throws IOException {
/**
 * Replaces a string by a plain space when its first code point is 8201 (U+2009, thin space)
 * or is at most 8; otherwise returns {@code c} unchanged.
 * <p>
 * Only the first code point is inspected. {@code c} must not be empty, because
 * codePointAt(0) throws on an empty string.
 */
protected String normalizeSpace(String c) {
int code = c.codePointAt(0);
- if (code == 8201 || code <= 8 ) {
+ if (code == 8201 || code <= 8) {
return " ";
}
return c;
}
+ /**
+  * Stores the robust flag on this stripper (the field defaults to false).
+  * NOTE(review): how the flag changes processing is not visible in this excerpt.
+  */
+ public void setRobust(boolean robust) {
+ this.robust = robust;
+ }
+
/**
 * Returns the current value of the {@code layout} field.
 * NOTE(review): presumably populated by process(); confirm, as that code is not shown here.
 */
public Page getLayout() {
return layout;
}