(OperatorProcessor) klass.newInstance();
registerOperatorProcessor(operator, processor);
} catch (Exception e) {
-
}
}
}
* @param str The string to be processed.
*/
protected String inspectFontEncoding(String str) {
- System.out.println(str);
return str;
}
}
totalCharCnt++;
+ c = mergeLigatures(c);
+
float totalVerticalDisplacementDisp = maxVerticalDisplacementText * fontSizeText * textMatrix.getYScale();
// process the decoded text
}
}
+ /**
+  * Letter sequences that have a dedicated single-character ligature, keyed by
+  * the plain letters and mapped to the replacement character. Built once so
+  * that {@link #mergeLigatures(String)} does not rebuild the table on every call.
+  */
+ private static final HashMap<String, String> LIGATURES = buildLigatures();
+
+ /** Creates the lookup table used by {@link #mergeLigatures(String)}. */
+ private static HashMap<String, String> buildLigatures() {
+     HashMap<String, String> ligatures = new HashMap<>();
+     ligatures.put("AE", "\u00C6");
+     ligatures.put("ae", "\u00E6");
+     ligatures.put("OE", "\u0152");
+     ligatures.put("oe", "\u0153");
+     ligatures.put("IJ", "\u0132");
+     ligatures.put("ij", "\u0133");
+     ligatures.put("ff", "\ufb00");
+     ligatures.put("fi", "\ufb01");
+     ligatures.put("fl", "\ufb02");
+     ligatures.put("ffi", "\ufb03");
+     ligatures.put("ffl", "\ufb04");
+     return ligatures;
+ }
+
+ /**
+  * Collapses a multi-letter string into its single ligature character when the
+  * whole string is exactly one of the known sequences (AE, ae, OE, oe, IJ, ij,
+  * ff, fi, fl, ffi, ffl).
+  *
+  * @param c the decoded text to inspect; may be {@code null}
+  * @return the ligature character as a one-character string when {@code c}
+  *         matches a known sequence exactly; otherwise {@code c} unchanged
+  *         (including {@code null}, empty and single-character input)
+  */
+ public String mergeLigatures(String c) {
+     // Nothing to merge for null, empty or single-character input.
+     if (c == null || c.length() <= 1) {
+         return c;
+     }
+     String ligature = LIGATURES.get(c);
+     return ligature != null ? ligature : c;
+ }
+
public void updateCmaps(String c, byte b, PDFont font) {
}
PDFOperator oper = PDFOperator.getOperator(operation);
processOperator(oper, arguments);
} catch (IOException e) {
- log.warn(e, e);
+ //log.warn(e, e);
}
}
processor.process(operator, arguments);
} else {
if (!unsupportedOperators.contains(operation)) {
- log.info("unsupported/disabled operation: " + operation);
+ //log.info("unsupported/disabled operation: " + operation);
unsupportedOperators.add(operation);
}
}
} catch (Exception e) {
- log.warn(e, e);
+ //log.warn(e, e);
}
}
}\r
\r
public function extractFonts() {\r
- $out = $this->out . 'fonts/pdf';\r
- if (!file_exists($out)) {\r
- mkdir($out, 0777, true);\r
- }\r
- // Extract fonts from PDF\r
- /* $gs = new cubeCommandLine('gs');\r
- $gs->setPath(CONVERTER_PATH);\r
- $gs->cd($out);\r
- $gs->setArg('-dBATCH');\r
- $gs->setArg('-dNOPAUSE');\r
- $gs->setArg('-dNOPROMPT');\r
- $gs->setArg('-dNODISPLAY');\r
- $gs->setArg(null, WS_TOOLS . '/extractFonts.ps');\r
- $gs->setManualArg('-c "(' . $this->in . ') extractFonts quit"');\r
-\r
- $gs->execute();\r
- $this->addToLog($gs); */\r
-\r
- $mupdf = new cubeCommandLine('/usr/local/mupdf/pdfextract');\r
- $mupdf->setPath(CONVERTER_PATH);\r
- $mupdf->cd($out);\r
- $mupdf->setArg(null, $this->in);\r
- $mupdf->execute();\r
- $this->addToLog($mupdf);\r
-\r
-\r
- `rm $out/fonts/*.png`;\r
- `rm $out/fonts/*.pam`;\r
- $dr = opendir($out);\r
-\r
- if (!file_exists($this->out . '/fonts/web')) {\r
- mkdir($this->out . '/fonts/web', 0777, true);\r
- }\r
- $images = array('pnm', 'jpg', 'jpeg', 'png', 'pam');\r
- // Fonts conversion\r
- $collections = array();\r
- while ($file = readdir($dr)) {\r
- if ($file == '.' || $file == '..' || in_array(files::getExtension($file), $images)) {\r
- continue;\r
- }\r
-\r
- $e = explode('.', $file);\r
- array_pop($e);\r
- $fname = implode('.', $e);\r
-\r
- $e = explode('-', $fname);\r
- array_pop($e);\r
- $fname = implode('-', $e);\r
-\r
- if (!isset($collections[$fname])) {\r
- $collections[$fname] = array();\r
- }\r
- $collections[$fname][] = $file;\r
- }\r
-\r
- foreach ($collections as $fontname => $files) {\r
- $fontforge = new cubeCommandLine('convert.pe');\r
- $fontforge->setPath(CONVERTER_PATH);\r
- foreach ($files as $file) {\r
- $fontforge->setArg(null, $out . '/' . $file);\r
- }\r
- $fontforge->setArg(null, $this->out . '/html/' . $fontname . '.cmap');\r
- $fontforge->setArg(null, $this->out . '/fonts/web/' . $fontname . '.ttf');\r
- $fontforge->execute();\r
- $this->addToLog($fontforge);\r
- }\r
+ $extractor = new wsPDFFontExtractor($this->out, $this);\r
+ $extractor->extract();\r
}\r
\r
public function getInfos($in = null, $force = false) {\r
if ($this->CropAndCut()) {\r
$this->getInfos($this->cropped, true);\r
}\r
- $this->getLinksAndTexts();\r
$this->extractFonts();\r
+ $this->getLinksAndTexts();\r
}\r
\r
public function CropAndCut() {\r
--- /dev/null
+<?php
+
+/**
+ * Extracts the fonts embedded in a document's PDF and converts them for web use.
+ *
+ * Works on a document directory that contains "original.pdf". Raw extracted
+ * fonts and the generated ".cmap" files are written to "<dir>/fonts/pdf", the
+ * converted ".ttf" files to "<dir>/fonts/web". The heavy lifting is delegated
+ * to external programs run through cubeCommandLine: pdfinfo, pdfshow and
+ * pdfextract from /usr/local/mupdf, and the "convert.pe" script.
+ */
+class wsPDFFontExtractor {
+
+    /** Path of the PDF that is analysed (<dir>/original.pdf). */
+    protected $in;
+    /** Directory receiving the raw extracted fonts and the .cmap files. */
+    protected $outpdf;
+    /** Directory receiving the converted .ttf fonts. */
+    protected $outweb;
+    /** Font table: font object number => array('name' => ..., 'cmap' => ...). */
+    protected $fonts = array();
+    /** Font name => object number of its /ToUnicode stream. */
+    protected $cmaps = array();
+    /** Optional owner used for logging; must expose addToLog() when set. */
+    protected $doc;
+
+    /**
+     * @param string $in  Document directory (must contain original.pdf).
+     * @param mixed  $doc Optional object whose addToLog() receives log entries;
+     *                    when null, log entries are echoed instead.
+     */
+    public function __construct($in, $doc=null) {
+        $this->doc = $doc;
+        $this->in = $in . '/original.pdf';
+        $this->outpdf = $in . '/fonts/pdf';
+        $this->outweb = $in . '/fonts/web';
+    }
+
+    /**
+     * Runs the whole pipeline: reset the output directories, list the fonts,
+     * read their descriptors, extract the font files, write the cmap files
+     * and finally convert everything to TTF.
+     */
+    public function extract() {
+        $this->clean();
+
+        $this->getFonts();
+        $this->getFontsDescriptors();
+        $this->extractFonts();
+        $this->extractCmaps();
+        $this->convertToTTF();
+    }
+
+    /**
+     * Groups the files found in the raw font directory by font name and runs
+     * "convert.pe" once per group to produce "<outweb>/<fontname>.ttf".
+     *
+     * The font name of a file is its name without the extension and without
+     * the last "-"-separated segment. Files with a "cmap" extension are not
+     * fonts and are skipped; the matching cmap file (or "-" when there is
+     * none) is passed to convert.pe after the font files.
+     */
+    public function convertToTTF() {
+        // Must exist even when the directory holds no font at all.
+        $collections = array();
+
+        $dr = opendir($this->outpdf);
+        if ($dr === false) {
+            return;
+        }
+        // Strict comparison: a file literally named "0" must not end the loop.
+        while (($file = readdir($dr)) !== false) {
+            if ($file == '.' || $file == '..' || files::getExtension($file) == 'cmap') {
+                continue;
+            }
+
+            // Drop the extension ...
+            $e = explode('.', $file);
+            array_pop($e);
+            $fname = implode('.', $e);
+
+            // ... then the trailing "-xxx" segment to get the font name.
+            $e = explode('-', $fname);
+            array_pop($e);
+            $fname = implode('-', $e);
+
+            if (!isset($collections[$fname])) {
+                $collections[$fname] = array();
+            }
+            $collections[$fname][] = $file;
+        }
+        closedir($dr);
+
+        foreach ($collections as $fontname => $files) {
+            $fontforge = new cubeCommandLine('convert.pe');
+            $fontforge->setPath(CONVERTER_PATH);
+            foreach ($files as $file) {
+                $fontforge->setArg(null, $this->outpdf . '/' . $file);
+            }
+            $cmapFile = $this->outpdf . '/' . $fontname . '.cmap';
+            if (!file_exists($cmapFile)) {
+                // "-" is passed in place of the cmap path when no cmap was written.
+                $fontforge->setArg(null, '-');
+            } else {
+                $fontforge->setArg(null, $cmapFile);
+            }
+            $fontforge->setArg(null, $this->outweb . '/' . $fontname . '.ttf');
+            $fontforge->execute();
+            $this->addToLog($fontforge);
+        }
+    }
+
+    /**
+     * Runs pdfinfo on the PDF and fills $this->fonts with one entry per font
+     * object number, holding the font name.
+     */
+    public function getFonts() {
+        $pdfinfo = $this->getCommandLine('pdfinfo');
+        $pdfinfo->setArg('f');
+        $pdfinfo->setArg(null, $this->in);
+        $pdfinfo->execute();
+
+        $this->addToLog($pdfinfo);
+
+        /*
+         * Sample output:
+         *
+         * /home/extranet/www/fluidbook/docs/100376/original.pdf:
+
+          PDF-1.7
+          Info object (9177 0 R):
+          <<
+          /CreationDate (D:20110831090005+02'00')
+          /Creator (Adobe InDesign CS5 \(7.0\))
+          /ModDate (D:20110901144502+02'00')
+          /Producer (Adobe PDF Library 9.9)
+          /Trapped /False
+          >>
+
+          Pages: 116
+
+          Retrieving info from pages 1-116...
+          Fonts (201):
+          1       ( 9180 0 R): Type1 'QGNYEE+AvantGarde-Book' (9193 0 R)
+          1       ( 9180 0 R): Type1 'QGNYEE+AvantGarde-ExtraLight' (9195 0 R)
+          2       ( 1 0 R): Type1 'QGNYEE+DIN-Light' (7605 0 R)
+          2       ( 1 0 R): Type1 'QGNYEE+AvantGarde-ExtraLightObl' (7608 0 R)
+          2       ( 1 0 R): Type1 'QGNYEE+AvantGarde-MediumObl' (7601 0 R)
+
+         */
+
+        // Capture 1: font name, capture 2: object number of the font.
+        // NOTE(review): the name class only allows letters, "-" and "+", so a
+        // font whose name contains digits or other characters is not picked
+        // up - confirm whether that is intended.
+        if (preg_match_all("|\s+\d+\s+\(\s+\d+\s+\d+\s+\w+\):\s+\w+\s+'([A-Za-z-+]+)'\s+\((\d+)\s+\d+\s+\w+\)\s|", $pdfinfo->output, $matches)) {
+            foreach ($matches[1] as $k => $name) {
+                $this->fonts[$matches[2][$k]] = array('name' => $name);
+            }
+        }
+    }
+
+    /**
+     * Dumps every known font object with pdfshow and records, for the fonts
+     * that have one, the object number of their /ToUnicode stream, both in
+     * $this->fonts[<object>]['cmap'] and in $this->cmaps[<font name>].
+     */
+    protected function getFontsDescriptors() {
+        $pdfshow = $this->getCommandLine('pdfshow');
+        $pdfshow->setArg(null, $this->in);
+        $pdfshow->setManualArg(implode(' ', array_keys($this->fonts)));
+        $pdfshow->execute();
+        //$this->doc->addToLog($pdfshow);
+
+        /*
+          Sample output (Widths arrays shortened):
+
+          8677 0 obj
+          <<
+          /BaseFont /YIKUMO+Futura-Condensed
+          /Encoding /WinAnsiEncoding
+          /FirstChar 32
+          /FontDescriptor 7690 0 R
+          /LastChar 119
+          /Subtype /Type1
+          /Type /Font
+          /Widths [ 205 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+          ...
+          0 487 ]
+          >>
+          endobj
+
+          9193 0 obj
+          <<
+          /BaseFont /QGNYEE+AvantGarde-Book
+          /Encoding /WinAnsiEncoding
+          /FirstChar 32
+          /FontDescriptor 9192 0 R
+          /LastChar 201
+          /Subtype /Type1
+          /ToUnicode 9185 0 R
+          /Type /Font
+          /Widths [ 277 0 0 0 0 0 0 0 0 0 0 0 277 332 0 0 554 554 554
+          ...
+          0 740 0 0 0 0 0 0 0 0 536 ]
+          >>
+          endobj
+         */
+
+        $lines = explode("\n", $pdfshow->output);
+        $currentFont = null;
+        foreach ($lines as $line) {
+            $line = trim($line);
+            if ($line == '' || $line == '<<' || $line == '>>') {
+                continue;
+            }
+            if ($line == 'endobj') {
+                $currentFont = null;
+                continue;
+            }
+            $e = explode(' ', $line);
+            // "<number> <generation> obj" opens a new object.
+            if (count($e) == 3 && $e[2] == 'obj') {
+                $currentFont = $e[0];
+                continue;
+            }
+
+            // Ignore lines outside an object or inside an object that is not
+            // one of the fonts listed by getFonts().
+            if (is_null($currentFont) || !isset($this->fonts[$currentFont])) {
+                continue;
+            }
+
+            if ($e[0] == '/ToUnicode' && isset($e[1])) {
+                $fontname = $this->fonts[$currentFont]['name'];
+                $this->fonts[$currentFont]['cmap'] = $e[1];
+                $this->cmaps[$fontname] = $e[1];
+            }
+        }
+
+        $this->addToLog(print_r($this->fonts, true));
+    }
+
+    /**
+     * Runs pdfextract on the PDF (working directory: the raw font directory)
+     * and then deletes the image files found in that directory so that only
+     * fonts remain.
+     */
+    protected function extractFonts() {
+        $pdfextract = $this->getCommandLine('pdfextract');
+        $pdfextract->setArg(null, $this->in);
+        $pdfextract->execute();
+        $this->addToLog($pdfextract);
+
+        // Delete images without going through the shell, so a directory name
+        // containing spaces or shell metacharacters cannot break the command.
+        $formats = array('pnm', 'pam', 'pgm', 'jpg', 'gif', 'jpeg', 'png');
+        foreach ($formats as $f) {
+            $images = glob($this->outpdf . '/*.' . $f);
+            if (!is_array($images)) {
+                continue;
+            }
+            foreach ($images as $image) {
+                if (is_file($image)) {
+                    unlink($image);
+                }
+            }
+        }
+    }
+
+    /**
+     * Dumps every /ToUnicode stream referenced by the fonts with pdfshow,
+     * reads the pairs listed between "beginbfchar" and "endbfchar", and
+     * writes one "<outpdf>/<fontname>.cmap" file per font, one
+     * "<from>\t<to>" line per pair.
+     */
+    protected function extractCmaps() {
+        $cmaps = array_unique($this->cmaps);
+
+        $pdfshow = $this->getCommandLine('pdfshow');
+        $pdfshow->setArg(null, $this->in);
+        $pdfshow->setManualArg(implode(' ', $cmaps));
+        $pdfshow->execute();
+        $this->addToLog($pdfshow);
+
+
+        /* Sample output:
+
+          5266 0 obj
+          <<
+          /Filter /FlateDecode
+          /Length 583
+          >>
+          stream
+          /CIDInit /ProcSet findresource begin
+          12 dict begin
+          begincmap
+          /CIDSystemInfo
+          << /Registry (Adobe)
+          /Ordering (UCS) /Supplement 0 >> def
+          /CMapName /Adobe-Identity-UCS def
+          /CMapType 2 def
+          1 begincodespacerange
+          <0000> <FFFF>
+          endcodespacerange
+          81 beginbfchar
+          <0000> <0020>
+          <0001> <0021>
+          <0005> <0025>
+          <0007> <2019>
+          <0008> <0028>
+          <0009> <0029>
+          <000C> <002C>
+          <000D> <002D>
+          <000E> <002E>
+          <000F> <002F>
+          endbfchar
+          endcmap CMapName currentdict /CMap defineresource pop end end
+          endstream
+          endobj
+
+         */
+
+        $finalCmaps = array();
+
+        $lines = explode("\n", $pdfshow->output);
+        $currentCmap = null;
+        // Names of the fonts that use the stream currently being read.
+        $fonts = array();
+        $inMap = false;
+        foreach ($lines as $line) {
+            $line = trim($line);
+            if ($line == '') {
+                continue;
+            }
+            if ($line == 'endobj') {
+                $currentCmap = null;
+                continue;
+            }
+            if ($line == 'endbfchar') {
+                $inMap = false;
+                continue;
+            }
+
+            $e = explode(' ', $line);
+            if (count($e) == 3 && $e[2] == 'obj') {
+                $currentCmap = $e[0];
+                $fonts = array_keys($this->cmaps, $currentCmap);
+                continue;
+            }
+
+            // The section header is "<count> beginbfchar", i.e. two tokens.
+            if (count($e) == 2 && $e[1] == 'beginbfchar') {
+                $inMap = true;
+                continue;
+            }
+
+            if (!$inMap || is_null($currentCmap) || count($e) < 2) {
+                continue;
+            }
+
+            $from = trim($e[0], '<>');
+            $to = trim($e[1], '<>');
+
+            foreach ($fonts as $fontname) {
+                if (!isset($finalCmaps[$fontname])) {
+                    $finalCmaps[$fontname] = array();
+                }
+
+                $finalCmaps[$fontname][$from] = $to;
+            }
+        }
+
+        foreach ($finalCmaps as $fontname => $cmap) {
+            // Entries are written sorted by target value, highest first.
+            // NOTE(review): presumably the order convert.pe expects - confirm.
+            arsort($cmap);
+            $data = '';
+            foreach ($cmap as $from => $to) {
+                $data.=$from . "\t" . $to . "\n";
+            }
+            file_put_contents($this->outpdf . '/' . $fontname . '.cmap', $data);
+        }
+    }
+
+    /**
+     * Collects the distinct values stored under $param in the font table.
+     *
+     * @param string $param Key to read in each font entry (e.g. 'cmap').
+     * @return array Distinct values, in order of first appearance.
+     */
+    protected function getUniqueId($param) {
+        $res = array();
+        foreach ($this->fonts as $f) {
+            if (isset($f[$param])) {
+                $res[] = $f[$param];
+            }
+        }
+        return array_unique($res);
+    }
+
+    /**
+     * Removes both output directories with everything they contain and
+     * recreates them empty.
+     */
+    protected function clean() {
+        // Paths are quoted so that spaces or shell metacharacters in the
+        // document directory cannot change what "rm -rf" deletes.
+        shell_exec('rm -rf ' . escapeshellarg($this->outpdf));
+        shell_exec('rm -rf ' . escapeshellarg($this->outweb));
+        mkdir($this->outweb, 0777, true);
+        mkdir($this->outpdf, 0777, true);
+    }
+
+    /**
+     * Prepares a command line for one of the mupdf tools, with the raw font
+     * directory as working directory.
+     *
+     * @param string $program Executable name under /usr/local/mupdf/.
+     * @return cubeCommandLine
+     */
+    protected function getCommandLine($program) {
+        $res = new cubeCommandLine('/usr/local/mupdf/' . $program);
+        $res->cd($this->outpdf);
+        $res->setPath(CONVERTER_PATH);
+        return $res;
+    }
+
+    /**
+     * Sends an entry to the owner's log when an owner was given; otherwise
+     * echoes it (command and output for a cubeCommandLine, the value itself
+     * for anything else).
+     *
+     * @param mixed $tolog A cubeCommandLine or a string.
+     */
+    protected function addToLog($tolog) {
+        if (!is_null($this->doc)) {
+            $this->doc->addToLog($tolog);
+        } elseif ($tolog instanceof cubeCommandLine) {
+            echo $tolog->commande . "\n\n";
+            echo $tolog->output . "\n\n";
+        } else {
+            echo $tolog . "\n\n";
+        }
+    }
+
+}
+
+?>