]> _ Git - cubeextranet.git/commitdiff
(no commit message)
authorvincent@cubedesigners.com <vincent@cubedesigners.com@f5622870-0f3c-0410-866d-9cb505b7a8ef>
Sun, 4 Sep 2011 21:57:18 +0000 (21:57 +0000)
committervincent@cubedesigners.com <vincent@cubedesigners.com@f5622870-0f3c-0410-866d-9cb505b7a8ef>
Sun, 4 Sep 2011 21:57:18 +0000 (21:57 +0000)
_project/Rapport.docx
fluidbook/tools/convert.pe
fluidbook/tools/fwstk/src/org/apache/pdfbox/util/PDFStreamEngine.java
inc/ws/Controlleur/class.ws.url.php
inc/ws/Metier/class.ws.document.php
inc/ws/Util/_common.php
inc/ws/Util/class.ws.pdf.fontextractor.php [new file with mode: 0644]

index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..3048e0292ae943bb9e4c3e8d8fe71b412763cec4 100644 (file)
Binary files a/_project/Rapport.docx and b/_project/Rapport.docx differ
index 5980204b0ae8d3d281a871a9abc01eae5f9a57c3..7884e2377efe825d0f0b51b145f8c1143a174f35 100644 (file)
@@ -16,9 +16,10 @@ if($argv[i]!="-")
        while(j<SizeOf(glyphs))
                if(glyphs[j]!="")
                        glyph=StrSplit(glyphs[j],"      ")
-                       u=Strtol(glyph[0],16)
-                       Select(glyph[1])
-                       SetUnicodeValue(UCodePoint(u))
+                       ufrom=Strtol(glyph[0],16)
+                       uto=Strtol(glyph[1],16)
+                       Select(ufrom)
+                       SetUnicodeValue(UCodePoint(uto))
                        j++
                endif
        endloop
index b58fc466850f9f4e9dc18bb333c199c091c4fb58..93a246611517fe74de179fd2ad2fb3280697a217 100644 (file)
@@ -143,7 +143,6 @@ public class PDFStreamEngine {
                                                        (OperatorProcessor) klass.newInstance();
                                        registerOperatorProcessor(operator, processor);
                                } catch (Exception e) {
-                                       
                                }
                        }
                }
@@ -285,7 +284,6 @@ public class PDFStreamEngine {
         * @param str The string to be processed.
         */
        protected String inspectFontEncoding(String str) {
-               System.out.println(str);
                return str;
        }
 
@@ -474,6 +472,8 @@ public class PDFStreamEngine {
                        }
                        totalCharCnt++;
 
+                       c = mergeLigatures(c);
+
                        float totalVerticalDisplacementDisp = maxVerticalDisplacementText * fontSizeText * textMatrix.getYScale();
 
                        // process the decoded text
@@ -496,6 +496,34 @@ public class PDFStreamEngine {
                }
        }
 
+       public String mergeLigatures(String c) {
+               if (c.length() <= 1) {
+                       return c;
+               }
+               if (c.length() >= 2) {
+
+                       HashMap<String, String> ligatures = new HashMap<>();
+                       ligatures.put("AE", "\u00C6");
+                       ligatures.put("ae", "\u00E6");
+                       ligatures.put("OE", "\u0152");
+                       ligatures.put("oe", "\u0153");
+                       ligatures.put("IJ", "\u0132");
+                       ligatures.put("ij", "\u0133");
+                       ligatures.put("ff", "\ufb00");
+                       ligatures.put("fi", "\ufb01");
+                       ligatures.put("fl", "\ufb02");
+                       ligatures.put("ffi", "\ufb03");
+                       ligatures.put("ffl", "\ufb04");
+
+                       if (ligatures.containsKey(c)) {
+                               return ligatures.get(c);
+                       }
+               }
+
+
+               return c;
+       }
+
        public void updateCmaps(String c, byte b, PDFont font) {
        }
 
@@ -512,7 +540,7 @@ public class PDFStreamEngine {
                        PDFOperator oper = PDFOperator.getOperator(operation);
                        processOperator(oper, arguments);
                } catch (IOException e) {
-                       log.warn(e, e);
+                       //log.warn(e, e);
                }
        }
 
@@ -533,12 +561,12 @@ public class PDFStreamEngine {
                                processor.process(operator, arguments);
                        } else {
                                if (!unsupportedOperators.contains(operation)) {
-                                       log.info("unsupported/disabled operation: " + operation);
+                                       //log.info("unsupported/disabled operation: " + operation);
                                        unsupportedOperators.add(operation);
                                }
                        }
                } catch (Exception e) {
-                       log.warn(e, e);
+                       //log.warn(e, e);
                }
        }
 
index ebae1ddc6dac0126e0957926e8ed4f425430ce95..b313c0ede15b7db9ac165c0af856b2a6e14597e6 100644 (file)
@@ -978,6 +978,20 @@ html{height:100%}' . "\n";
                $dao->updateFromObject($doc);\r
        }\r
 \r
+       public static function testFontExtraction($args) {\r
+               $documentId = $args[1];\r
+\r
+               header('Content-type: text/plain');\r
+               ob_clean();\r
+               \r
+               \r
+               \r
+               $extractor = new wsPDFFontExtractor(WS_DOCS . '/' . $documentId);\r
+               $extractor->extract();\r
+\r
+               exit;\r
+       }\r
+\r
        public static function getParamsHelp($args) {\r
                commonDroits::min(5);\r
                global $core;\r
index 5287da6f77d2fd39be6ffc15aae544fb99a473a2..f16fa8fb2c9bcfaf1643c29955499480efc2567b 100644 (file)
@@ -95,72 +95,8 @@ class wsDocument extends cubeMetier {
        }\r
 \r
        public function extractFonts() {\r
-               $out = $this->out . 'fonts/pdf';\r
-               if (!file_exists($out)) {\r
-                       mkdir($out, 0777, true);\r
-               }\r
-               // Extract fonts from PDF\r
-               /* $gs = new cubeCommandLine('gs');\r
-                 $gs->setPath(CONVERTER_PATH);\r
-                 $gs->cd($out);\r
-                 $gs->setArg('-dBATCH');\r
-                 $gs->setArg('-dNOPAUSE');\r
-                 $gs->setArg('-dNOPROMPT');\r
-                 $gs->setArg('-dNODISPLAY');\r
-                 $gs->setArg(null, WS_TOOLS . '/extractFonts.ps');\r
-                 $gs->setManualArg('-c "(' . $this->in . ') extractFonts quit"');\r
-\r
-                 $gs->execute();\r
-                 $this->addToLog($gs); */\r
-\r
-               $mupdf = new cubeCommandLine('/usr/local/mupdf/pdfextract');\r
-               $mupdf->setPath(CONVERTER_PATH);\r
-               $mupdf->cd($out);\r
-               $mupdf->setArg(null, $this->in);\r
-               $mupdf->execute();\r
-               $this->addToLog($mupdf);\r
-\r
-\r
-               `rm $out/fonts/*.png`;\r
-               `rm $out/fonts/*.pam`;\r
-               $dr = opendir($out);\r
-\r
-               if (!file_exists($this->out . '/fonts/web')) {\r
-                       mkdir($this->out . '/fonts/web', 0777, true);\r
-               }\r
-               $images = array('pnm', 'jpg', 'jpeg', 'png', 'pam');\r
-               // Fonts conversion\r
-               $collections = array();\r
-               while ($file = readdir($dr)) {\r
-                       if ($file == '.' || $file == '..' || in_array(files::getExtension($file), $images)) {\r
-                               continue;\r
-                       }\r
-\r
-                       $e = explode('.', $file);\r
-                       array_pop($e);\r
-                       $fname = implode('.', $e);\r
-\r
-                       $e = explode('-', $fname);\r
-                       array_pop($e);\r
-                       $fname = implode('-', $e);\r
-\r
-                       if (!isset($collections[$fname])) {\r
-                               $collections[$fname] = array();\r
-                       }\r
-                       $collections[$fname][] = $file;\r
-               }\r
-\r
-               foreach ($collections as $fontname => $files) {\r
-                       $fontforge = new cubeCommandLine('convert.pe');\r
-                       $fontforge->setPath(CONVERTER_PATH);\r
-                       foreach ($files as $file) {\r
-                               $fontforge->setArg(null, $out . '/' . $file);\r
-                       }\r
-                       $fontforge->setArg(null, $this->out . '/html/' . $fontname . '.cmap');\r
-                       $fontforge->setArg(null, $this->out . '/fonts/web/' . $fontname . '.ttf');\r
-                       $fontforge->execute();\r
-                       $this->addToLog($fontforge);\r
-               }\r
+               $extractor = new wsPDFFontExtractor($this->out, $this);\r
+               $extractor->extract();\r
        }\r
 \r
        public function getInfos($in = null, $force = false) {\r
@@ -319,8 +255,8 @@ class wsDocument extends cubeMetier {
                if ($this->CropAndCut()) {\r
                        $this->getInfos($this->cropped, true);\r
                }\r
-               $this->getLinksAndTexts();\r
                $this->extractFonts();\r
+               $this->getLinksAndTexts();\r
        }\r
 \r
        public function CropAndCut() {\r
index 5753a4fe28e7b1e67dbf629e9840da9c9b21c8a6..65010ba508544ba8d36e0f10cf1ccb700f57335a 100644 (file)
@@ -11,4 +11,5 @@ $__autoload['wsSecureSWF'] = dirname(__FILE__) . '/class.ws.secure.swf.php';
 $__autoload['wsTools'] = dirname(__FILE__) . '/class.ws.tools.php';\r
 $__autoload['wsHTML5Compiler'] = dirname(__FILE__) . '/html5/class.ws.html5.compiler.php';\r
 $__autoload['wsHTML5Link'] = dirname(__FILE__) . '/html5/class.ws.html5.links.php';\r
+$__autoload['wsPDFFontExtractor'] = dirname(__FILE__) . '/class.ws.pdf.fontextractor.php';\r
 ?>
\ No newline at end of file
diff --git a/inc/ws/Util/class.ws.pdf.fontextractor.php b/inc/ws/Util/class.ws.pdf.fontextractor.php
new file mode 100644 (file)
index 0000000..6afff2a
--- /dev/null
@@ -0,0 +1,343 @@
+<?php
+
+class wsPDFFontExtractor {
+
+       protected $in;
+       protected $outpdf;
+       protected $outweb;
+       protected $fonts = array();
+       protected $cmaps = array();
+       protected $doc;
+
+       public function __construct($in, $doc=null) {
+               $this->doc = $doc;
+               $this->in = $in . '/original.pdf';
+               $this->outpdf = $in . '/fonts/pdf';
+               $this->outweb = $in . '/fonts/web';
+       }
+
+       public function extract() {
+               $this->clean();
+
+               $this->getFonts();
+               $this->getFontsDescriptors();
+               $this->extractFonts();
+               $this->extractCmaps();
+               $this->convertToTTF();
+       }
+
+       public function convertToTTF() {
+
+
+
+               $dr = opendir($this->outpdf);
+               while ($file = readdir($dr)) {
+                       if ($file == '.' || $file == '..' || files::getExtension($file) == 'cmap') {
+                               continue;
+                       }
+
+                       $e = explode('.', $file);
+                       array_pop($e);
+                       $fname = implode('.', $e);
+
+                       $e = explode('-', $fname);
+                       array_pop($e);
+                       $fname = implode('-', $e);
+
+                       if (!isset($collections[$fname])) {
+                               $collections[$fname] = array();
+                       }
+                       $collections[$fname][] = $file;
+               }
+
+               foreach ($collections as $fontname => $files) {
+                       $fontforge = new cubeCommandLine('convert.pe');
+                       $fontforge->setPath(CONVERTER_PATH);
+                       foreach ($files as $file) {
+                               $fontforge->setArg(null, $this->outpdf . '/' . $file);
+                       }
+                       $cmapFile = $this->outpdf . '/' . $fontname . '.cmap';
+                       if (!file_exists($cmapFile)) {
+                               $fontforge->setArg(null, '-');
+                       } else {
+                               $fontforge->setArg(null, $cmapFile);
+                       }
+                       $fontforge->setArg(null, $this->outweb . '/' . $fontname . '.ttf');
+                       $fontforge->execute();
+                       $this->addToLog($fontforge);
+               }
+       }
+
+       public function getFonts() {
+               $pdfinfo = $this->getCommandLine('pdfinfo');
+               $pdfinfo->setArg('f');
+               $pdfinfo->setArg(null, $this->in);
+               $pdfinfo->execute();
+
+               $this->addToLog($pdfinfo);
+
+               /*
+                * /home/extranet/www/fluidbook/docs/100376/original.pdf:
+
+                 PDF-1.7
+                 Info object (9177 0 R):
+                 <<
+                 /CreationDate (D:20110831090005+02'00')
+                 /Creator (Adobe InDesign CS5 \(7.0\))
+                 /ModDate (D:20110901144502+02'00')
+                 /Producer (Adobe PDF Library 9.9)
+                 /Trapped /False
+                 >>
+
+                 Pages: 116
+
+                 Retrieving info from pages 1-116...
+                 Fonts (201):
+                 1 (   9180 0 R): Type1 'QGNYEE+AvantGarde-Book' (9193 0 R)
+                 1 (   9180 0 R): Type1 'QGNYEE+AvantGarde-ExtraLight' (9195 0 R)
+                 2 (      1 0 R): Type1 'QGNYEE+DIN-Light' (7605 0 R)
+                 2 (      1 0 R): Type1 'QGNYEE+AvantGarde-ExtraLightObl' (7608 0 R)
+                 2 (      1 0 R): Type1 'QGNYEE+AvantGarde-MediumObl' (7601 0 R)
+
+                */
+
+               if (preg_match_all("|\s+\d+\s+\(\s+\d+\s+\d+\s+\w+\):\s+\w+\s+'([A-Za-z-+]+)'\s+\((\d+)\s+\d+\s+\w+\)\s|", $pdfinfo->output, $matches)) {
+                       foreach ($matches[1] as $k => $name) {
+                               $this->fonts[$matches[2][$k]] = array('name' => $name);
+                       }
+               }
+       }
+
+       protected function getFontsDescriptors() {
+               $pdfshow = $this->getCommandLine('pdfshow');
+               $pdfshow->setArg(null, $this->in);
+               $pdfshow->setManualArg(implode(' ', array_keys($this->fonts)));
+               $pdfshow->execute();
+               //$this->doc->addToLog($pdfshow);
+
+               /*
+                 8677 0 obj
+                 <<
+                 /BaseFont /YIKUMO+Futura-Condensed
+                 /Encoding /WinAnsiEncoding
+                 /FirstChar 32
+                 /FontDescriptor 7690 0 R
+                 /LastChar 119
+                 /Subtype /Type1
+                 /Type /Font
+                 /Widths [ 205 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+                 0 0 0 0 0 0 0 0 0 414 418 0 466 0 342 0 0 0 303 0 0 561 0
+                 0 388 0 0 354 0 0 0 0 0 0 0 0 0 0 0 0 0 380 380 0 380 361
+                 0 382 388 180 0 395 180 590 388 375 380 0 269 276 221 389
+                 0 487 ]
+                 >>
+                 endobj
+
+                 9193 0 obj
+                 <<
+                 /BaseFont /QGNYEE+AvantGarde-Book
+                 /Encoding /WinAnsiEncoding
+                 /FirstChar 32
+                 /FontDescriptor 9192 0 R
+                 /LastChar 201
+                 /Subtype /Type1
+                 /ToUnicode 9185 0 R
+                 /Type /Font
+                 /Widths [ 277 0 0 0 0 0 0 0 0 0 0 0 277 332 0 0 554 554 554
+                 0 0 0 554 554 0 0 0 0 0 0 0 0 0 740 574 813 744 536 485 872
+                 683 226 482 0 462 919 740 869 592 0 607 498 426 655 702 0
+                 0 592 480 0 0 0 0 0 0 683 0 0 0 650 0 0 0 200 0 0 200 0 610
+                 655 0 0 301 388 339 0 554 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1000
+                 0 0 0 0 0 0 0 0 0 0 0 0 351 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+                 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+                 0 740 0 0 0 0 0 0 0 0 536 ]
+                 >>
+                 endobj
+                */
+
+               $lines = explode("\n", $pdfshow->output);
+               $currentFont = null;
+               foreach ($lines as $line) {
+                       $line = trim($line);
+                       if ($line == '' || $line == '<<' || $line == '>>') {
+                               continue;
+                       }
+                       if ($line == 'endobj') {
+                               $currentFont = null;
+                               continue;
+                       }
+                       $e = explode(' ', $line);
+                       if (count($e)==3 && $e[2] == 'obj') {
+                               $currentFont = $e[0];
+                               continue;
+                       }
+
+                       if (is_null($currentFont)) {
+                               continue;
+                       }
+
+                       $fontname = $this->fonts[$currentFont]['name'];
+
+                       if ($e[0] == '/ToUnicode') {
+                               $this->fonts[$currentFont]['cmap'] = $e[1];
+                               $this->cmaps[$fontname] = $e[1];
+                       }
+               }
+
+               $this->addToLog(print_r($this->fonts, true));
+       }
+
+       protected function extractFonts() {
+               $descriptors = $this->getUniqueId('descriptor');
+
+               $pdfextract = $this->getCommandLine('pdfextract');
+               $pdfextract->setArg(null, $this->in);
+               $pdfextract->execute();
+               $this->addToLog($pdfextract);
+
+
+               $formats = array('pnm', 'pam', 'pgm', 'jpg', 'gif', 'jpeg', 'png');
+               foreach ($formats as $f) {
+                       `rm $this->outpdf/*.$f`;
+               }
+       }
+
+       protected function extractCmaps() {
+               $cmaps = array_unique($this->cmaps);
+
+               $pdfshow = $this->getCommandLine('pdfshow');
+               $pdfshow->setArg(null, $this->in);
+               $pdfshow->setManualArg(implode(' ', $cmaps));
+               $pdfshow->execute();
+               $this->addToLog($pdfshow);
+
+
+               /* 5266 0 obj
+                 <<
+                 /Filter /FlateDecode
+                 /Length 583
+                 >>
+                 stream
+                 /CIDInit /ProcSet findresource begin
+                 12 dict begin
+                 begincmap
+                 /CIDSystemInfo
+                 << /Registry (Adobe)
+                 /Ordering (UCS) /Supplement 0 >> def
+                 /CMapName /Adobe-Identity-UCS def
+                 /CMapType 2 def
+                 1 begincodespacerange
+                 <0000> <FFFF>
+                 endcodespacerange
+                 81 beginbfchar
+                 <0000> <0020>
+                 <0001> <0021>
+                 <0005> <0025>
+                 <0007> <2019>
+                 <0008> <0028>
+                 <0009> <0029>
+                 <000C> <002C>
+                 <000D> <002D>
+                 <000E> <002E>
+                 <000F> <002F>
+                 endbfchar
+                 endcmap CMapName currentdict /CMap defineresource pop end end
+                 endstream
+                 endobj
+
+                */
+
+               $finalCmaps = array();
+
+               $lines = explode("\n", $pdfshow->output);
+               $currentCmap = null;
+               $inMap = false;
+               foreach ($lines as $line) {
+                       $line = trim($line);
+                       if ($line == '') {
+                               continue;
+                       }
+                       if ($line == 'endobj') {
+                               $currentCmap = null;
+                               continue;
+                       } elseif ($line == 'endbfchar') {
+                               $inMap = false;
+                       }
+
+                       $e = explode(' ', $line);
+                       if (count($e)==3 && $e[2] == 'obj') {
+                               $currentCmap = $e[0];
+                               continue;
+                       }
+
+                       if (count($e)==1 && $e[1] == 'beginbfchar') {
+                               $inMap = true;
+                               continue;
+                       }
+
+                       if (!$inMap || is_null($currentCmap)) {
+                               continue;
+                       }
+
+                       $fonts = array_keys($this->cmaps, $currentCmap);
+
+                       $from = trim($e[0], '<>');
+                       $to = trim($e[1], '<>');
+
+                       foreach ($fonts as $fontname) {
+                               if (!isset($finalCmaps[$fontname])) {
+                                       $finalCmaps[$fontname] = array();
+                               }
+
+                               $finalCmaps[$fontname][$from] = $to;
+                       }
+               }
+
+               foreach ($finalCmaps as $fontname => $cmap) {
+                       arsort($cmap);
+                       $data = '';
+                       foreach ($cmap as $from => $to) {
+                               $data.=$from . "\t" . $to . "\n";
+                       }
+                       file_put_contents($this->outpdf . '/' . $fontname . '.cmap', $data);
+               }
+       }
+
+       protected function getUniqueId($param) {
+               $res = array();
+               foreach ($this->fonts as $f) {
+                       if (isset($f[$param])) {
+                               $res[$id] = $f[$param];
+                       }
+               }
+               return array_unique($res);
+       }
+
+       protected function clean() {
+               `rm -rf $this->outpdf`;
+               `rm -rf $this->outweb`;
+               mkdir($this->outweb, 0777, true);
+               mkdir($this->outpdf, 0777, true);
+       }
+
+       protected function getCommandLine($program) {
+               $res = new cubeCommandLine('/usr/local/mupdf/' . $program);
+               $res->cd($this->outpdf);
+               $res->setPath(CONVERTER_PATH);
+               return $res;
+       }
+
+       protected function addToLog($tolog) {
+               if (!is_null($this->doc)) {
+                       $this->doc->addToLog($tolog);
+               } elseif ($tolog instanceof cubeCommandLine) {
+                       echo $tolog->commande . "\n\n";
+                       echo $tolog->output . "\n\n";
+               } else {
+                       echo $tolog . "\n\n";
+               }
+       }
+
+}
+
+?>