From: vincent@cubedesigners.com Date: Sun, 4 Sep 2011 21:57:18 +0000 (+0000) Subject: (no commit message) X-Git-Url: http://git.cubedesigners.com/?a=commitdiff_plain;h=eaa37daef468cededc1adcdcd8cdb07180484a71;p=cubeextranet.git --- diff --git a/_project/Rapport.docx b/_project/Rapport.docx index e69de29bb..3048e0292 100644 Binary files a/_project/Rapport.docx and b/_project/Rapport.docx differ diff --git a/fluidbook/tools/convert.pe b/fluidbook/tools/convert.pe index 5980204b0..7884e2377 100644 --- a/fluidbook/tools/convert.pe +++ b/fluidbook/tools/convert.pe @@ -16,9 +16,10 @@ if($argv[i]!="-") while(j= 2) { + + HashMap ligatures = new HashMap<>(); + ligatures.put("AE", "\u00C6"); + ligatures.put("ae", "\u00E6"); + ligatures.put("OE", "\u0152"); + ligatures.put("oe", "\u0153"); + ligatures.put("IJ", "\u0132"); + ligatures.put("ij", "\u0133"); + ligatures.put("ff", "\ufb00"); + ligatures.put("fi", "\ufb01"); + ligatures.put("fl", "\ufb02"); + ligatures.put("ffi", "\ufb03"); + ligatures.put("ffl", "\ufb04"); + + if (ligatures.containsKey(c)) { + return ligatures.get(c); + } + } + + + return c; + } + public void updateCmaps(String c, byte b, PDFont font) { } @@ -512,7 +540,7 @@ public class PDFStreamEngine { PDFOperator oper = PDFOperator.getOperator(operation); processOperator(oper, arguments); } catch (IOException e) { - log.warn(e, e); + //log.warn(e, e); } } @@ -533,12 +561,12 @@ public class PDFStreamEngine { processor.process(operator, arguments); } else { if (!unsupportedOperators.contains(operation)) { - log.info("unsupported/disabled operation: " + operation); + //log.info("unsupported/disabled operation: " + operation); unsupportedOperators.add(operation); } } } catch (Exception e) { - log.warn(e, e); + //log.warn(e, e); } } diff --git a/inc/ws/Controlleur/class.ws.url.php b/inc/ws/Controlleur/class.ws.url.php index ebae1ddc6..b313c0ede 100644 --- a/inc/ws/Controlleur/class.ws.url.php +++ b/inc/ws/Controlleur/class.ws.url.php @@ -978,6 +978,20 @@ html{height:100%}' . "\n"; $dao->updateFromObject($doc); } + public static function testFontExtraction($args) { + $documentId = $args[1]; + + header('Content-type: text/plain'); + ob_clean(); + + + + $extractor = new wsPDFFontExtractor(WS_DOCS . '/' . $documentId); + $extractor->extract(); + + exit; + } + public static function getParamsHelp($args) { commonDroits::min(5); global $core; diff --git a/inc/ws/Metier/class.ws.document.php b/inc/ws/Metier/class.ws.document.php index 5287da6f7..f16fa8fb2 100644 --- a/inc/ws/Metier/class.ws.document.php +++ b/inc/ws/Metier/class.ws.document.php @@ -95,72 +95,8 @@ class wsDocument extends cubeMetier { } public function extractFonts() { - $out = $this->out . 'fonts/pdf'; - if (!file_exists($out)) { - mkdir($out, 0777, true); - } - // Extract fonts from PDF - /* $gs = new cubeCommandLine('gs'); - $gs->setPath(CONVERTER_PATH); - $gs->cd($out); - $gs->setArg('-dBATCH'); - $gs->setArg('-dNOPAUSE'); - $gs->setArg('-dNOPROMPT'); - $gs->setArg('-dNODISPLAY'); - $gs->setArg(null, WS_TOOLS . '/extractFonts.ps'); - $gs->setManualArg('-c "(' . $this->in . ') extractFonts quit"'); - - $gs->execute(); - $this->addToLog($gs); */ - - $mupdf = new cubeCommandLine('/usr/local/mupdf/pdfextract'); - $mupdf->setPath(CONVERTER_PATH); - $mupdf->cd($out); - $mupdf->setArg(null, $this->in); - $mupdf->execute(); - $this->addToLog($mupdf); - - - `rm $out/fonts/*.png`; - `rm $out/fonts/*.pam`; - $dr = opendir($out); - - if (!file_exists($this->out . '/fonts/web')) { - mkdir($this->out . '/fonts/web', 0777, true); - } - $images = array('pnm', 'jpg', 'jpeg', 'png', 'pam'); - // Fonts conversion - $collections = array(); - while ($file = readdir($dr)) { - if ($file == '.' || $file == '..' || in_array(files::getExtension($file), $images)) { - continue; - } - - $e = explode('.', $file); - array_pop($e); - $fname = implode('.', $e); - - $e = explode('-', $fname); - array_pop($e); - $fname = implode('-', $e); - - if (!isset($collections[$fname])) { - $collections[$fname] = array(); - } - $collections[$fname][] = $file; - } - - foreach ($collections as $fontname => $files) { - $fontforge = new cubeCommandLine('convert.pe'); - $fontforge->setPath(CONVERTER_PATH); - foreach ($files as $file) { - $fontforge->setArg(null, $out . '/' . $file); - } - $fontforge->setArg(null, $this->out . '/html/' . $fontname . '.cmap'); - $fontforge->setArg(null, $this->out . '/fonts/web/' . $fontname . '.ttf'); - $fontforge->execute(); - $this->addToLog($fontforge); - } + $extractor = new wsPDFFontExtractor($this->out, $this); + $extractor->extract(); } public function getInfos($in = null, $force = false) { @@ -319,8 +255,8 @@ class wsDocument extends cubeMetier { if ($this->CropAndCut()) { $this->getInfos($this->cropped, true); } - $this->getLinksAndTexts(); $this->extractFonts(); + $this->getLinksAndTexts(); } public function CropAndCut() { diff --git a/inc/ws/Util/_common.php b/inc/ws/Util/_common.php index 5753a4fe2..65010ba50 100644 --- a/inc/ws/Util/_common.php +++ b/inc/ws/Util/_common.php @@ -11,4 +11,5 @@ $__autoload['wsSecureSWF'] = dirname(__FILE__) . '/class.ws.secure.swf.php'; $__autoload['wsTools'] = dirname(__FILE__) . '/class.ws.tools.php'; $__autoload['wsHTML5Compiler'] = dirname(__FILE__) . '/html5/class.ws.html5.compiler.php'; $__autoload['wsHTML5Link'] = dirname(__FILE__) . '/html5/class.ws.html5.links.php'; +$__autoload['wsPDFFontExtractor'] = dirname(__FILE__) . '/class.ws.pdf.fontextractor.php'; ?> \ No newline at end of file diff --git a/inc/ws/Util/class.ws.pdf.fontextractor.php b/inc/ws/Util/class.ws.pdf.fontextractor.php new file mode 100644 index 000000000..6afff2a6a --- /dev/null +++ b/inc/ws/Util/class.ws.pdf.fontextractor.php @@ -0,0 +1,343 @@ +doc = $doc; + $this->in = $in . '/original.pdf'; + $this->outpdf = $in . '/fonts/pdf'; + $this->outweb = $in . '/fonts/web'; + } + + public function extract() { + $this->clean(); + + $this->getFonts(); + $this->getFontsDescriptors(); + $this->extractFonts(); + $this->extractCmaps(); + $this->convertToTTF(); + } + + public function convertToTTF() { + + + + $dr = opendir($this->outpdf); + while ($file = readdir($dr)) { + if ($file == '.' || $file == '..' || files::getExtension($file) == 'cmap') { + continue; + } + + $e = explode('.', $file); + array_pop($e); + $fname = implode('.', $e); + + $e = explode('-', $fname); + array_pop($e); + $fname = implode('-', $e); + + if (!isset($collections[$fname])) { + $collections[$fname] = array(); + } + $collections[$fname][] = $file; + } + + foreach ($collections as $fontname => $files) { + $fontforge = new cubeCommandLine('convert.pe'); + $fontforge->setPath(CONVERTER_PATH); + foreach ($files as $file) { + $fontforge->setArg(null, $this->outpdf . '/' . $file); + } + $cmapFile = $this->outpdf . '/' . $fontname . '.cmap'; + if (!file_exists($cmapFile)) { + $fontforge->setArg(null, '-'); + } else { + $fontforge->setArg(null, $cmapFile); + } + $fontforge->setArg(null, $this->outweb . '/' . $fontname . '.ttf'); + $fontforge->execute(); + $this->addToLog($fontforge); + } + } + + public function getFonts() { + $pdfinfo = $this->getCommandLine('pdfinfo'); + $pdfinfo->setArg('f'); + $pdfinfo->setArg(null, $this->in); + $pdfinfo->execute(); + + $this->addToLog($pdfinfo); + + /* + * /home/extranet/www/fluidbook/docs/100376/original.pdf: + + PDF-1.7 + Info object (9177 0 R): + << + /CreationDate (D:20110831090005+02'00') + /Creator (Adobe InDesign CS5 \(7.0\)) + /ModDate (D:20110901144502+02'00') + /Producer (Adobe PDF Library 9.9) + /Trapped /False + >> + + Pages: 116 + + Retrieving info from pages 1-116... + Fonts (201): + 1 ( 9180 0 R): Type1 'QGNYEE+AvantGarde-Book' (9193 0 R) + 1 ( 9180 0 R): Type1 'QGNYEE+AvantGarde-ExtraLight' (9195 0 R) + 2 ( 1 0 R): Type1 'QGNYEE+DIN-Light' (7605 0 R) + 2 ( 1 0 R): Type1 'QGNYEE+AvantGarde-ExtraLightObl' (7608 0 R) + 2 ( 1 0 R): Type1 'QGNYEE+AvantGarde-MediumObl' (7601 0 R) + + */ + + if (preg_match_all("|\s+\d+\s+\(\s+\d+\s+\d+\s+\w+\):\s+\w+\s+'([A-Za-z-+]+)'\s+\((\d+)\s+\d+\s+\w+\)\s|", $pdfinfo->output, $matches)) { + foreach ($matches[1] as $k => $name) { + $this->fonts[$matches[2][$k]] = array('name' => $name); + } + } + } + + protected function getFontsDescriptors() { + $pdfshow = $this->getCommandLine('pdfshow'); + $pdfshow->setArg(null, $this->in); + $pdfshow->setManualArg(implode(' ', array_keys($this->fonts))); + $pdfshow->execute(); + //$this->doc->addToLog($pdfshow); + + /* + 8677 0 obj + << + /BaseFont /YIKUMO+Futura-Condensed + /Encoding /WinAnsiEncoding + /FirstChar 32 + /FontDescriptor 7690 0 R + /LastChar 119 + /Subtype /Type1 + /Type /Font + /Widths [ 205 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 414 418 0 466 0 342 0 0 0 303 0 0 561 0 + 0 388 0 0 354 0 0 0 0 0 0 0 0 0 0 0 0 0 380 380 0 380 361 + 0 382 388 180 0 395 180 590 388 375 380 0 269 276 221 389 + 0 487 ] + >> + endobj + + 9193 0 obj + << + /BaseFont /QGNYEE+AvantGarde-Book + /Encoding /WinAnsiEncoding + /FirstChar 32 + /FontDescriptor 9192 0 R + /LastChar 201 + /Subtype /Type1 + /ToUnicode 9185 0 R + /Type /Font + /Widths [ 277 0 0 0 0 0 0 0 0 0 0 0 277 332 0 0 554 554 554 + 0 0 0 554 554 0 0 0 0 0 0 0 0 0 740 574 813 744 536 485 872 + 683 226 482 0 462 919 740 869 592 0 607 498 426 655 702 0 + 0 592 480 0 0 0 0 0 0 683 0 0 0 650 0 0 0 200 0 0 200 0 610 + 655 0 0 301 388 339 0 554 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1000 + 0 0 0 0 0 0 0 0 0 0 0 0 351 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 740 0 0 0 0 0 0 0 0 536 ] + >> + endobj + */ + + $lines = explode("\n", $pdfshow->output); + $currentFont = null; + foreach ($lines as $line) { + $line = trim($line); + if ($line == '' || $line == '<<' || $line == '>>') { + continue; + } + if ($line == 'endobj') { + $currentFont = null; + continue; + } + $e = explode(' ', $line); + if (count($e)==3 && $e[2] == 'obj') { + $currentFont = $e[0]; + continue; + } + + if (is_null($currentFont)) { + continue; + } + + $fontname = $this->fonts[$currentFont]['name']; + + if ($e[0] == '/ToUnicode') { + $this->fonts[$currentFont]['cmap'] = $e[1]; + $this->cmaps[$fontname] = $e[1]; + } + } + + $this->addToLog(print_r($this->fonts, true)); + } + + protected function extractFonts() { + $descriptors = $this->getUniqueId('descriptor'); + + $pdfextract = $this->getCommandLine('pdfextract'); + $pdfextract->setArg(null, $this->in); + $pdfextract->execute(); + $this->addToLog($pdfextract); + + + $formats = array('pnm', 'pam', 'pgm', 'jpg', 'gif', 'jpeg', 'png'); + foreach ($formats as $f) { + `rm $this->outpdf/*.$f`; + } + } + + protected function extractCmaps() { + $cmaps = array_unique($this->cmaps); + + $pdfshow = $this->getCommandLine('pdfshow'); + $pdfshow->setArg(null, $this->in); + $pdfshow->setManualArg(implode(' ', $cmaps)); + $pdfshow->execute(); + $this->addToLog($pdfshow); + + + /* 5266 0 obj + << + /Filter /FlateDecode + /Length 583 + >> + stream + /CIDInit /ProcSet findresource begin + 12 dict begin + begincmap + /CIDSystemInfo + << /Registry (Adobe) + /Ordering (UCS) /Supplement 0 >> def + /CMapName /Adobe-Identity-UCS def + /CMapType 2 def + 1 begincodespacerange + <0000> + endcodespacerange + 81 beginbfchar + <0000> <0020> + <0001> <0021> + <0005> <0025> + <0007> <2019> + <0008> <0028> + <0009> <0029> + <000C> <002C> + <000D> <002D> + <000E> <002E> + <000F> <002F> + endbfchar + endcmap CMapName currentdict /CMap defineresource pop end end + endstream + endobj + + */ + + $finalCmaps = array(); + + $lines = explode("\n", $pdfshow->output); + $currentCmap = null; + $inMap = false; + foreach ($lines as $line) { + $line = trim($line); + if ($line == '') { + continue; + } + if ($line == 'endobj') { + $currentCmap = null; + continue; + } elseif ($line == 'endbfchar') { + $inMap = false; + } + + $e = explode(' ', $line); + if (count($e)==3 && $e[2] == 'obj') { + $currentCmap = $e[0]; + continue; + } + + if (count($e)==1 && $e[1] == 'beginbfchar') { + $inMap = true; + continue; + } + + if (!$inMap || is_null($currentCmap)) { + continue; + } + + $fonts = array_keys($this->cmaps, $currentCmap); + + $from = trim($e[0], '<>'); + $to = trim($e[1], '<>'); + + foreach ($fonts as $fontname) { + if (!isset($finalCmaps[$fontname])) { + $finalCmaps[$fontname] = array(); + } + + $finalCmaps[$fontname][$from] = $to; + } + } + + foreach ($finalCmaps as $fontname => $cmap) { + arsort($cmap); + $data = ''; + foreach ($cmap as $from => $to) { + $data.=$from . "\t" . $to . "\n"; + } + file_put_contents($this->outpdf . '/' . $fontname . '.cmap', $data); + } + } + + protected function getUniqueId($param) { + $res = array(); + foreach ($this->fonts as $f) { + if (isset($f[$param])) { + $res[$id] = $f[$param]; + } + } + return array_unique($res); + } + + protected function clean() { + `rm -rf $this->outpdf`; + `rm -rf $this->outweb`; + mkdir($this->outweb, 0777, true); + mkdir($this->outpdf, 0777, true); + } + + protected function getCommandLine($program) { + $res = new cubeCommandLine('/usr/local/mupdf/' . $program); + $res->cd($this->outpdf); + $res->setPath(CONVERTER_PATH); + return $res; + } + + protected function addToLog($tolog) { + if (!is_null($this->doc)) { + $this->doc->addToLog($tolog); + } elseif ($tolog instanceof cubeCommandLine) { + echo $tolog->commande . "\n\n"; + echo $tolog->output . "\n\n"; + } else { + echo $tolog . "\n\n"; + } + } + +} + +?>