From: Vincent Vanwaelscappel Date: Thu, 2 Mar 2023 16:43:11 +0000 (+0100) Subject: wip #5770 @0.25 X-Git-Url: http://git.cubedesigners.com/?a=commitdiff_plain;h=97b5e96140945c11230ab5dc48219e30d8a6f3c2;p=cubist_pdf.git wip #5770 @0.25 --- diff --git a/src/PDFTools.php b/src/PDFTools.php index aa97a4b..f4a2f4a 100644 --- a/src/PDFTools.php +++ b/src/PDFTools.php @@ -12,536 +12,520 @@ use DOMNode; use DOMXPath; use Cubist\PDF\CommandLine\FWSTK; -class PDFTools -{ - /** - * @param $path string - * @return string - */ - - public static function resource_path($path) - { - return __DIR__ . '/../resources/' . self::_cleanPath($path); - } - - /** - * @param $path string - * @return string - */ - public static function tools_path($path, $chmod = false) - { - $res = self::resource_path('tools/' . self::_cleanPath($path)); - if ($chmod) { - self::chmodExec($res); - } - return $res; - } - - public static function chmodExec($path) - { - if (is_file($path)) { - @chmod($path, 0755); - } - } - - protected static function parseInfos($data) - { - $res = []; - - // This function get general infos (pages sizes, boxes, number sections and - // bookmarks - // Init arrays - $res['infos'] = []; - $res['infos']['size'] = [0, 0]; - $res['bookmarks'] = []; - $res['numberSections'] = ''; - $bookmark_id = 0; - - $res['size'] = array(0, 0); - $lines = explode("\n", $data); - foreach ($lines as $line) { - $line = trim(Text::condenseWhite($line)); - $e = explode(':', $line, 2); - $k = trim($e[0]); - if (count($e) < 2) { - continue; - } - $v = trim($e[1]); - if ($k == 'Pages' || $k == 'NumberOfPages') { - $res['pages'] = $res['infos']['pages'] = $v; - $res['infos']['page'] = []; - for ($i = 1; $i <= $res['pages']; $i++) { - $res['infos']['page'][$i] = []; - } - } elseif (preg_match('|Page\s+([0-9]+)\s+(.*)Box:\s+([0-9.]*)\s+([0-9.]*)\s+([0-9.]*)\s+([0-9.]*)|iu', $line, $m)) { - $res['infos']['page'][$m[1]][strtolower($m[2])] = [$m[3], $m[4], $m[5], $m[6]]; - } elseif (preg_match('|Page\s+([0-9]+)\s+size:\s+([0-9.]*)[pts[:space:]]+x\s+([0-9.]*)\s+pts|iu', $line, $m)) { - $res['infos']['page'][$m[1]]['size'] = array($m[2], $m[3]); - if ($m[1] == 1) { - $res['infos']['size'][0] = $m[2]; - $res['infos']['size'][1] = $m[3]; - } - } elseif ($k == 'BookmarkTitle') { - $res['bookmarks'][$bookmark_id] = array('titre' => str_replace(' ', '', trim($v))); - } elseif ($k == 'BookmarkLevel') { - $res['bookmarks'][$bookmark_id]['level'] = $v; - } elseif ($k == 'BookmarkPage') { - $res['bookmarks'][$bookmark_id]['page'] = $v; - $bookmark_id++; - } elseif ($k == 'NumberSections') { - $res['numberSections'] = $v; - $res['infos']['pagenumbers'] = $v; - } - } - return $res; - } - - public static function infos($pdf) - { - $fwstk = new FWSTK(); - $fwstk->setArg('--input ' . $pdf); - $fwstk->setArg('--infos'); - $fwstk->execute(); - $out = $fwstk->getOutput(); - - $pdfinfo = new CommandLine('pdfinfo'); - $pdfinfo->setArg('-box'); - $pdfinfo->setArg('f', 1); - $pdfinfo->setArg('l', 100000); - $pdfinfo->setArg(null, $pdf); - $pdfinfo->execute(); - $out .= "\n"; - $out .= $pdfinfo->getOutput(); - - return self::parseInfos($out); - } - - /** - * @param $path string - * @return string - */ - protected static function _cleanPath($path) - { - return trim($path, '/'); - } - - public static function makeMiniShot($in, $out, $page, $format = 'jpg') - { - self::makeShotFixedWidth($in, $out, $page, 'p', 500, 65, 4, 'PNM', $format); - } - - public static function makeShotFixedWidth($in, $out, $page, $prefix = '', $w = 100, $quality = 90, $antialiasing = 4, $method = 'PNM', $format = 'jpg') - { - // Make thumbs of $w width - self::makeShot($in, $out, $page, $prefix, null, $quality, $antialiasing, $method, $w, -1, $format); - } - - public static function makeShotFixedHeight($in, $out, $page, $prefix = '', $h = '', $quality = 90, $antialiasing = 4, $method = 'PNM', $format = 'jpg') - { - // Make thumbs of $h height - self::makeShot($in, $out, $page, $prefix, null, $quality, $antialiasing, $method, -1, $h, $format); - } - - public static function makeSWF($in, $out, $page, $resolution = 100, $quality = 90) - { - if (file_exists($out)) { - unlink($out); - } - $pdf2swf = new CommandLine('pdf2swf', null, true); - $pdf2swf->setArg('p', $page); - $pdf2swf->setArg('T', 10); - $pdf2swf->setArg('Q', 30); - $pdf2swf->setArg('set reordertags', '0'); - $pdf2swf->setArg('fonts'); - $pdf2swf->setArg('set storeallcharacters'); - $pdf2swf->setArg('set subpixels', $resolution / 72); - $pdf2swf->setArg('set jpegquality', $quality); - $pdf2swf->setArg('set disablelinks'); - $pdf2swf->setArg('set dots'); - $pdf2swf->setArg(null, $in); - $pdf2swf->setArg('output', $out); - $pdf2swf->execute(); - $pdf2swf->debug(); - - if (file_exists($out)) { - return; - } - $pdf2swf = new CommandLine('pdf2swf', null, true); - $pdf2swf->setArg('p', $page); - $pdf2swf->setArg('T', 10); - $pdf2swf->setArg('Q', 120); - $pdf2swf->setArg('set poly2bitmap'); - $pdf2swf->setArg('set storeallcharacters'); - $pdf2swf->setArg('set reordertags', '0'); - $pdf2swf->setArg('fonts'); - $pdf2swf->setArg('set subpixels', $resolution / 72); - $pdf2swf->setArg('set jpegquality', $quality); - $pdf2swf->setArg('set disablelinks'); - $pdf2swf->setArg('set dots'); - $pdf2swf->setArg(null, $in); - $pdf2swf->setArg('output', $out); - $pdf2swf->execute(); - $pdf2swf->debug(); - if (file_exists($out)) { - return; - } - $pdf2swf = new CommandLine('pdf2swf', null, true); - $pdf2swf->setArg('p', $page); - $pdf2swf->setArg('T', 10); - $pdf2swf->setArg('set reordertags', '0'); - $pdf2swf->setArg('fonts'); - $pdf2swf->setArg('set bitmap'); - $pdf2swf->setArg('set storeallcharacters'); - $pdf2swf->setArg('set subpixels', $resolution / 72); - $pdf2swf->setArg('set jpegquality', $quality); - $pdf2swf->setArg('set disablelinks'); - $pdf2swf->setArg('set dots'); - $pdf2swf->setArg(null, $in); - $pdf2swf->setArg('output', $out); - $pdf2swf->execute(); - $pdf2swf->debug(); - } - - - public static function makeBaseSVGFile($in, $out, $page) - { - $pdftocairo = new CommandLine('pdftocairo'); - $pdftocairo->setArg('f', $page); - $pdftocairo->setArg('l', $page); - $pdftocairo->setArg('r', 300); - $pdftocairo->setArg(null, '-expand'); - $pdftocairo->setArg(null, '-svg'); - $pdftocairo->setArg(null, $in); - $pdftocairo->setArg(null, $out); - $pdftocairo->execute(); - } - - public static function makeTextSVGFile($in, $out) - { - $svg = new DOMDocument(); - $svg->preserveWhiteSpace = false; - $svg->load($in, LIBXML_PARSEHUGE); - - // Operations to delete - $xpath = new DOMXPath($svg); - $xpath->registerNamespace('svg', 'http://www.w3.org/2000/svg'); - $xpath->registerNamespace('xlink', 'http://www.w3.org/1999/xlink'); - $xpath->registerNamespace("php", "http://php.net/xpath"); - $toDelete = [ - '//svg:defs/svg:g[starts-with(@id, "surface")]//svg:path', - '//svg:defs/svg:g[starts-with(@id, "surface")]//svg:rect', - '//svg:defs/svg:g[starts-with(@id, "surface")]//svg:use[starts-with(@xlink:href, "#image")]', - '/svg:svg/svg:g[@id="surface1"]//svg:path', - '/svg:svg/svg:g[@id="surface1"]//svg:rect', - '/svg:svg/svg:g[@id="surface1"]//svg:filter', - '/svg:svg/svg:g[@id="surface1"]//svg:use[starts-with(@xlink:href, "#image")]', - '//svg:svg/svg:g[@id="surface1"]//svg:use[starts-with(@xlink:href, "#image")]', - ]; - $toDeleteIfOrphan = [ - '//svg:image', - ]; - - foreach ($toDelete as $q) { - $list = $xpath->query($q); - if (count($list)) { - foreach ($list as $node) { - /* @var $node DOMNode */ - $parent = $node->parentNode; - $parent->removeChild($node); - } - } - } - - foreach ($toDeleteIfOrphan as $q) { - $list = $xpath->query($q); - if (count($list)) { - foreach ($list as $node) { - /* @var $node DOMElement */ - $id = $node->getAttribute('id'); - if ($xpath->query('//*[@id="' . $id . '"]')->count() > 0) { - $parent = $node->parentNode; - $parent->removeChild($node); - } - } - } - } - $res = $svg->saveXML(); - $res = preg_replace('//', '', $res); - while (true) { - $res = preg_replace('/<\/g>/', '', $res, -1, $count); - if (!$count) { - break; - } - } - - file_put_contents($out, $res); - } - - public static function makeShot($in, $out, $page, $prefix = '', $resolution = 72, $quality = 90, $antialiasing = 4, $method = 'PNM', $width = null, $height = null, $format = 'jpg') - { - $error = false; - if ($method === 'GS') { - self::makeShotGS($in, $out, $page, $prefix, $resolution, $quality, $antialiasing, $width, $height, $format); - } elseif ($method === 'PNM') { - self::makeShotPNM($in, $out, $page, $prefix, $resolution, $quality, $antialiasing, true, $width, $height, $format); - } - // Test the result by checking all files - if (!file_exists($out)) { - $error = true; - } - // If error, we try to make thumbs with other method - if ($error) { - if ($method === 'GS') { - self::makeShotPNM($in, $out, $page, $prefix, $resolution, $quality, $antialiasing, true, $width, $height, $format); - } elseif ($method === 'PNM') { - self::makeShotGS($in, $out, $page, $prefix, $resolution, $quality, $antialiasing, $width, $height, $format); - } - } - } - - protected static function makeShotGS($in, $out, $page, $prefix = '', $resolution = 72, $quality = 90, $antialiasing = 4, $width = null, $height = null, $format = 'jpg') - { - // Fabrication des thumbnails avec ghostscript - $gs = new CommandLine('gs', null, true); - $gs->setArg('-dBATCH'); - $gs->setArg('-dNOPAUSE'); - $gs->setArg('-dNOPROMPT'); - // Antialias - $gs->setArg('-dDOINTERPOLATE'); - $gs->setArg('-dTextAlphaBits=' . $antialiasing); - $gs->setArg('-dGraphicsAlphaBits=' . $antialiasing); - // Device - $device = $format === 'jpg' ? 'jpeg' : 'png16m'; - $gs->setArg('-sDEVICE=' . $device); - // Dispotion & colors - // $gs->setArg('-dUseCIEColor'); - $gs->setArg('-dAutoRotatePages=/None'); - $gs->setArg('-dUseCropBox'); - // Resolution & Quality - $gs->setArg('-r' . round($resolution)); - if ($format === 'jpg') { - $gs->setArg('-dJPEGQ=' . $quality); - } - // Performances - $gs->setArg('-dNumRenderingThreads=4'); - // Page range - $gs->setArg('-dFirstPage=' . $page); - $gs->setArg('-dLastPage=' . $page); - // Files - $gs->setArg('-sOutputFile=' . $out); - - $gs->setArg(null, $in); - $gs->execute(); - } - - public static function makeShotPNM($in, $out, $page, $prefix = '', $resolution = 72, $quality = 90, $antialiasing = 4, $texts = true, $width = null, $height = null, $format = 'jpg') - { - $tmp = Files::tempnam(); - - $antialiasing = $antialiasing ? 'yes' : 'no'; - $freetype = $texts ? 'yes' : 'no'; - // Exporte les fichiers - $pdftoppm = new CommandLine('pdftoppm', null, true); - $pdftoppm->setArg('f', $page); - $pdftoppm->setArg('l', $page); - $pdftoppm->setArg('-cropbox'); - $pdftoppm->setArg('-freetype ' . $freetype); - $pdftoppm->setArg('-singlefile'); - $pdftoppm->setArg('-aa ' . $antialiasing); - $pdftoppm->setArg('-aaVector ' . $antialiasing); - if (null !== $resolution) { - $pdftoppm->setArg('r', $resolution); - } - if (null !== $width) { - $pdftoppm->setArg('-scale-to-x ' . $width); - } - if (null !== $height) { - $pdftoppm->setArg('-scale-to-y ' . $height); - } - $pdftoppm->setArg(null, $in); - $pdftoppm->setArg(null, $tmp); - $pdftoppm->execute(); - $tmp .= '.ppm'; - - - if (file_exists($tmp)) { - if ($format === 'jpg') { - $cjpeg = new CommandLine('cjpeg', null, true); - $cjpeg->setArg('-quality ' . ($quality + 6)); - $cjpeg->setArg('-outfile ' . $out); - $cjpeg->setArg(null, $tmp); - $cjpeg->execute(); - } else if ($format === 'png') { - $pnmtopng = new CommandLine('pnmtopng', $out, false); - $pnmtopng->setArg('-background white'); - $pnmtopng->setArg(null, $tmp); - $pnmtopng->execute(); - } - unlink($tmp); - } else { - $pdftoppm->debug(); - } - } - - public static function getThumbFromPDF($pdf, $page, $format = 'jpg') - { - if (!file_exists($pdf)) { - return false; - } - $dir = WS_CACHE . '/thumbs/' . sha1($pdf) . '/'; - if (!file_exists($dir)) { - mkdir($dir, 0777, true); - } - $image = $dir . '/p' . $page . '.' . $format; - $mtime = filemtime($image); - - if (!file_exists($image) || $mtime < filemtime(__FILE__) || $mtime < filemtime($pdf)) { - self::makeMiniShot($pdf, $image, $page, $format); - } - - return $image; - } - - public static function extractLinks($pdf, $out) - { - $out .= 'links/'; - Files::mkdir($out); - - if (file_exists($out . '/p1.csv')) { - return; - } - $fwstk = new FWSTK(); - $fwstk->setArg('--input ' . $pdf); - $fwstk->setArg('--extractLinks ' . $out . 'p%d.csv'); - $fwstk->setArg('--threads 1'); - $fwstk->execute(); - } - - public static function extractTexts($pdf, $out, $textExtraction = 'fluidbook', $ignoreSeparators = '') - { - $out .= 'texts'; - if ($ignoreSeparators) { - $out .= '/sep_' . md5($ignoreSeparators); - } - $out = Files::mkdir($out); - - $fwstk = new FWSTK(); - $fwstk->setArg('--input ' . $pdf); - $fwstk->setArg('--extractTexts ' . $out . '%s%d.txt'); - $fwstk->setArg('--extractTextsMethod ' . $textExtraction); - $fwstk->setArg('--threads 1'); - if ($ignoreSeparators) { - $fwstk->setArg('--ignoreSeparators ' . $ignoreSeparators); - } - $fwstk->execute(); - } - - - public static function extractHighlightsData($pdf, $out) - { - $out .= 'texts/'; - Files::mkdir($out); - - $fwstk = new FWSTK(); - $fwstk->setArg('--input ' . $pdf); - $fwstk->setArg('--layout ' . $out . 'p%d.fby'); - $fwstk->setArg('--cmaps ' . $out); - $fwstk->setArg('--fonts' . $out . 'fonts/web/'); - $fwstk->execute(); - } - - public static function fixPDF($in, $out) - { - if (file_exists($out)) { - unlink($out); - } - - $pdftk = new CommandLine('pdftk'); - $pdftk->setArg(null, $in); - $pdftk->setArg(null, 'output'); - $pdftk->setArg(null, $out); - $pdftk->execute(); - - if (!file_exists($out)) { - $pdftocairo = new CommandLine('pdftocairo'); - $pdftocairo->setPath(CONVERTER_PATH); - $pdftocairo->setArg(null, '-pdf'); - $pdftocairo->setArg(null, $in); - $pdftocairo->setArg(null, $out); - $pdftocairo->execute(); - } - } - - public static function split($pdf, $out) - { - - $lock = $pdf . '.split.lock'; - - $returnAfterSleep = false; - - usleep(rand(100000, 2000000)); - - while (file_exists($lock)) { - if (filemtime($lock) < time() - 300) { - unlink($lock); - } - $returnAfterSleep = true; - sleep(5); - } - if ($returnAfterSleep) { - return; - } - - touch($lock); - - try { - Files::mkdir($out); - $pdftk = new CommandLine('pdftk'); - $pdftk->setArg(null, $pdf); - $pdftk->setArg(null, 'burst'); - $pdftk->setArg(null, 'uncompress'); - $pdftk->setArg(null, 'output'); - $pdftk->setArg(null, $out . '/p%d.pdf'); - $pdftk->execute(); - - - for ($i = 1; true; $i++) { - // Remove annotations : https://gist.github.com/stefanschmidt/5248592 - $file = sprintf($out . '/p%d.pdf', $i); - if (!file_exists($file)) { - break; - } - $to = sprintf($out . '/s%d.pdf', $i); - `LANG=C LC_CTYPE=C sed -n '/^\/Annots/!p' $file > $to`; - if (file_exists($to)) { - if (filesize($to) > 0) { - unlink($file); - rename($to, $file); - } else { - unlink($to); - } - } - } - } catch (\Exception $e) { - - } - unlink($lock); - } - - public static function compressPDF($source, $dest, $resolution = 72) - { - $gs = new CommandLine('gs'); - $gs->setArg('-dBATCH'); - $gs->setArg('-dNOPAUSE'); - $gs->setArg('-dNOPROMPT'); - $gs->setArg('-sOutputFile=' . $dest); - $gs->setArg('-sDEVICE=pdfwrite'); - $gs->setArg('-dPDFSETTINGS=/ebook'); - $gs->setArg('-dColorImageResolution=' . $resolution); - $gs->setArg('-dAutoRotatePages=/None'); - $gs->setArg('-dColorConversionStrategy=/LeaveColorUnchanged'); - $gs->setArg(null, $source); - $gs->execute(); - } +class PDFTools { + /** + * @param $path string + * @return string + */ + + public static function resource_path($path) { + return __DIR__ . '/../resources/' . self::_cleanPath($path); + } + + /** + * @param $path string + * @return string + */ + public static function tools_path($path, $chmod = false) { + $res = self::resource_path('tools/' . self::_cleanPath($path)); + if ($chmod) { + self::chmodExec($res); + } + return $res; + } + + public static function chmodExec($path) { + if (is_file($path)) { + @chmod($path, 0755); + } + } + + protected static function parseInfos($data) { + $res = []; + + // This function get general infos (pages sizes, boxes, number sections and + // bookmarks + // Init arrays + $res['raw'] = $data; + $res['infos'] = []; + $res['infos']['size'] = [0, 0]; + $res['bookmarks'] = []; + $res['numberSections'] = ''; + $bookmark_id = 0; + + $res['size'] = array(0, 0); + $lines = explode("\n", $data); + foreach ($lines as $line) { + $line = trim(Text::condenseWhite($line)); + $e = explode(':', $line, 2); + $k = trim($e[0]); + if (count($e) < 2) { + continue; + } + $v = trim($e[1]); + if ($k == 'Pages' || $k == 'NumberOfPages') { + $res['pages'] = $res['infos']['pages'] = $v; + $res['infos']['page'] = []; + for ($i = 1; $i <= $res['pages']; $i++) { + $res['infos']['page'][$i] = []; + } + } elseif (preg_match('|Page\s+([0-9]+)\s+(.*)Box:\s+([0-9.]*)\s+([0-9.]*)\s+([0-9.]*)\s+([0-9.]*)|iu', $line, $m)) { + $res['infos']['page'][$m[1]][strtolower($m[2])] = [$m[3], $m[4], $m[5], $m[6]]; + } elseif (preg_match('|Page\s+([0-9]+)\s+size:\s+([0-9.]*)[pts[:space:]]+x\s+([0-9.]*)\s+pts|iu', $line, $m)) { + $res['infos']['page'][$m[1]]['size'] = array($m[2], $m[3]); + if ($m[1] == 1) { + $res['infos']['size'][0] = $m[2]; + $res['infos']['size'][1] = $m[3]; + } + } elseif ($k == 'BookmarkTitle') { + $res['bookmarks'][$bookmark_id] = array('titre' => str_replace(' ', '', trim($v))); + } elseif ($k == 'BookmarkLevel') { + $res['bookmarks'][$bookmark_id]['level'] = $v; + } elseif ($k == 'BookmarkPage') { + $res['bookmarks'][$bookmark_id]['page'] = $v; + $bookmark_id++; + } elseif ($k == 'NumberSections') { + $res['numberSections'] = $v; + $res['infos']['pagenumbers'] = $v; + } + } + return $res; + } + + /** + * @throws \Exception + */ + public static function infos($pdf) { + if (!file_exists($pdf)) { + throw new \Exception('Unable to parse infos of ' . $pdf . ' : file not found'); + } + $fwstk = new FWSTK(); + $fwstk->setArg('--input ' . $pdf); + $fwstk->setArg('--infos'); + $fwstk->execute(); + $out = $fwstk->getOutput(); + + $pdfinfo = new CommandLine('pdfinfo'); + $pdfinfo->setArg('-box'); + $pdfinfo->setArg('f', 1); + $pdfinfo->setArg('l', 100000); + $pdfinfo->setArg(null, $pdf); + $pdfinfo->execute(); + $out .= "\n"; + $out .= $pdfinfo->getOutput(); + + return self::parseInfos($out); + } + + /** + * @param $path string + * @return string + */ + protected static function _cleanPath($path) { + return trim($path, '/'); + } + + public static function makeMiniShot($in, $out, $page, $format = 'jpg') { + self::makeShotFixedWidth($in, $out, $page, 'p', 500, 65, 4, 'PNM', $format); + } + + public static function makeShotFixedWidth($in, $out, $page, $prefix = '', $w = 100, $quality = 90, $antialiasing = 4, $method = 'PNM', $format = 'jpg') { + // Make thumbs of $w width + self::makeShot($in, $out, $page, $prefix, null, $quality, $antialiasing, $method, $w, -1, $format); + } + + public static function makeShotFixedHeight($in, $out, $page, $prefix = '', $h = '', $quality = 90, $antialiasing = 4, $method = 'PNM', $format = 'jpg') { + // Make thumbs of $h height + self::makeShot($in, $out, $page, $prefix, null, $quality, $antialiasing, $method, -1, $h, $format); + } + + public static function makeSWF($in, $out, $page, $resolution = 100, $quality = 90) { + if (file_exists($out)) { + unlink($out); + } + $pdf2swf = new CommandLine('pdf2swf', null, true); + $pdf2swf->setArg('p', $page); + $pdf2swf->setArg('T', 10); + $pdf2swf->setArg('Q', 30); + $pdf2swf->setArg('set reordertags', '0'); + $pdf2swf->setArg('fonts'); + $pdf2swf->setArg('set storeallcharacters'); + $pdf2swf->setArg('set subpixels', $resolution / 72); + $pdf2swf->setArg('set jpegquality', $quality); + $pdf2swf->setArg('set disablelinks'); + $pdf2swf->setArg('set dots'); + $pdf2swf->setArg(null, $in); + $pdf2swf->setArg('output', $out); + $pdf2swf->execute(); + $pdf2swf->debug(); + + if (file_exists($out)) { + return; + } + $pdf2swf = new CommandLine('pdf2swf', null, true); + $pdf2swf->setArg('p', $page); + $pdf2swf->setArg('T', 10); + $pdf2swf->setArg('Q', 120); + $pdf2swf->setArg('set poly2bitmap'); + $pdf2swf->setArg('set storeallcharacters'); + $pdf2swf->setArg('set reordertags', '0'); + $pdf2swf->setArg('fonts'); + $pdf2swf->setArg('set subpixels', $resolution / 72); + $pdf2swf->setArg('set jpegquality', $quality); + $pdf2swf->setArg('set disablelinks'); + $pdf2swf->setArg('set dots'); + $pdf2swf->setArg(null, $in); + $pdf2swf->setArg('output', $out); + $pdf2swf->execute(); + $pdf2swf->debug(); + if (file_exists($out)) { + return; + } + $pdf2swf = new CommandLine('pdf2swf', null, true); + $pdf2swf->setArg('p', $page); + $pdf2swf->setArg('T', 10); + $pdf2swf->setArg('set reordertags', '0'); + $pdf2swf->setArg('fonts'); + $pdf2swf->setArg('set bitmap'); + $pdf2swf->setArg('set storeallcharacters'); + $pdf2swf->setArg('set subpixels', $resolution / 72); + $pdf2swf->setArg('set jpegquality', $quality); + $pdf2swf->setArg('set disablelinks'); + $pdf2swf->setArg('set dots'); + $pdf2swf->setArg(null, $in); + $pdf2swf->setArg('output', $out); + $pdf2swf->execute(); + $pdf2swf->debug(); + } + + + public static function makeBaseSVGFile($in, $out, $page) { + $pdftocairo = new CommandLine('pdftocairo'); + $pdftocairo->setArg('f', $page); + $pdftocairo->setArg('l', $page); + $pdftocairo->setArg('r', 300); + $pdftocairo->setArg(null, '-expand'); + $pdftocairo->setArg(null, '-svg'); + $pdftocairo->setArg(null, $in); + $pdftocairo->setArg(null, $out); + $pdftocairo->execute(); + } + + public static function makeTextSVGFile($in, $out) { + $svg = new DOMDocument(); + $svg->preserveWhiteSpace = false; + $svg->load($in, LIBXML_PARSEHUGE); + + // Operations to delete + $xpath = new DOMXPath($svg); + $xpath->registerNamespace('svg', 'http://www.w3.org/2000/svg'); + $xpath->registerNamespace('xlink', 'http://www.w3.org/1999/xlink'); + $xpath->registerNamespace("php", "http://php.net/xpath"); + $toDelete = [ + '//svg:defs/svg:g[starts-with(@id, "surface")]//svg:path', + '//svg:defs/svg:g[starts-with(@id, "surface")]//svg:rect', + '//svg:defs/svg:g[starts-with(@id, "surface")]//svg:use[starts-with(@xlink:href, "#image")]', + '/svg:svg/svg:g[@id="surface1"]//svg:path', + '/svg:svg/svg:g[@id="surface1"]//svg:rect', + '/svg:svg/svg:g[@id="surface1"]//svg:filter', + '/svg:svg/svg:g[@id="surface1"]//svg:use[starts-with(@xlink:href, "#image")]', + '//svg:svg/svg:g[@id="surface1"]//svg:use[starts-with(@xlink:href, "#image")]', + ]; + $toDeleteIfOrphan = [ + '//svg:image', + ]; + + foreach ($toDelete as $q) { + $list = $xpath->query($q); + if (count($list)) { + foreach ($list as $node) { + /* @var $node DOMNode */ + $parent = $node->parentNode; + $parent->removeChild($node); + } + } + } + + foreach ($toDeleteIfOrphan as $q) { + $list = $xpath->query($q); + if (count($list)) { + foreach ($list as $node) { + /* @var $node DOMElement */ + $id = $node->getAttribute('id'); + if ($xpath->query('//*[@id="' . $id . '"]')->count() > 0) { + $parent = $node->parentNode; + $parent->removeChild($node); + } + } + } + } + $res = $svg->saveXML(); + $res = preg_replace('//', '', $res); + while (true) { + $res = preg_replace('/<\/g>/', '', $res, -1, $count); + if (!$count) { + break; + } + } + + file_put_contents($out, $res); + } + + public static function makeShot($in, $out, $page, $prefix = '', $resolution = 72, $quality = 90, $antialiasing = 4, $method = 'PNM', $width = null, $height = null, $format = 'jpg') { + $error = false; + if ($method === 'GS') { + self::makeShotGS($in, $out, $page, $prefix, $resolution, $quality, $antialiasing, $width, $height, $format); + } elseif ($method === 'PNM') { + self::makeShotPNM($in, $out, $page, $prefix, $resolution, $quality, $antialiasing, true, $width, $height, $format); + } + // Test the result by checking all files + if (!file_exists($out)) { + $error = true; + } + // If error, we try to make thumbs with other method + if ($error) { + if ($method === 'GS') { + self::makeShotPNM($in, $out, $page, $prefix, $resolution, $quality, $antialiasing, true, $width, $height, $format); + } elseif ($method === 'PNM') { + self::makeShotGS($in, $out, $page, $prefix, $resolution, $quality, $antialiasing, $width, $height, $format); + } + } + } + + protected static function makeShotGS($in, $out, $page, $prefix = '', $resolution = 72, $quality = 90, $antialiasing = 4, $width = null, $height = null, $format = 'jpg') { + // Fabrication des thumbnails avec ghostscript + $gs = new CommandLine('gs', null, true); + $gs->setArg('-dBATCH'); + $gs->setArg('-dNOPAUSE'); + $gs->setArg('-dNOPROMPT'); + // Antialias + $gs->setArg('-dDOINTERPOLATE'); + $gs->setArg('-dTextAlphaBits=' . $antialiasing); + $gs->setArg('-dGraphicsAlphaBits=' . $antialiasing); + // Device + $device = $format === 'jpg' ? 'jpeg' : 'png16m'; + $gs->setArg('-sDEVICE=' . $device); + // Dispotion & colors + // $gs->setArg('-dUseCIEColor'); + $gs->setArg('-dAutoRotatePages=/None'); + $gs->setArg('-dUseCropBox'); + // Resolution & Quality + $gs->setArg('-r' . round($resolution)); + if ($format === 'jpg') { + $gs->setArg('-dJPEGQ=' . $quality); + } + // Performances + $gs->setArg('-dNumRenderingThreads=4'); + // Page range + $gs->setArg('-dFirstPage=' . $page); + $gs->setArg('-dLastPage=' . $page); + // Files + $gs->setArg('-sOutputFile=' . $out); + + $gs->setArg(null, $in); + $gs->execute(); + } + + public static function makeShotPNM($in, $out, $page, $prefix = '', $resolution = 72, $quality = 90, $antialiasing = 4, $texts = true, $width = null, $height = null, $format = 'jpg') { + $tmp = Files::tempnam(); + + $antialiasing = $antialiasing ? 'yes' : 'no'; + $freetype = $texts ? 'yes' : 'no'; + // Exporte les fichiers + $pdftoppm = new CommandLine('pdftoppm', null, true); + $pdftoppm->setArg('f', $page); + $pdftoppm->setArg('l', $page); + $pdftoppm->setArg('-cropbox'); + $pdftoppm->setArg('-freetype ' . $freetype); + $pdftoppm->setArg('-singlefile'); + $pdftoppm->setArg('-aa ' . $antialiasing); + $pdftoppm->setArg('-aaVector ' . $antialiasing); + if (null !== $resolution) { + $pdftoppm->setArg('r', $resolution); + } + if (null !== $width) { + $pdftoppm->setArg('-scale-to-x ' . $width); + } + if (null !== $height) { + $pdftoppm->setArg('-scale-to-y ' . $height); + } + $pdftoppm->setArg(null, $in); + $pdftoppm->setArg(null, $tmp); + $pdftoppm->execute(); + $tmp .= '.ppm'; + + + if (file_exists($tmp)) { + if ($format === 'jpg') { + $cjpeg = new CommandLine('cjpeg', null, true); + $cjpeg->setArg('-quality ' . ($quality + 6)); + $cjpeg->setArg('-outfile ' . $out); + $cjpeg->setArg(null, $tmp); + $cjpeg->execute(); + } else if ($format === 'png') { + $pnmtopng = new CommandLine('pnmtopng', $out, false); + $pnmtopng->setArg('-background white'); + $pnmtopng->setArg(null, $tmp); + $pnmtopng->execute(); + } + unlink($tmp); + } else { + $pdftoppm->debug(); + } + } + + public static function getThumbFromPDF($pdf, $page, $format = 'jpg') { + if (!file_exists($pdf)) { + return false; + } + $dir = WS_CACHE . '/thumbs/' . sha1($pdf) . '/'; + if (!file_exists($dir)) { + mkdir($dir, 0777, true); + } + $image = $dir . '/p' . $page . '.' . $format; + $mtime = filemtime($image); + + if (!file_exists($image) || $mtime < filemtime(__FILE__) || $mtime < filemtime($pdf)) { + self::makeMiniShot($pdf, $image, $page, $format); + } + + return $image; + } + + public static function extractLinks($pdf, $out) { + $out .= 'links/'; + Files::mkdir($out); + + if (file_exists($out . '/p1.csv')) { + return; + } + $fwstk = new FWSTK(); + $fwstk->setArg('--input ' . $pdf); + $fwstk->setArg('--extractLinks ' . $out . 'p%d.csv'); + $fwstk->setArg('--threads 1'); + $fwstk->execute(); + } + + public static function extractTexts($pdf, $out, $textExtraction = 'fluidbook', $ignoreSeparators = '') { + $out .= 'texts'; + if ($ignoreSeparators) { + $out .= '/sep_' . md5($ignoreSeparators); + } + $out = Files::mkdir($out); + + $fwstk = new FWSTK(); + $fwstk->setArg('--input ' . $pdf); + $fwstk->setArg('--extractTexts ' . $out . '%s%d.txt'); + $fwstk->setArg('--extractTextsMethod ' . $textExtraction); + $fwstk->setArg('--threads 1'); + if ($ignoreSeparators) { + $fwstk->setArg('--ignoreSeparators ' . $ignoreSeparators); + } + $fwstk->execute(); + } + + + public static function extractHighlightsData($pdf, $out) { + $out .= 'texts/'; + Files::mkdir($out); + + $fwstk = new FWSTK(); + $fwstk->setArg('--input ' . $pdf); + $fwstk->setArg('--layout ' . $out . 'p%d.fby'); + $fwstk->setArg('--cmaps ' . $out); + $fwstk->setArg('--fonts' . $out . 'fonts/web/'); + $fwstk->execute(); + } + + public static function fixPDF($in, $out) { + if (file_exists($out)) { + unlink($out); + } + + $pdftk = new CommandLine('pdftk'); + $pdftk->setArg(null, $in); + $pdftk->setArg(null, 'output'); + $pdftk->setArg(null, $out); + $pdftk->execute(); + + if (!file_exists($out)) { + $pdftocairo = new CommandLine('pdftocairo'); + $pdftocairo->setPath(CONVERTER_PATH); + $pdftocairo->setArg(null, '-pdf'); + $pdftocairo->setArg(null, $in); + $pdftocairo->setArg(null, $out); + $pdftocairo->execute(); + } + } + + public static function split($pdf, $out) { + + $lock = $pdf . '.split.lock'; + + $returnAfterSleep = false; + + usleep(rand(100000, 2000000)); + + while (file_exists($lock)) { + if (filemtime($lock) < time() - 300) { + unlink($lock); + } + $returnAfterSleep = true; + sleep(5); + } + if ($returnAfterSleep) { + return; + } + + touch($lock); + + try { + Files::mkdir($out); + $pdftk = new CommandLine('pdftk'); + $pdftk->setArg(null, $pdf); + $pdftk->setArg(null, 'burst'); + $pdftk->setArg(null, 'uncompress'); + $pdftk->setArg(null, 'output'); + $pdftk->setArg(null, $out . '/p%d.pdf'); + $pdftk->execute(); + + + for ($i = 1; true; $i++) { + // Remove annotations : https://gist.github.com/stefanschmidt/5248592 + $file = sprintf($out . '/p%d.pdf', $i); + if (!file_exists($file)) { + break; + } + $to = sprintf($out . '/s%d.pdf', $i); + `LANG=C LC_CTYPE=C sed -n '/^\/Annots/!p' $file > $to`; + if (file_exists($to)) { + if (filesize($to) > 0) { + unlink($file); + rename($to, $file); + } else { + unlink($to); + } + } + } + } catch (\Exception $e) { + + } + unlink($lock); + } + + public static function compressPDF($source, $dest, $resolution = 72) { + $gs = new CommandLine('gs'); + $gs->setArg('-dBATCH'); + $gs->setArg('-dNOPAUSE'); + $gs->setArg('-dNOPROMPT'); + $gs->setArg('-sOutputFile=' . $dest); + $gs->setArg('-sDEVICE=pdfwrite'); + $gs->setArg('-dPDFSETTINGS=/ebook'); + $gs->setArg('-dColorImageResolution=' . $resolution); + $gs->setArg('-dAutoRotatePages=/None'); + $gs->setArg('-dColorConversionStrategy=/LeaveColorUnchanged'); + $gs->setArg(null, $source); + $gs->execute(); + } }