From: Vincent Vanwaelscappel Date: Tue, 23 Aug 2022 15:14:54 +0000 (+0200) Subject: wip #5411 @1.5 X-Git-Url: http://git.cubedesigners.com/?a=commitdiff_plain;h=88a6e9568fcf55ac0047aefc0b8ecfc4c88ff7fe;p=fluidbook-toolbox.git wip #5411 @1.5 --- diff --git a/app/Fluidbook/SearchIndex.php b/app/Fluidbook/SearchIndex.php new file mode 100644 index 000000000..f752a6ebc --- /dev/null +++ b/app/Fluidbook/SearchIndex.php @@ -0,0 +1,176 @@ +pages as $page => $infos) { + $tfile = $compiler->getFluidbook()->getTextFile($page, FluidbookDocument::TEXT_PLAIN); + $ifile = $compiler->getFluidbook()->getTextFile($page, FluidbookDocument::TEXT_INDEX);; + + Gzip::compressIfNotCompressed($tfile); + Gzip::compressIfNotCompressed($ifile); + $text = Gzip::file_get_contents($tfile); + $ipage = Gzip::file_get_contents($ifile); + + if ($simple) { + self::fillIndexWithWordsSimple($index, $page, $ipage); + } else { + self::fillIndexWithWords($index, $page, $ipage); + } + $textes[$page] = $text; + } + ksort($index); + + $textes = json_encode($textes); + $index = json_encode($index); + + Gzip::file_put_contents($tfilec, $textes); + Gzip::file_put_contents($ifilec, $index); + } + + /** + * @param $compiler FluidbookCompiler + * @return array + */ + public static function makeHighlightIndex($compiler): array + { + $res = []; + foreach ($compiler->pages as $page => $infos) { + $fby = $compiler->getFluidbook()->getHightlightFile($page); + $words = json_decode(Gzip::file_get_contents($fby),true); + + if (is_array($words)) { + foreach ($words as $i => $w) { + $word = $w['word']; + $word = trim($word, "\0"); + if ($word == '') { + continue; + } + unset($w['word']); + $w['word'] = $page; + $w['idx'] = $i; + if (!isset($res[$word])) { + $res[$word] = array(); + } + $res[$word][] = $w; + } + } + } + return $res; + } + + protected static function _escapeIndex($str) + { + $todelete = array('\ufffd'); + foreach ($todelete as $d) { + $str = str_replace($d, '', $str); + } + return $str; + } + + protected static function fillIndexWithWordsSimple(&$index, $page, $ipage) + { + $twords = explode("\n", trim($ipage)); + + foreach ($twords as $woadata) { + $w1 = explode(',', trim($woadata)); + if (count($w1) <= 1) { + continue; + } + list($woa, $worddata) = $w1; + $e = explode("\t", $worddata, 2); + if (count($e) < 2) { + continue; + } + list($total, $wordslist) = $e; + + if ($woa == '') { + continue; + } + + if (!isset($index[$woa])) { + $index[$woa] = array('t' => 0, 'p' => array()); + } + $index[$woa]['t'] += (int)$total; + + $words = explode("\t", $wordslist); + foreach ($words as $word) { + list($wordwa, $count) = explode('$', $word, 2); + if (!isset($index[$woa]['p'][$page])) { + $index[$woa]['p'][$page] = 0; + } + $index[$woa]['p'][$page] += (int)$count; + } + } + } + + protected static function fillIndexWithWords(&$index, $page, $ipage) + { + $twords = explode("\n", trim($ipage)); + + foreach ($twords as $woadata) { + $w1 = explode(',', trim($woadata)); + if (count($w1) <= 1) { + continue; + } + list($woa, $worddata) = $w1; + $e = explode("\t", $worddata, 2); + if (count($e) < 2) { + continue; + } + list($total, $wordslist) = $e; + + if ($woa == '') { + continue; + } + + if (!isset($index[$woa])) { + $index[$woa] = array('t' => 0, 'w' => array()); + } + $index[$woa]['t'] += (int)$total; + + $words = explode("\t", $wordslist); + + foreach ($words as $word) { + list($wordwa, $count) = explode('$', $word, 2); + if (!isset($index[$woa]['w'][$wordwa])) { + $index[$woa]['w'][$wordwa] = array('t' => 0, 'p' => array()); + } + if (!isset($index[$woa]['w'][$wordwa]['p'][$page])) { + $index[$woa]['w'][$wordwa]['p'][$page] = 0; + } + $count = (int)$count; + $index[$woa]['w'][$wordwa]['t'] += $count; + $index[$woa]['w'][$wordwa]['p'][$page] += $count; + } + } + } + + +} diff --git a/app/Jobs/FluidbookCompiler.php b/app/Jobs/FluidbookCompiler.php index 0f2c94706..24eec2a8a 100644 --- a/app/Jobs/FluidbookCompiler.php +++ b/app/Jobs/FluidbookCompiler.php @@ -2,6 +2,7 @@ namespace App\Jobs; +use App\Fluidbook\SearchIndex; use App\Fluidbook\SEO\Document; use App\Models\FluidbookPublication; use App\Models\FluidbookTheme; @@ -10,6 +11,7 @@ use App\Models\Traits\FluidbookPlayerBranches; use App\Util\FluidbookLinks; use Cubist\Locale\Country; use Cubist\Locale\Locale; +use Cubist\PDF\CommandLine\FWSTK; use Cubist\Util\ArrayUtil; use Cubist\Util\CommandLine; use Cubist\Util\Data; @@ -575,8 +577,6 @@ class FluidbookCompiler extends Base implements CompilerInterface if (!isset($this->config->basketReferences[$ean])) { continue; } - - $f = $file->getPathname(); if ($ext === 'mp4') { @@ -2800,21 +2800,22 @@ height="0" width="0" style="display:none;visibility:hidden"> public function writeTexts() { - $cache = sha1($this->fluidbookSettings->highlightResults . '/--/' . $this->fluidbookSettings->searchWordSelectionAlgorithm . '///' . $this->fluidbookSettings->textExtraction . '|--|' . $this->fluidbookSettings->ignoreSearchSeparators . '|||' . $this->getFluidbook()->composition_update . '()()()' . filemtime(WS_TOOLS . '/fwstk/out/artifacts/fwstk_jar/fwstk.jar')); - $cacheDir = WS_BOOKS . '/index/' . $this->book_id . '/' . $cache . '/'; - if (!file_exists($cacheDir)) { - mkdir($cacheDir, 0777, true); + $cache = sha1($this->fluidbookSettings->highlightResults . '/--/' . $this->fluidbookSettings->searchWordSelectionAlgorithm . '///' . $this->fluidbookSettings->textExtraction . '|--|' . $this->fluidbookSettings->ignoreSearchSeparators . '|||' . $this->getFluidbook()->composition_update . '()()()' . FWSTK::lastUpdate()); + $cacheDir = Files::mkdir(protected_path('fluidbookpublication/index/' . $this->book_id . '/' . $cache . '/')); + $indexFile = $cacheDir . '/search.index.js'; + $textFile = $cacheDir . '/search.texts.js'; + $hightlightsFile = $cacheDir . '/search.highlight.js'; - $this->daoBook->makeTextsIndexes($this->book, $this->pages, $index, $textes, true); - file_put_contents($cacheDir . '/search.index.js', 'var INDEX=' . $index . ';' . "\r"); - if ($this->fluidbookSettings->highlightResults) { - file_put_contents($cacheDir . '/search.highlight.js', 'var HIGHLIGHTS=' . json_encode($this->daoBook->makeHighlightIndex($this->book, $this->pages)) . ";\r"); - } - if ($this->fluidbookSettings->searchWordSelectionAlgorithm == 'expression') { - file_put_contents($cacheDir . '/search.texts.js', 'var TEXTS=' . $textes . ";\r"); - } + if (!file_exists($indexFile) || !file_exists($textFile)) { + SearchIndex::makeTextsIndexes($this, $cacheDir, $index, $texts, true); + file_put_contents($indexFile, 'var INDEX=' . $index . ';' . "\r"); + file_put_contents($textFile, 'var TEXTS=' . $texts . ";\r"); + } + if ($this->fluidbookSettings->highlightResults && !file_exists($hightlightsFile)) { + file_put_contents($hightlightsFile, 'var HIGHLIGHTS=' . json_encode(SearchIndex::makeHighlightIndex($this)) . ";\r"); } + $this->vdir->copy($cacheDir . '/search.index.js', 'data/search.index.js'); if ($this->fluidbookSettings->highlightResults) { $this->vdir->copy($cacheDir . '/search.highlight.js', 'data/search.highlight.js'); diff --git a/app/Models/FluidbookDocument.php b/app/Models/FluidbookDocument.php index 6d0188ba1..878d490ce 100644 --- a/app/Models/FluidbookDocument.php +++ b/app/Models/FluidbookDocument.php @@ -11,6 +11,7 @@ use Cubist\Backpack\Magic\Fields\Text; use Cubist\Backpack\Magic\Fields\Textarea; use Cubist\PDF\CommandLine\FWSTK; use Cubist\PDF\PDFTools; +use Cubist\Util\Gzip; use Illuminate\Support\Facades\Cache; class FluidbookDocument extends ToolboxModel @@ -294,9 +295,26 @@ class FluidbookDocument extends ToolboxModel public function getTextFile($page, $type = self::TEXT_PLAIN, $extractionMethod = 'fluidbook', $ignoreSeparators = '') { $path = $this->_getTextPath($page, $type, $extractionMethod, $ignoreSeparators); - if (!file_exists($path)) { + if (!$this->_checkTextFile($path)) { PDFTools::extractTexts($this->path('original.pdf'), $this->path(), $extractionMethod, $ignoreSeparators); - if (!file_exists($path)) { + if (!Gzip::file_exists($path)) { + throw new \Exception('An error occured while producing file ' . $path); + } + } + return $path; + } + + protected function _checkTextFile($path) + { + return Gzip::file_exists($path) && Gzip::filemtime($path) >= FWSTK::lastUpdate(); + } + + public function getHightlightFile($page) + { + $path = $this->_getHightlightFilePath($page); + if (!$this->_checkTextFile($path)) { + PDFTools::extractHighlightsData($this->path('original.pdf'), $this->path()); + if (!Gzip::file_exists($path)) { throw new \Exception('An error occured while producing file ' . $path); } } @@ -309,6 +327,11 @@ class FluidbookDocument extends ToolboxModel return $map[$textExtraction]; } + public function _getHightlightFilePath($page) + { + return $this->path('texts/p' . $page . '.fby'); + } + public function _getTextPath($page, $type = self::TEXT_PLAIN, $extractionMethod = 'fluidbook', $ignoreSeparators = '') { $sepFolder = ''; diff --git a/app/Models/FluidbookPublication.php b/app/Models/FluidbookPublication.php index 8b0870548..ff6f166ce 100644 --- a/app/Models/FluidbookPublication.php +++ b/app/Models/FluidbookPublication.php @@ -180,6 +180,12 @@ class FluidbookPublication extends ToolboxSettingsModel return self::_getDocument($compo[0])->getTextFile($compo[1], $type, $this->textExtraction, $this->ignoreSearchSeparators); } + public function getHightlightFile($page) + { + $compo = $this->composition[$page]; + return self::_getDocument($compo[0])->getHightlightFile($compo[1]); + } + public function getFile($page, $format = 'jpg', $resolution = 150, $withText = true, $withGraphics = true, $version = 'html', $force = false) { diff --git a/composer.lock b/composer.lock index 95b33fbfa..c697cd5f9 100644 --- a/composer.lock +++ b/composer.lock @@ -2039,13 +2039,13 @@ "source": { "type": "git", "url": "git://git.cubedesigners.com/cubist_pdf.git", - "reference": "3de1229df9751a411d0c5cb5d85fcd4889a85826" + "reference": "a3712f58ac4d727c1a4c2e7263d39f6b7dce5b3a" }, "dist": { "type": "tar", - "url": "https://composer.cubedesigners.com/dist/cubist/pdf/cubist-pdf-dev-master-ba9dfa.tar", - "reference": "3de1229df9751a411d0c5cb5d85fcd4889a85826", - "shasum": "8ae87c50b61ad69fe6d61390a2280823edccb6b0" + "url": "https://composer.cubedesigners.com/dist/cubist/pdf/cubist-pdf-dev-master-b055ea.tar", + "reference": "a3712f58ac4d727c1a4c2e7263d39f6b7dce5b3a", + "shasum": "67b078a6c6a157a4f32eacedc086cdb7e2bf9f88" }, "require": { "cubist/util": "dev-master", @@ -2081,7 +2081,7 @@ "cubist", "pdf" ], - "time": "2022-08-22T20:03:01+00:00" + "time": "2022-08-23T07:38:41+00:00" }, { "name": "cubist/scorm", @@ -9034,16 +9034,16 @@ }, { "name": "spatie/temporary-directory", - "version": "2.1.0", + "version": "2.1.1", "source": { "type": "git", "url": "https://github.com/spatie/temporary-directory.git", - "reference": "79f138f2b81adae583d04d3727a4538dd394023f" + "reference": "e2818d871783d520b319c2d38dc37c10ecdcde20" }, "dist": { "type": "zip", - "url": "https://api.github.com/repos/spatie/temporary-directory/zipball/79f138f2b81adae583d04d3727a4538dd394023f", - "reference": "79f138f2b81adae583d04d3727a4538dd394023f", + "url": "https://api.github.com/repos/spatie/temporary-directory/zipball/e2818d871783d520b319c2d38dc37c10ecdcde20", + "reference": "e2818d871783d520b319c2d38dc37c10ecdcde20", "shasum": "" }, "require": { @@ -9079,7 +9079,7 @@ ], "support": { "issues": "https://github.com/spatie/temporary-directory/issues", - "source": "https://github.com/spatie/temporary-directory/tree/2.1.0" + "source": "https://github.com/spatie/temporary-directory/tree/2.1.1" }, "funding": [ { @@ -9091,7 +9091,7 @@ "type": "github" } ], - "time": "2022-03-11T08:16:01+00:00" + "time": "2022-08-23T07:15:15+00:00" }, { "name": "swayok/alternative-laravel-cache",