]> _ Git - fluidbook-toolbox.git/commitdiff
wip #5411 @1.5
author Vincent Vanwaelscappel <vincent@cubedesigners.com>
Tue, 23 Aug 2022 15:14:54 +0000 (17:14 +0200)
committer Vincent Vanwaelscappel <vincent@cubedesigners.com>
Tue, 23 Aug 2022 15:14:54 +0000 (17:14 +0200)
app/Fluidbook/SearchIndex.php [new file with mode: 0644]
app/Jobs/FluidbookCompiler.php
app/Models/FluidbookDocument.php
app/Models/FluidbookPublication.php
composer.lock

diff --git a/app/Fluidbook/SearchIndex.php b/app/Fluidbook/SearchIndex.php
new file mode 100644 (file)
index 0000000..f752a6e
--- /dev/null
@@ -0,0 +1,176 @@
+<?php
+
+namespace App\Fluidbook;
+
+use App\Jobs\FluidbookCompiler;
+use App\Models\FluidbookDocument;
+use Cubist\PDF\CommandLine\FWSTK;
+use Cubist\Util\Gzip;
+
+/**
+ * Builds the search data of a fluidbook publication: the word index, the
+ * per-page plain texts and the per-word highlight data.
+ */
+class SearchIndex
+{
+    /**
+     * Builds the word index and the per-page plain texts of a publication.
+     *
+     * Both results are handed back as JSON strings through $index and $textes
+     * and are also stored in $dir with Gzip::file_put_contents()
+     * ("sindex.json" or "index.json" depending on $simple, and "textes.json").
+     *
+     * @param FluidbookCompiler $compiler Compiler exposing ->pages and getFluidbook()
+     * @param string $dir Directory receiving the generated files
+     * @param string $index Output only: JSON encoded index (any incoming value is discarded)
+     * @param string $textes Output only: JSON encoded map page => plain text (any incoming value is discarded)
+     * @param bool $simple true builds the compact index (per-page counts per normalized word),
+     *                     false keeps the detail of every word variant
+     * @param bool $force Currently not used by this method
+     * @return void
+     * @throws \JsonException When the index or the texts cannot be JSON encoded
+     */
+    public static function makeTextsIndexes($compiler, $dir, &$index, &$textes, bool $simple = false, bool $force = false)
+    {
+        $indexFile = $dir . '/' . ($simple ? 'sindex.json' : 'index.json');
+        $textsFile = $dir . '/textes.json';
+
+        $index = [];
+        $textes = [];
+        foreach ($compiler->pages as $page => $infos) {
+            $tfile = $compiler->getFluidbook()->getTextFile($page, FluidbookDocument::TEXT_PLAIN);
+            $ifile = $compiler->getFluidbook()->getTextFile($page, FluidbookDocument::TEXT_INDEX);
+
+            Gzip::compressIfNotCompressed($tfile);
+            Gzip::compressIfNotCompressed($ifile);
+            $text = Gzip::file_get_contents($tfile);
+            $ipage = Gzip::file_get_contents($ifile);
+
+            if ($simple) {
+                self::fillIndexWithWordsSimple($index, $page, $ipage);
+            } else {
+                self::fillIndexWithWords($index, $page, $ipage);
+            }
+            $textes[$page] = $text;
+        }
+        ksort($index);
+
+        // Without these flags json_encode() returns false on invalid UTF-8 and
+        // that false would be written to disk (and cached by the caller) as an
+        // empty value. Invalid sequences are replaced by U+FFFD instead, and any
+        // remaining encoding failure is raised as a \JsonException.
+        $jsonFlags = JSON_INVALID_UTF8_SUBSTITUTE | JSON_THROW_ON_ERROR;
+        $textes = json_encode($textes, $jsonFlags);
+        $index = json_encode($index, $jsonFlags);
+
+        Gzip::file_put_contents($textsFile, $textes);
+        Gzip::file_put_contents($indexFile, $index);
+    }
+
+    /**
+     * Groups the highlight data of every page by word.
+     *
+     * For each page, the file returned by getHightlightFile() is decoded as
+     * JSON; pages whose content does not decode to an array are ignored.
+     * Each decoded entry is stored under its (NUL-trimmed) word, with an
+     * "idx" key holding the position of the entry in the page file.
+     *
+     * @param FluidbookCompiler $compiler Compiler exposing ->pages and getFluidbook()
+     * @return array Map word => list of occurrences
+     */
+    public static function makeHighlightIndex($compiler): array
+    {
+        $res = [];
+        foreach ($compiler->pages as $page => $infos) {
+            $fby = $compiler->getFluidbook()->getHightlightFile($page);
+            $words = json_decode(Gzip::file_get_contents($fby), true);
+            if (!is_array($words)) {
+                continue;
+            }
+
+            foreach ($words as $i => $w) {
+                // Entries without a usable word cannot be indexed; skipping them
+                // also avoids an "undefined array key" error.
+                if (!is_array($w) || !isset($w['word'])) {
+                    continue;
+                }
+                $word = trim((string)$w['word'], "\0");
+                if ($word === '') {
+                    continue;
+                }
+                unset($w['word']);
+                // NOTE(review): the page number is stored under the "word" key.
+                // This may have been meant to be "page" - confirm against the
+                // consumer of search.highlight.js before changing it.
+                $w['word'] = $page;
+                $w['idx'] = $i;
+                if (!isset($res[$word])) {
+                    $res[$word] = [];
+                }
+                $res[$word][] = $w;
+            }
+        }
+        return $res;
+    }
+
+    /**
+     * Removes unwanted sequences from a string.
+     *
+     * The only sequence removed is the six-character text \ufffd (the literal
+     * is single-quoted, so it is NOT the U+FFFD character itself), which is
+     * how a JSON encoded string spells the Unicode replacement character.
+     *
+     * @param string $str
+     * @return string
+     */
+    protected static function _escapeIndex($str)
+    {
+        return str_replace(['\ufffd'], '', $str);
+    }
+
+    /**
+     * Adds the words of one page to the compact index.
+     *
+     * Resulting shape: $index[normalizedWord] = ['t' => total, 'p' => [page => count]].
+     *
+     * @param array $index Index being built, modified in place
+     * @param int|string $page Page the data belongs to
+     * @param string $ipage Raw index data of the page (see _parseIndexPage())
+     * @return void
+     */
+    protected static function fillIndexWithWordsSimple(&$index, $page, $ipage)
+    {
+        foreach (self::_parseIndexPage($ipage) as list($woa, $total, $words)) {
+            if (!isset($index[$woa])) {
+                $index[$woa] = ['t' => 0, 'p' => []];
+            }
+            $index[$woa]['t'] += $total;
+
+            foreach ($words as list(, $count)) {
+                if (!isset($index[$woa]['p'][$page])) {
+                    $index[$woa]['p'][$page] = 0;
+                }
+                $index[$woa]['p'][$page] += $count;
+            }
+        }
+    }
+
+    /**
+     * Adds the words of one page to the detailed index.
+     *
+     * Resulting shape:
+     * $index[normalizedWord] = ['t' => total, 'w' => [variant => ['t' => total, 'p' => [page => count]]]].
+     *
+     * @param array $index Index being built, modified in place
+     * @param int|string $page Page the data belongs to
+     * @param string $ipage Raw index data of the page (see _parseIndexPage())
+     * @return void
+     */
+    protected static function fillIndexWithWords(&$index, $page, $ipage)
+    {
+        foreach (self::_parseIndexPage($ipage) as list($woa, $total, $words)) {
+            if (!isset($index[$woa])) {
+                $index[$woa] = ['t' => 0, 'w' => []];
+            }
+            $index[$woa]['t'] += $total;
+
+            foreach ($words as list($wordwa, $count)) {
+                if (!isset($index[$woa]['w'][$wordwa])) {
+                    $index[$woa]['w'][$wordwa] = ['t' => 0, 'p' => []];
+                }
+                if (!isset($index[$woa]['w'][$wordwa]['p'][$page])) {
+                    $index[$woa]['w'][$wordwa]['p'][$page] = 0;
+                }
+                $index[$woa]['w'][$wordwa]['t'] += $count;
+                $index[$woa]['w'][$wordwa]['p'][$page] += $count;
+            }
+        }
+    }
+
+    /**
+     * Parses the raw index data of one page.
+     *
+     * Expected line format: key "," total TAB variant "$" count [TAB variant "$" count ...]
+     * Lines without a comma, without a tab after the total, or with an empty
+     * key are skipped, and so are variants without a "$" separator.
+     *
+     * NOTE(review): the line is split on every comma and only the first two
+     * fields are kept, so anything after a second comma is dropped - confirm
+     * that the extractor never emits commas in the word data.
+     *
+     * @param string $ipage Raw index data
+     * @return array List of [key, total (int), list of [variant, count (int)]]
+     */
+    private static function _parseIndexPage($ipage): array
+    {
+        $entries = [];
+        foreach (explode("\n", trim((string)$ipage)) as $line) {
+            $fields = explode(',', trim($line));
+            if (count($fields) <= 1) {
+                continue;
+            }
+            list($woa, $worddata) = $fields;
+
+            $data = explode("\t", $worddata, 2);
+            if (count($data) < 2 || $woa === '') {
+                continue;
+            }
+            list($total, $wordslist) = $data;
+
+            $words = [];
+            foreach (explode("\t", $wordslist) as $word) {
+                $parts = explode('$', $word, 2);
+                if (count($parts) < 2) {
+                    // No count attached to this variant: ignore it instead of
+                    // triggering an "undefined array key" error.
+                    continue;
+                }
+                $words[] = [$parts[0], (int)$parts[1]];
+            }
+            $entries[] = [$woa, (int)$total, $words];
+        }
+        return $entries;
+    }
+}
index 0f2c9470682c9c4ae87318a05e5cac0a4e8950b4..24eec2a8aa5afb572db43fde16b088ceb46a0c75 100644 (file)
@@ -2,6 +2,7 @@
 
 namespace App\Jobs;
 
+use App\Fluidbook\SearchIndex;
 use App\Fluidbook\SEO\Document;
 use App\Models\FluidbookPublication;
 use App\Models\FluidbookTheme;
@@ -10,6 +11,7 @@ use App\Models\Traits\FluidbookPlayerBranches;
 use App\Util\FluidbookLinks;
 use Cubist\Locale\Country;
 use Cubist\Locale\Locale;
+use Cubist\PDF\CommandLine\FWSTK;
 use Cubist\Util\ArrayUtil;
 use Cubist\Util\CommandLine;
 use Cubist\Util\Data;
@@ -575,8 +577,6 @@ class FluidbookCompiler extends Base implements CompilerInterface
             if (!isset($this->config->basketReferences[$ean])) {
                 continue;
             }
-
-
             $f = $file->getPathname();
 
             if ($ext === 'mp4') {
@@ -2800,21 +2800,22 @@ height="0" width="0" style="display:none;visibility:hidden"></iframe></noscript>
 
     public function writeTexts()
     {
-        $cache = sha1($this->fluidbookSettings->highlightResults . '/--/' . $this->fluidbookSettings->searchWordSelectionAlgorithm . '///' . $this->fluidbookSettings->textExtraction . '|--|' . $this->fluidbookSettings->ignoreSearchSeparators . '|||' . $this->getFluidbook()->composition_update . '()()()' . filemtime(WS_TOOLS . '/fwstk/out/artifacts/fwstk_jar/fwstk.jar'));
-        $cacheDir = WS_BOOKS . '/index/' . $this->book_id . '/' . $cache . '/';
-        if (!file_exists($cacheDir)) {
-            mkdir($cacheDir, 0777, true);
+        $cache = sha1($this->fluidbookSettings->highlightResults . '/--/' . $this->fluidbookSettings->searchWordSelectionAlgorithm . '///' . $this->fluidbookSettings->textExtraction . '|--|' . $this->fluidbookSettings->ignoreSearchSeparators . '|||' . $this->getFluidbook()->composition_update . '()()()' . FWSTK::lastUpdate());
+        $cacheDir = Files::mkdir(protected_path('fluidbookpublication/index/' . $this->book_id . '/' . $cache . '/'));
+        $indexFile = $cacheDir . '/search.index.js';
+        $textFile = $cacheDir . '/search.texts.js';
+        $hightlightsFile = $cacheDir . '/search.highlight.js';
 
-            $this->daoBook->makeTextsIndexes($this->book, $this->pages, $index, $textes, true);
-            file_put_contents($cacheDir . '/search.index.js', 'var INDEX=' . $index . ';' . "\r");
-            if ($this->fluidbookSettings->highlightResults) {
-                file_put_contents($cacheDir . '/search.highlight.js', 'var HIGHLIGHTS=' . json_encode($this->daoBook->makeHighlightIndex($this->book, $this->pages)) . ";\r");
-            }
-            if ($this->fluidbookSettings->searchWordSelectionAlgorithm == 'expression') {
-                file_put_contents($cacheDir . '/search.texts.js', 'var TEXTS=' . $textes . ";\r");
-            }
+        if (!file_exists($indexFile) || !file_exists($textFile)) {
+            SearchIndex::makeTextsIndexes($this, $cacheDir, $index, $texts, true);
+            file_put_contents($indexFile, 'var INDEX=' . $index . ';' . "\r");
+            file_put_contents($textFile, 'var TEXTS=' . $texts . ";\r");
+        }
+        if ($this->fluidbookSettings->highlightResults && !file_exists($hightlightsFile)) {
+            file_put_contents($hightlightsFile, 'var HIGHLIGHTS=' . json_encode(SearchIndex::makeHighlightIndex($this)) . ";\r");
         }
 
+
         $this->vdir->copy($cacheDir . '/search.index.js', 'data/search.index.js');
         if ($this->fluidbookSettings->highlightResults) {
             $this->vdir->copy($cacheDir . '/search.highlight.js', 'data/search.highlight.js');
index 6d0188ba129091813f085ba0d1dce01da705781b..878d490ce15f0f6b5c5650ab7e173153d00efd5c 100644 (file)
@@ -11,6 +11,7 @@ use Cubist\Backpack\Magic\Fields\Text;
 use Cubist\Backpack\Magic\Fields\Textarea;
 use Cubist\PDF\CommandLine\FWSTK;
 use Cubist\PDF\PDFTools;
+use Cubist\Util\Gzip;
 use Illuminate\Support\Facades\Cache;
 
 class FluidbookDocument extends ToolboxModel
@@ -294,9 +295,26 @@ class FluidbookDocument extends ToolboxModel
     public function getTextFile($page, $type = self::TEXT_PLAIN, $extractionMethod = 'fluidbook', $ignoreSeparators = '')
     {
         $path = $this->_getTextPath($page, $type, $extractionMethod, $ignoreSeparators);
-        if (!file_exists($path)) {
+        if (!$this->_checkTextFile($path)) {
             PDFTools::extractTexts($this->path('original.pdf'), $this->path(), $extractionMethod, $ignoreSeparators);
-            if (!file_exists($path)) {
+            if (!Gzip::file_exists($path)) {
+                throw new \Exception('An error occured while producing file ' . $path);
+            }
+        }
+        return $path;
+    }
+
+    /**
+     * Tells whether an extracted file can be reused as is.
+     *
+     * The file must exist (according to Gzip::file_exists()) and its
+     * modification time (Gzip::filemtime()) must not be lower than
+     * FWSTK::lastUpdate().
+     *
+     * @param string $path
+     * @return bool
+     */
+    protected function _checkTextFile($path)
+    {
+        if (!Gzip::file_exists($path)) {
+            return false;
+        }
+        return Gzip::filemtime($path) >= FWSTK::lastUpdate();
+    }
+
+    public function getHightlightFile($page)
+    {
+        $path = $this->_getHightlightFilePath($page);
+        if (!$this->_checkTextFile($path)) {
+            PDFTools::extractHighlightsData($this->path('original.pdf'), $this->path());
+            if (!Gzip::file_exists($path)) {
                 throw new \Exception('An error occured while producing file ' . $path);
             }
         }
@@ -309,6 +327,11 @@ class FluidbookDocument extends ToolboxModel
         return $map[$textExtraction];
     }
 
+    /**
+     * Returns the path of the highlight data file ("texts/p<page>.fby") of a
+     * page, resolved through $this->path().
+     *
+     * @param int|string $page
+     * @return string
+     */
+    public function _getHightlightFilePath($page)
+    {
+        $relativePath = 'texts/p' . $page . '.fby';
+        return $this->path($relativePath);
+    }
+
     public function _getTextPath($page, $type = self::TEXT_PLAIN, $extractionMethod = 'fluidbook', $ignoreSeparators = '')
     {
         $sepFolder = '';
index 8b087054812cd0e86558302bc88c587466ae0fdb..ff6f166ce003546b328490134dc6915c288e98b0 100644 (file)
@@ -180,6 +180,12 @@ class FluidbookPublication extends ToolboxSettingsModel
         return self::_getDocument($compo[0])->getTextFile($compo[1], $type, $this->textExtraction, $this->ignoreSearchSeparators);
     }
 
+    /**
+     * Returns the highlight data file of a publication page.
+     *
+     * The page is looked up in $this->composition: element 0 of the entry is
+     * passed to _getDocument() and element 1 is the page requested from that
+     * document.
+     *
+     * @param int|string $page Page of the publication
+     * @return mixed Whatever the document's getHightlightFile() returns
+     */
+    public function getHightlightFile($page)
+    {
+        $entry = $this->composition[$page];
+        $document = self::_getDocument($entry[0]);
+        return $document->getHightlightFile($entry[1]);
+    }
+
 
     public function getFile($page, $format = 'jpg', $resolution = 150, $withText = true, $withGraphics = true, $version = 'html', $force = false)
     {
index 95b33fbfaf699c4d92eb208a6168696869a0a55f..c697cd5f96f56c6684474034a16622f51f234452 100644 (file)
             "source": {
                 "type": "git",
                 "url": "git://git.cubedesigners.com/cubist_pdf.git",
-                "reference": "3de1229df9751a411d0c5cb5d85fcd4889a85826"
+                "reference": "a3712f58ac4d727c1a4c2e7263d39f6b7dce5b3a"
             },
             "dist": {
                 "type": "tar",
-                "url": "https://composer.cubedesigners.com/dist/cubist/pdf/cubist-pdf-dev-master-ba9dfa.tar",
-                "reference": "3de1229df9751a411d0c5cb5d85fcd4889a85826",
-                "shasum": "8ae87c50b61ad69fe6d61390a2280823edccb6b0"
+                "url": "https://composer.cubedesigners.com/dist/cubist/pdf/cubist-pdf-dev-master-b055ea.tar",
+                "reference": "a3712f58ac4d727c1a4c2e7263d39f6b7dce5b3a",
+                "shasum": "67b078a6c6a157a4f32eacedc086cdb7e2bf9f88"
             },
             "require": {
                 "cubist/util": "dev-master",
                 "cubist",
                 "pdf"
             ],
-            "time": "2022-08-22T20:03:01+00:00"
+            "time": "2022-08-23T07:38:41+00:00"
         },
         {
             "name": "cubist/scorm",
         },
         {
             "name": "spatie/temporary-directory",
-            "version": "2.1.0",
+            "version": "2.1.1",
             "source": {
                 "type": "git",
                 "url": "https://github.com/spatie/temporary-directory.git",
-                "reference": "79f138f2b81adae583d04d3727a4538dd394023f"
+                "reference": "e2818d871783d520b319c2d38dc37c10ecdcde20"
             },
             "dist": {
                 "type": "zip",
-                "url": "https://api.github.com/repos/spatie/temporary-directory/zipball/79f138f2b81adae583d04d3727a4538dd394023f",
-                "reference": "79f138f2b81adae583d04d3727a4538dd394023f",
+                "url": "https://api.github.com/repos/spatie/temporary-directory/zipball/e2818d871783d520b319c2d38dc37c10ecdcde20",
+                "reference": "e2818d871783d520b319c2d38dc37c10ecdcde20",
                 "shasum": ""
             },
             "require": {
             ],
             "support": {
                 "issues": "https://github.com/spatie/temporary-directory/issues",
-                "source": "https://github.com/spatie/temporary-directory/tree/2.1.0"
+                "source": "https://github.com/spatie/temporary-directory/tree/2.1.1"
             },
             "funding": [
                 {
                     "type": "github"
                 }
             ],
-            "time": "2022-03-11T08:16:01+00:00"
+            "time": "2022-08-23T07:15:15+00:00"
         },
         {
             "name": "swayok/alternative-laravel-cache",