<?php

namespace Fluidbook\Tools\Search;

use JsonException;
use stdClass;

/**
 * Collects the pages of a document and merges their per-page search data
 * into two document-wide structures: a word index and a highlight map.
 */
class SearchIndex
{
    /**
     * Registered pages, keyed by the page number passed to addPage().
     *
     * @var Page[]
     */
    protected $_pages = [];

    /**
     * Registers a page; a page already stored under the same number is replaced.
     *
     * @param int|string $page         Key under which the page is stored; it is
     *                                 the page number reported in the compiled data.
     * @param mixed      $documentPage Forwarded as first argument to the Page constructor.
     * @param mixed      $path         Forwarded as second argument to the Page constructor.
     */
    public function addPage($page, $documentPage, $path)
    {
        $this->_pages[$page] = new Page($documentPage, $path);
    }

    /**
     * Merges the index of every registered page into one structure.
     *
     * Every line of a page index is parsed as
     * "<key>,<total>TAB<word>$<count>TAB<word>$<count>...".
     * Lines without a comma, without a tab-separated word list or with an
     * empty key are ignored, and so are word entries without a "$<count>" part.
     *
     * @return array Map of key => ['t' => sum of totals, 'w' => words], where
     *               words maps word => ['t' => sum of counts,
     *               'p' => map of page number => count on that page].
     */
    public function compileIndex()
    {
        $index = [];

        foreach ($this->_pages as $pageNumber => $page) {
            foreach (explode("\n", $page->getIndex()) as $line) {
                $fields = explode(',', trim($line));
                if (count($fields) <= 1) {
                    continue;
                }
                [$key, $payload] = $fields;

                $parts = explode("\t", $payload, 2);
                if (count($parts) < 2 || $key === '') {
                    continue;
                }
                [$total, $wordsList] = $parts;

                if (!isset($index[$key])) {
                    $index[$key] = ['t' => 0, 'w' => []];
                }
                $index[$key]['t'] += (int)$total;

                foreach (explode("\t", $wordsList) as $entry) {
                    $pair = explode('$', $entry, 2);
                    if (count($pair) < 2) {
                        // Malformed entry (no "$<count>"): skip it like the
                        // malformed lines above instead of reading a missing
                        // offset and storing a meaningless zero-count word.
                        continue;
                    }
                    [$word, $count] = $pair;
                    $count = (int)$count;

                    if (!isset($index[$key]['w'][$word])) {
                        $index[$key]['w'][$word] = ['t' => 0, 'p' => []];
                    }
                    $index[$key]['w'][$word]['t'] += $count;
                    $index[$key]['w'][$word]['p'][$pageNumber] =
                        ($index[$key]['w'][$word]['p'][$pageNumber] ?? 0) + $count;
                }
            }
        }

        return $index;
    }

    /**
     * Groups the highlight entries of every registered page by word.
     *
     * Each entry is moved under the property named after its word (NUL bytes
     * trimmed from both ends); its own "word" property is removed and "page"
     * (the page number) and "idx" (its position in the page's list) are added.
     * Entries whose word is missing or empty after trimming are dropped.
     *
     * @return stdClass Object whose properties are words, each holding the
     *                  list of highlight entries found for that word.
     * @throws JsonException Declared because Page::getHighlights() may throw it.
     */
    public function compileHighlights()
    {
        $res = new stdClass();

        foreach ($this->_pages as $pageNumber => $page) {
            foreach ($page->getHighlights() as $i => $highlight) {
                // isset() keeps entries without a word from raising a warning;
                // they end up skipped exactly like entries with an empty word.
                $word = isset($highlight->word) ? trim($highlight->word, "\0") : '';
                if ($word === '') {
                    continue;
                }

                unset($highlight->word);
                $highlight->page = $pageNumber;
                $highlight->idx = $i;

                if (!isset($res->{$word})) {
                    $res->{$word} = [];
                }
                $res->{$word}[] = $highlight;
            }
        }

        return $res;
    }
}