From 75604dfcb745bbcefb253ad9db9ef224466d8a9f Mon Sep 17 00:00:00 2001 From: Vincent Vanwaelscappel Date: Tue, 19 Oct 2021 19:43:42 +0200 Subject: [PATCH] wip #4804 @1 --- .idea/cubist_minisearch.iml | 8 ++++++- .idea/php.xml | 8 ++++++- composer.json | 6 +++++- src/Document.php | 13 +++++++++--- src/Document/PDF.php | 42 +++++++++++++++++++++++++++++++++++++ src/Index.php | 2 +- 6 files changed, 72 insertions(+), 7 deletions(-) create mode 100644 src/Document/PDF.php diff --git a/.idea/cubist_minisearch.iml b/.idea/cubist_minisearch.iml index 3bde560..cf5cda5 100644 --- a/.idea/cubist_minisearch.iml +++ b/.idea/cubist_minisearch.iml @@ -42,7 +42,6 @@ - @@ -60,6 +59,13 @@ + + + + + + + diff --git a/.idea/php.xml b/.idea/php.xml index 0bb6ec0..4131144 100644 --- a/.idea/php.xml +++ b/.idea/php.xml @@ -40,7 +40,6 @@ - @@ -58,6 +57,13 @@ + + + + + + + diff --git a/composer.json b/composer.json index 24d810b..7c55158 100644 --- a/composer.json +++ b/composer.json @@ -22,7 +22,11 @@ "require": { "php": ">=7.3.0", "laravel/framework": "^v8.64.0", - "ext-json": "*" + "ext-json": "*", + "cubist/pdf": "dev-master" + }, + "suggest": { + "cubist/pdf": "Allows to support PDF indexation" }, "repositories": [ { diff --git a/src/Document.php b/src/Document.php index dc2651d..203e47b 100644 --- a/src/Document.php +++ b/src/Document.php @@ -132,12 +132,19 @@ class Document } /** - * @return array + * @return array[] */ public function process() { - //TODO - return []; + return [$this->getIndividualDocData()]; + } + + /** + * @return string[] + */ + public function getIndividualDocData() + { + return ['id' => $this->getId(), 'type' => $this->getType(), 'thumb' => $this->getTitle(), 'title' => $this->getTitle(), 'url' => $this->getUrl(), 'text' => $this->getText()]; } } \ No newline at end of file diff --git a/src/Document/PDF.php b/src/Document/PDF.php new file mode 100644 index 0000000..853456e --- /dev/null +++ b/src/Document/PDF.php @@ -0,0 +1,42 @@ +getIndividualDocData(); + + + $doc = new \Cubist\PDF\Document($this->file); + $doc->processFullTexts(); + + $pages = $doc->getPages(); + + $res = []; + for ($i = 1; $i <= $pages; $i++) { + $page = $base; + $page['id'] .= '#' . $i; + $page['url'] .= '#' . $i; + $page['text'] = file_get_contents($doc->getConvertPath() . '/texts/fp' . $i . '.txt'); + $res[] = $page; + } + return $res; + } +} \ No newline at end of file diff --git a/src/Index.php b/src/Index.php index 2a28641..083c4e6 100644 --- a/src/Index.php +++ b/src/Index.php @@ -63,7 +63,7 @@ class Index implements ShouldQueue, ShouldBeUnique { $res = []; foreach ($this->documents as $document) { - $res[] = $document->process(); + $res = array_merge($res,$document->process()); } return 'const documents = ' . json_encode($res) . ';'; } -- 2.39.5