From 498f8d851bf7ffeec43ec8838c33574d09b3e502 Mon Sep 17 00:00:00 2001 From: Vincent Vanwaelscappel Date: Wed, 18 Mar 2020 18:43:43 +0100 Subject: [PATCH] wip #3501 @0.5 --- src/app/Magic/Search.php | 158 +++++++++++++++++++++++++-------------- 1 file changed, 100 insertions(+), 58 deletions(-) diff --git a/src/app/Magic/Search.php b/src/app/Magic/Search.php index 70e5bd2..b3b7a23 100644 --- a/src/app/Magic/Search.php +++ b/src/app/Magic/Search.php @@ -10,8 +10,8 @@ use Cviebrock\LaravelElasticsearch\Facade as Elasticsearch; class Search { - protected static $_mappings = ['fr' => - [ + protected static $_mappings = [ + 'fr' => [ 'type' => 'text', 'analyzer' => 'french_light', 'fields' => [ @@ -33,7 +33,13 @@ class Search ], 'de' => [ 'type' => 'text', - 'analyser' => 'german' + 'analyzer' => 'german_light', + 'fields' => [ + 'stemmed' => [ + 'type' => 'text', + 'analyzer' => 'german_heavy' + ] + ] ], 'es' => [ 'type' => 'text', @@ -51,79 +57,113 @@ class Search protected static $_analysis = ['fr' => [ - "filter" => [ - "french_elision" => [ - "type" => "elision", - "articles_case" => true, - "articles" => ["l", "m", "t", "qu", "n", "s", "j", "d", "c", "jusqu", "quoiqu", "lorsqu", "puisqu"] + 'filter' => [ + 'french_elision' => [ + 'type' => 'elision', + 'articles_case' => true, + 'articles' => ['l', 'm', 't', 'qu', 'n', 's', 'j', 'd', 'c', 'jusqu', 'quoiqu', 'lorsqu', 'puisqu'] ], - "french_synonym" => [ - "type" => "synonym", - "ignore_case" => true, - "expand" => true, - "synonyms" => [ - "salade, laitue", - "mayo, mayonnaise", - "grille, toaste", + 'french_synonym' => [ + 'type' => 'synonym', + 'ignore_case' => true, + 'expand' => true, + 'synonyms' => [ + 'salade, laitue', + 'mayo, mayonnaise', + 'grille, toaste', 'pmi, pm instrumentation', ] ], - "french_stemmer" => [ - "type" => "stemmer", - "language" => "light_french" + 'french_stemmer' => [ + 'type' => 'stemmer', + 'language' => 'light_french' ] ], - "analyzer" => [ - "french_heavy" => [ - "tokenizer" => "icu_tokenizer", - "char_filter" => ["html_strip"], - "filter" => [ - "french_elision", - "icu_folding", - "french_synonym", - "french_stemmer" + 'analyzer' => [ + 'french_heavy' => [ + 'tokenizer' => 'icu_tokenizer', + 'char_filter' => ['html_strip'], + 'filter' => [ + 'french_elision', + 'icu_folding', + 'french_synonym', + 'french_stemmer' ] ], - "french_light" => [ - "tokenizer" => "icu_tokenizer", - "char_filter" => ["html_strip"], - "filter" => [ - "french_elision", - "icu_folding" + 'french_light' => [ + 'tokenizer' => 'icu_tokenizer', + 'char_filter' => ['html_strip'], + 'filter' => [ + 'french_elision', + 'icu_folding' ] ] ] ], 'en' => [ - "filter" => [ - "english_synonym" => [ - "type" => "synonym", - "ignore_case" => true, - "expand" => true, - "synonyms" => [ + 'filter' => [ + 'english_synonym' => [ + 'type' => 'synonym', + 'ignore_case' => true, + 'expand' => true, + 'synonyms' => [ 'pmi, pm instrumentation', ] ], - "english_stemmer" => [ - "type" => "stemmer", - "language" => "light_english" + 'english_stemmer' => [ + 'type' => 'stemmer', + 'language' => 'light_english' ] ], - "analyzer" => [ - "english_heavy" => [ - "tokenizer" => "icu_tokenizer", - "char_filter" => ["html_strip"], - "filter" => [ - "icu_folding", - "english_synonym", - "english_stemmer" + 'analyzer' => [ + 'english_heavy' => [ + 'tokenizer' => 'icu_tokenizer', + 'char_filter' => ['html_strip'], + 'filter' => [ + 'icu_folding', + 'english_synonym', + 'english_stemmer' ] ], - "english_light" => [ - "tokenizer" => "icu_tokenizer", - "char_filter" => ["html_strip"], - "filter" => [ - "icu_folding" + 'english_light' => [ + 'tokenizer' => 'icu_tokenizer', + 'char_filter' => ['html_strip'], + 'filter' => [ + 'icu_folding' + ] + ] + ] + ], + 'de' => [ + 'filter' => [ + 'german_synonym' => [ + 'type' => 'synonym', + 'ignore_case' => true, + 'expand' => true, + 'synonyms' => [ + 'pmi, pm instrumentation', + ] + ], + 'german_stemmer' => [ + 'type' => 'stemmer', + 'language' => 'light_german' + ] + ], + 'analyzer' => [ + 'german_heavy' => [ + 'tokenizer' => 'icu_tokenizer', + 'char_filter' => ['html_strip'], + 'filter' => [ + 'icu_folding', + 'german_synonym', + 'german_stemmer' + ] + ], + 'german_light' => [ + 'tokenizer' => 'icu_tokenizer', + 'char_filter' => ['html_strip'], + 'filter' => [ + 'icu_folding' ] ] ] @@ -158,6 +198,7 @@ class Search { $index = self::_getIndexKey($locale, $variant); + try { Elasticsearch::indices()->delete(['index' => $index]); } catch (\Exception $e) { @@ -213,13 +254,14 @@ class Search 'main' => $doc->getDOM()->saveHTML($doc->select('main', false)->item(0)), 'breadcrumbs' => $breadcrumbs, 'type' => $type, + 'url' => $url, ]; $data = [ 'body' => $body, 'index' => $index, 'type' => '_doc', - 'id' => $url, + 'id' => hash('sha256', $url), ]; echo 'Indexing ' . $variant . ' - ' . $locale . ' | ' . $url . "\n"; @@ -319,7 +361,7 @@ class Search $hits = []; foreach ($res['hits']['hits'] as $hit) { - $hits[] = ['url' => $hit['_id'], + $hits[] = ['url' => $hit['_source']['url'], 'title' => $hit['_source']['short_title'], 'breadcrumbs' => json_decode($hit['_source']['breadcrumbs']), ]; -- 2.39.5